From cb4dba14a0fdc9614c6089456aba43432e23a638 Mon Sep 17 00:00:00 2001
From: Rasmus Widing
Date: Thu, 21 Aug 2025 22:04:52 +0300
Subject: [PATCH] fix: Apply URL transformation before crawling in recursive strategy

- Transform URLs to raw content (e.g., GitHub blob -> raw) before sending to crawler
- Maintain mapping dictionary to preserve original URLs in results
- Align progress callback signatures between batch and recursive strategies
- Add safety guards for missing links attribute
- Remove unused loop counter in batch strategy
- Optimize binary file checks to avoid duplicate calls

This ensures GitHub files are crawled as raw content instead of HTML pages,
fixing the issue where content extraction was degraded due to HTML wrapping.
---
 .../services/crawling/strategies/batch.py     |  7 ++---
 .../services/crawling/strategies/recursive.py | 28 +++++++++++--------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py
index bf2cf57..e3ecb0e 100644
--- a/python/src/server/services/crawling/strategies/batch.py
+++ b/python/src/server/services/crawling/strategies/batch.py
@@ -119,10 +119,11 @@ class BatchCrawlStrategy:
             max_session_permit=max_concurrent,
         )
 
-        async def report_progress(percentage: int, message: str):
+        async def report_progress(percentage: int, message: str, **kwargs):
             """Helper to report progress if callback is available"""
             if progress_callback:
-                await progress_callback("crawling", percentage, message)
+                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
+                await progress_callback("crawling", percentage, message, step_info=step_info)
 
         total_urls = len(urls)
         await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")
@@ -162,7 +163,6 @@ class BatchCrawlStrategy:
         )
 
         # Handle streaming results
-        j = 0
         async for result in batch_results:
             processed += 1
             if result.success and result.markdown:
@@ -190,7 +190,6 @@ class BatchCrawlStrategy:
                     progress_percentage,
                     f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                 )
-            j += 1
 
         await report_progress(
             end_progress,
diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py
index 2ce252f..448a010 100644
--- a/python/src/server/services/crawling/strategies/recursive.py
+++ b/python/src/server/services/crawling/strategies/recursive.py
@@ -169,6 +169,14 @@ class RecursiveCrawlStrategy:
                 batch_urls = urls_to_crawl[batch_idx : batch_idx + batch_size]
                 batch_end_idx = min(batch_idx + batch_size, len(urls_to_crawl))
+                # Transform URLs and create mapping for this batch
+                url_mapping = {}
+                transformed_batch_urls = []
+                for url in batch_urls:
+                    transformed = transform_url_func(url)
+                    transformed_batch_urls.append(transformed)
+                    url_mapping[transformed] = url
+
                 # Calculate progress for this batch within the depth
                 batch_progress = depth_start + int(
                     (batch_idx / len(urls_to_crawl)) * (depth_end - depth_start)
                 )
@@ -183,18 +191,14 @@ class RecursiveCrawlStrategy:
                 # Use arun_many for native parallel crawling with streaming
                 logger.info(f"Starting parallel crawl of {len(batch_urls)} URLs with arun_many")
                 batch_results = await self.crawler.arun_many(
-                    urls=batch_urls, config=run_config, dispatcher=dispatcher
+                    urls=transformed_batch_urls, config=run_config, dispatcher=dispatcher
                 )
 
                 # Handle streaming results from arun_many
                 i = 0
                 async for result in batch_results:
-                    # Map back to original URL if transformed
-                    original_url = result.url
-                    for orig_url in batch_urls:
-                        if transform_url_func(orig_url) == result.url:
-                            original_url = orig_url
-                            break
+                    # Map back to original URL using the mapping dict
+                    original_url = url_mapping.get(result.url, result.url)
                     norm_url = normalize_url(original_url)
                     visited.add(norm_url)
 
@@ -209,14 +213,14 @@ class RecursiveCrawlStrategy:
                         depth_successful += 1
 
                         # Find internal links for next depth
-                        for link in result.links.get("internal", []):
+                        links = getattr(result, "links", {}) or {}
+                        for link in links.get("internal", []):
                             next_url = normalize_url(link["href"])
                             # Skip binary files and already visited URLs
-                            if next_url not in visited and not self.url_handler.is_binary_file(
-                                next_url
-                            ):
+                            is_binary = self.url_handler.is_binary_file(next_url)
+                            if next_url not in visited and not is_binary:
                                 next_level_urls.add(next_url)
-                            elif self.url_handler.is_binary_file(next_url):
+                            elif is_binary:
                                 logger.debug(f"Skipping binary file from crawl queue: {next_url}")
                     else:
                         logger.warning(
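
Reviewer note (illustrative, not part of the patch): a minimal standalone sketch of
the transform-then-map-back pattern the recursive strategy now uses. The name
transform_url_func mirrors the helper referenced in the diff; the GitHub blob->raw
rewrite rule, the crawl() callable, and the function names here are assumptions for
the example, not the project's actual API.

    def transform_url_func(url: str) -> str:
        # Assumed rewrite rule: GitHub blob page -> raw file content
        return url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

    def crawl_with_original_keys(original_urls, crawl):
        """Crawl transformed URLs while keying results by the original URLs."""
        url_mapping = {}
        transformed = []
        for url in original_urls:
            t = transform_url_func(url)
            transformed.append(t)
            url_mapping[t] = url  # remember which original URL produced t
        results = {}
        for t in transformed:
            content = crawl(t)  # stand-in for crawler.arun_many(...)
            results[url_mapping.get(t, t)] = content  # map back; fall back to the crawled URL
        return results

The real strategy does this asynchronously via crawler.arun_many and the url_mapping
dict shown in the diff; the sketch only isolates the bookkeeping.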