fix: Apply URL transformation before crawling in recursive strategy

- Transform URLs to raw content (e.g., GitHub blob -> raw) before sending to crawler
- Maintain mapping dictionary to preserve original URLs in results
- Align progress callback signatures between batch and recursive strategies
- Add safety guards for missing links attribute
- Remove unused loop counter in batch strategy
- Optimize binary file checks to avoid duplicate calls

This ensures GitHub files are crawled as raw content instead of HTML pages,
fixing the issue where content extraction was degraded due to HTML wrapping.
Rasmus Widing 2025-08-21 22:04:52 +03:00 committed by Wirasm
parent 573e5c18c5
commit cb4dba14a0
2 changed files with 19 additions and 16 deletions
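
The commit message refers to transforming GitHub blob URLs to raw content, but transform_url_func itself is defined elsewhere in the repo and does not appear in this diff. A minimal sketch of that kind of transform, assuming standard github.com blob URLs (the function name below is hypothetical, not the project's actual helper):

def transform_github_blob_to_raw(url: str) -> str:
    """Map a GitHub blob URL to its raw.githubusercontent.com equivalent."""
    prefix = "https://github.com/"
    # Only rewrite URLs of the form github.com/<owner>/<repo>/blob/<ref>/<path>
    if url.startswith(prefix) and "/blob/" in url:
        owner_repo, _, ref_and_path = url[len(prefix):].partition("/blob/")
        return f"https://raw.githubusercontent.com/{owner_repo}/{ref_and_path}"
    return url  # non-GitHub (or non-blob) URLs pass through unchanged

# e.g. https://github.com/org/repo/blob/main/README.md
#  ->  https://raw.githubusercontent.com/org/repo/main/README.md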


@@ -119,10 +119,11 @@ class BatchCrawlStrategy:
             max_session_permit=max_concurrent,
         )
 
-        async def report_progress(percentage: int, message: str):
+        async def report_progress(percentage: int, message: str, **kwargs):
             """Helper to report progress if callback is available"""
             if progress_callback:
-                await progress_callback("crawling", percentage, message)
+                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
+                await progress_callback("crawling", percentage, message, step_info=step_info)
 
         total_urls = len(urls)
         await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")
@@ -162,7 +163,6 @@ class BatchCrawlStrategy:
         )
 
         # Handle streaming results
-        j = 0
         async for result in batch_results:
             processed += 1
             if result.success and result.markdown:
@@ -190,7 +190,6 @@ class BatchCrawlStrategy:
                        progress_percentage,
                        f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                    )
-            j += 1
 
        await report_progress(
            end_progress,
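
For illustration, the aligned report_progress signature lets call sites attach extra fields that flow into step_info. A self-contained sketch of that behavior follows; the progress_callback below is a stand-in for the project's real callback and the extra fields (processed, total) are illustrative, not taken from this commit:

import asyncio

async def progress_callback(stage, percentage, message, step_info=None):
    # Stand-in callback: the real one forwards progress to the UI/tracker.
    print(stage, percentage, message, step_info)

async def report_progress(percentage: int, message: str, **kwargs):
    if progress_callback:
        step_info = {"currentStep": message, "stepMessage": message, **kwargs}
        await progress_callback("crawling", percentage, message, step_info=step_info)

# Extra keyword arguments end up in step_info alongside currentStep/stepMessage:
asyncio.run(report_progress(50, "Crawled 10/20 pages", processed=10, total=20))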


@@ -169,6 +169,14 @@ class RecursiveCrawlStrategy:
             batch_urls = urls_to_crawl[batch_idx : batch_idx + batch_size]
             batch_end_idx = min(batch_idx + batch_size, len(urls_to_crawl))
 
+            # Transform URLs and create mapping for this batch
+            url_mapping = {}
+            transformed_batch_urls = []
+            for url in batch_urls:
+                transformed = transform_url_func(url)
+                transformed_batch_urls.append(transformed)
+                url_mapping[transformed] = url
+
             # Calculate progress for this batch within the depth
             batch_progress = depth_start + int(
                 (batch_idx / len(urls_to_crawl)) * (depth_end - depth_start)
@@ -183,18 +191,14 @@ class RecursiveCrawlStrategy:
             # Use arun_many for native parallel crawling with streaming
             logger.info(f"Starting parallel crawl of {len(batch_urls)} URLs with arun_many")
             batch_results = await self.crawler.arun_many(
-                urls=batch_urls, config=run_config, dispatcher=dispatcher
+                urls=transformed_batch_urls, config=run_config, dispatcher=dispatcher
             )
 
             # Handle streaming results from arun_many
             i = 0
             async for result in batch_results:
-                # Map back to original URL if transformed
-                original_url = result.url
-                for orig_url in batch_urls:
-                    if transform_url_func(orig_url) == result.url:
-                        original_url = orig_url
-                        break
+                # Map back to original URL using the mapping dict
+                original_url = url_mapping.get(result.url, result.url)
 
                 norm_url = normalize_url(original_url)
                 visited.add(norm_url)
@@ -209,14 +213,14 @@ class RecursiveCrawlStrategy:
                     depth_successful += 1
 
                     # Find internal links for next depth
-                    for link in result.links.get("internal", []):
+                    links = getattr(result, "links", {}) or {}
+                    for link in links.get("internal", []):
                         next_url = normalize_url(link["href"])
                         # Skip binary files and already visited URLs
-                        if next_url not in visited and not self.url_handler.is_binary_file(
-                            next_url
-                        ):
+                        is_binary = self.url_handler.is_binary_file(next_url)
+                        if next_url not in visited and not is_binary:
                             next_level_urls.add(next_url)
-                        elif self.url_handler.is_binary_file(next_url):
+                        elif is_binary:
                             logger.debug(f"Skipping binary file from crawl queue: {next_url}")
                         else:
                             logger.warning(