fix: Apply URL transformation before crawling in recursive strategy

- Transform URLs to raw content (e.g., GitHub blob -> raw) before sending to crawler
- Maintain mapping dictionary to preserve original URLs in results
- Align progress callback signatures between batch and recursive strategies
- Add safety guards for missing links attribute
- Remove unused loop counter in batch strategy
- Optimize binary file checks to avoid duplicate calls

This ensures GitHub files are crawled as raw content instead of HTML pages,
fixing the issue where content extraction was degraded due to HTML wrapping.
Rasmus Widing 2025-08-21 22:04:52 +03:00 committed by Wirasm
parent 573e5c18c5
commit cb4dba14a0
2 changed files with 19 additions and 16 deletions
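
The commit message refers to transforming GitHub blob URLs to raw content, but transform_url_func itself is defined elsewhere in the repo and does not appear in this diff. A minimal sketch of that kind of transform, assuming standard github.com blob URLs (the function name below is hypothetical, not the project's actual helper):

def transform_github_blob_to_raw(url: str) -> str:
    """Map a GitHub blob URL to its raw.githubusercontent.com equivalent."""
    prefix = "https://github.com/"
    # Only rewrite URLs of the form github.com/<owner>/<repo>/blob/<ref>/<path>
    if url.startswith(prefix) and "/blob/" in url:
        owner_repo, _, ref_and_path = url[len(prefix):].partition("/blob/")
        return f"https://raw.githubusercontent.com/{owner_repo}/{ref_and_path}"
    return url  # non-GitHub (or non-blob) URLs pass through unchanged

# e.g. https://github.com/org/repo/blob/main/README.md
#  ->  https://raw.githubusercontent.com/org/repo/main/README.md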


@@ -119,10 +119,11 @@ class BatchCrawlStrategy:
             max_session_permit=max_concurrent,
         )
 
-        async def report_progress(percentage: int, message: str):
+        async def report_progress(percentage: int, message: str, **kwargs):
             """Helper to report progress if callback is available"""
             if progress_callback:
-                await progress_callback("crawling", percentage, message)
+                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
+                await progress_callback("crawling", percentage, message, step_info=step_info)
 
         total_urls = len(urls)
         await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")
@@ -162,7 +163,6 @@ class BatchCrawlStrategy:
         )
 
         # Handle streaming results
-        j = 0
         async for result in batch_results:
             processed += 1
             if result.success and result.markdown:
@@ -190,7 +190,6 @@ class BatchCrawlStrategy:
                        progress_percentage,
                        f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                    )
-            j += 1
 
        await report_progress(
            end_progress,
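
For illustration, the aligned report_progress signature lets call sites attach extra fields that flow into step_info. A self-contained sketch of that behavior follows; the progress_callback below is a stand-in for the project's real callback and the extra fields (processed, total) are illustrative, not taken from this commit:

import asyncio

async def progress_callback(stage, percentage, message, step_info=None):
    # Stand-in callback: the real one forwards progress to the UI/tracker.
    print(stage, percentage, message, step_info)

async def report_progress(percentage: int, message: str, **kwargs):
    if progress_callback:
        step_info = {"currentStep": message, "stepMessage": message, **kwargs}
        await progress_callback("crawling", percentage, message, step_info=step_info)

# Extra keyword arguments end up in step_info alongside currentStep/stepMessage:
asyncio.run(report_progress(50, "Crawled 10/20 pages", processed=10, total=20))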


@@ -169,6 +169,14 @@ class RecursiveCrawlStrategy:
             batch_urls = urls_to_crawl[batch_idx : batch_idx + batch_size]
             batch_end_idx = min(batch_idx + batch_size, len(urls_to_crawl))
 
+            # Transform URLs and create mapping for this batch
+            url_mapping = {}
+            transformed_batch_urls = []
+            for url in batch_urls:
+                transformed = transform_url_func(url)
+                transformed_batch_urls.append(transformed)
+                url_mapping[transformed] = url
+
             # Calculate progress for this batch within the depth
             batch_progress = depth_start + int(
                 (batch_idx / len(urls_to_crawl)) * (depth_end - depth_start)
@@ -183,18 +191,14 @@ class RecursiveCrawlStrategy:
             # Use arun_many for native parallel crawling with streaming
             logger.info(f"Starting parallel crawl of {len(batch_urls)} URLs with arun_many")
             batch_results = await self.crawler.arun_many(
-                urls=batch_urls, config=run_config, dispatcher=dispatcher
+                urls=transformed_batch_urls, config=run_config, dispatcher=dispatcher
             )
 
             # Handle streaming results from arun_many
             i = 0
             async for result in batch_results:
-                # Map back to original URL if transformed
-                original_url = result.url
-                for orig_url in batch_urls:
-                    if transform_url_func(orig_url) == result.url:
-                        original_url = orig_url
-                        break
+                # Map back to original URL using the mapping dict
+                original_url = url_mapping.get(result.url, result.url)
 
                 norm_url = normalize_url(original_url)
                 visited.add(norm_url)
@@ -209,14 +213,14 @@ class RecursiveCrawlStrategy:
                     depth_successful += 1
 
                     # Find internal links for next depth
-                    for link in result.links.get("internal", []):
+                    links = getattr(result, "links", {}) or {}
+                    for link in links.get("internal", []):
                         next_url = normalize_url(link["href"])
                         # Skip binary files and already visited URLs
-                        if next_url not in visited and not self.url_handler.is_binary_file(
-                            next_url
-                        ):
+                        is_binary = self.url_handler.is_binary_file(next_url)
+                        if next_url not in visited and not is_binary:
                             next_level_urls.add(next_url)
-                        elif self.url_handler.is_binary_file(next_url):
+                        elif is_binary:
                             logger.debug(f"Skipping binary file from crawl queue: {next_url}")
                         else:
                             logger.warning(