fix: Apply URL transformation before crawling in recursive strategy
- Transform URLs to raw content (e.g., GitHub blob -> raw) before sending to crawler
- Maintain mapping dictionary to preserve original URLs in results
- Align progress callback signatures between batch and recursive strategies
- Add safety guards for missing links attribute
- Remove unused loop counter in batch strategy
- Optimize binary file checks to avoid duplicate calls

This ensures GitHub files are crawled as raw content instead of HTML pages, fixing the issue where content extraction was degraded due to HTML wrapping.
commit cb4dba14a0 (parent 573e5c18c5)
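The diff below calls transform_url_func, whose implementation is not part of this commit. As a rough, hypothetical sketch of the GitHub blob -> raw rewrite the commit message describes (the function name and rewrite rules here are assumptions, not the project's actual helper):

```python
from urllib.parse import urlparse


def transform_github_blob_to_raw(url: str) -> str:
    """Hypothetical sketch: rewrite a GitHub blob URL to its raw-content form.

    e.g. https://github.com/owner/repo/blob/main/README.md
      -> https://raw.githubusercontent.com/owner/repo/main/README.md
    Non-GitHub and non-blob URLs pass through unchanged.
    """
    parsed = urlparse(url)
    parts = parsed.path.strip("/").split("/")
    if parsed.netloc == "github.com" and len(parts) >= 4 and parts[2] == "blob":
        owner, repo, _, ref, *path = parts
        return f"https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{'/'.join(path)}"
    return url
```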
@@ -119,10 +119,11 @@ class BatchCrawlStrategy:
             max_session_permit=max_concurrent,
         )
 
-        async def report_progress(percentage: int, message: str):
+        async def report_progress(percentage: int, message: str, **kwargs):
             """Helper to report progress if callback is available"""
             if progress_callback:
-                await progress_callback("crawling", percentage, message)
+                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
+                await progress_callback("crawling", percentage, message, step_info=step_info)
 
         total_urls = len(urls)
         await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")
@@ -162,7 +163,6 @@ class BatchCrawlStrategy:
         )
 
         # Handle streaming results
-        j = 0
         async for result in batch_results:
             processed += 1
             if result.success and result.markdown:
@@ -190,7 +190,6 @@ class BatchCrawlStrategy:
                     progress_percentage,
                     f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                 )
-                j += 1
 
         await report_progress(
             end_progress,
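report_progress in the batch strategy now forwards a step_info dict to progress_callback as a keyword argument. The callback itself is defined by the caller and is not shown in this diff; a minimal sketch of a compatible consumer (names here are illustrative) might look like:

```python
from typing import Any, Optional


async def example_progress_callback(
    status: str,
    percentage: int,
    message: str,
    step_info: Optional[dict[str, Any]] = None,
) -> None:
    # Illustrative consumer of the new signature: step_info carries
    # currentStep/stepMessage plus any extra kwargs forwarded by report_progress.
    detail = (step_info or {}).get("stepMessage", message)
    print(f"[{status}] {percentage}% - {detail}")
```

Because the helper now always passes step_info as a keyword argument, an older callback accepting only (status, percentage, message) would raise a TypeError; accepting step_info (or **kwargs) keeps it compatible with both strategies.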
@@ -169,6 +169,14 @@ class RecursiveCrawlStrategy:
             batch_urls = urls_to_crawl[batch_idx : batch_idx + batch_size]
             batch_end_idx = min(batch_idx + batch_size, len(urls_to_crawl))
 
+            # Transform URLs and create mapping for this batch
+            url_mapping = {}
+            transformed_batch_urls = []
+            for url in batch_urls:
+                transformed = transform_url_func(url)
+                transformed_batch_urls.append(transformed)
+                url_mapping[transformed] = url
+
             # Calculate progress for this batch within the depth
             batch_progress = depth_start + int(
                 (batch_idx / len(urls_to_crawl)) * (depth_end - depth_start)
@@ -183,18 +191,14 @@ class RecursiveCrawlStrategy:
             # Use arun_many for native parallel crawling with streaming
             logger.info(f"Starting parallel crawl of {len(batch_urls)} URLs with arun_many")
             batch_results = await self.crawler.arun_many(
-                urls=batch_urls, config=run_config, dispatcher=dispatcher
+                urls=transformed_batch_urls, config=run_config, dispatcher=dispatcher
             )
 
             # Handle streaming results from arun_many
             i = 0
             async for result in batch_results:
-                # Map back to original URL if transformed
-                original_url = result.url
-                for orig_url in batch_urls:
-                    if transform_url_func(orig_url) == result.url:
-                        original_url = orig_url
-                        break
+                # Map back to original URL using the mapping dict
+                original_url = url_mapping.get(result.url, result.url)
 
                 norm_url = normalize_url(original_url)
                 visited.add(norm_url)
@@ -209,14 +213,14 @@ class RecursiveCrawlStrategy:
                         depth_successful += 1
 
                         # Find internal links for next depth
-                        for link in result.links.get("internal", []):
+                        links = getattr(result, "links", {}) or {}
+                        for link in links.get("internal", []):
                             next_url = normalize_url(link["href"])
                             # Skip binary files and already visited URLs
-                            if next_url not in visited and not self.url_handler.is_binary_file(
-                                next_url
-                            ):
+                            is_binary = self.url_handler.is_binary_file(next_url)
+                            if next_url not in visited and not is_binary:
                                 next_level_urls.add(next_url)
-                            elif self.url_handler.is_binary_file(next_url):
+                            elif is_binary:
                                 logger.debug(f"Skipping binary file from crawl queue: {next_url}")
                             else:
                                 logger.warning(
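The recursive strategy now builds url_mapping once per batch and recovers each original URL with an O(1) lookup, instead of re-running transform_url_func over batch_urls for every streamed result. A self-contained sketch of that round-trip, using a stand-in transform (the real transform_url_func lives elsewhere in the project):

```python
def transform(url: str) -> str:
    # Stand-in for transform_url_func; the real helper rewrites blob URLs to raw content.
    return url.replace("/blob/", "/raw/") if "github.com" in url else url


batch_urls = [
    "https://github.com/owner/repo/blob/main/docs/index.md",
    "https://example.com/page",
]

# Build the transformed list and the reverse mapping once per batch.
url_mapping: dict[str, str] = {}
transformed_batch_urls: list[str] = []
for url in batch_urls:
    transformed = transform(url)
    transformed_batch_urls.append(transformed)
    url_mapping[transformed] = url

# A crawl result comes back keyed by the transformed URL; recover the original in O(1).
result_url = transformed_batch_urls[0]
original_url = url_mapping.get(result_url, result_url)
assert original_url == batch_urls[0]
```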