Removing junk from sitemap and full site (recursive) crawls (#711)

* Removing junk from sitemap and full site (recursive) crawls

* Small typo fix for result.markdown
This commit is contained in:
Cole Medin 2025-09-20 12:58:42 -05:00 committed by GitHub
parent c3be65322b
commit b1085a53df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 38 additions and 6 deletions

View File

@ -74,10 +74,11 @@ class CrawlingService:
self.url_handler = URLHandler() self.url_handler = URLHandler()
self.site_config = SiteConfig() self.site_config = SiteConfig()
self.markdown_generator = self.site_config.get_markdown_generator() self.markdown_generator = self.site_config.get_markdown_generator()
self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()
# Initialize strategies # Initialize strategies
self.batch_strategy = BatchCrawlStrategy(crawler, self.markdown_generator) self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator)
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.markdown_generator) self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator)
self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator) self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator)
self.sitemap_strategy = SitemapCrawlStrategy() self.sitemap_strategy = SitemapCrawlStrategy()

View File

@ -4,6 +4,7 @@ Site Configuration Helper
Handles site-specific configurations and detection. Handles site-specific configurations and detection.
""" """
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
from ....config.logfire_config import get_logger from ....config.logfire_config import get_logger
@ -96,3 +97,33 @@ class SiteConfig:
"code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else '' "code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
} }
) )
@staticmethod
def get_link_pruning_markdown_generator():
    """
    Build a markdown generator that applies a pruning content filter to crawled pages.

    The generator pairs a PruningContentFilter (fixed threshold of 0.2) with
    code-preserving markdown conversion options.

    Returns:
        Configured markdown generator
    """

    def _code_language(el):
        # Derive the language name from the element's class attribute;
        # a falsy element yields an empty string.
        if not el:
            return ''
        return el.get('class', '').replace('language-', '')

    conversion_options = {
        "mark_code": True,  # Mark code blocks properly
        "handle_code_in_pre": True,  # Handle <pre><code> tags
        "body_width": 0,  # No line wrapping
        "skip_internal_links": True,  # Add to reduce noise
        "include_raw_html": False,  # Prevent HTML in markdown
        "escape": False,  # Don't escape special chars in code
        "decode_unicode": True,  # Decode unicode characters
        "strip_empty_lines": False,  # Preserve empty lines in code
        "preserve_code_formatting": True,  # Custom option if supported
        "code_language_callback": _code_language,
    }

    content_filter = PruningContentFilter(threshold=0.2, threshold_type="fixed")

    # NOTE(review): "html" is intended to select the raw HTML so code blocks
    # survive -- confirm this is a content_source value the library accepts.
    generator = DefaultMarkdownGenerator(
        content_source="html",
        content_filter=content_filter,
        options=conversion_options,
    )
    return generator

View File

@ -231,12 +231,12 @@ class BatchCrawlStrategy:
raise raise
processed += 1 processed += 1
if result.success and result.markdown: if result.success and result.markdown and result.markdown.fit_markdown:
# Map back to original URL # Map back to original URL
original_url = url_mapping.get(result.url, result.url) original_url = url_mapping.get(result.url, result.url)
successful_results.append({ successful_results.append({
"url": original_url, "url": original_url,
"markdown": result.markdown, "markdown": result.markdown.fit_markdown,
"html": result.html, # Use raw HTML "html": result.html, # Use raw HTML
}) })
else: else:

View File

@ -276,10 +276,10 @@ class RecursiveCrawlStrategy:
visited.add(norm_url) visited.add(norm_url)
total_processed += 1 total_processed += 1
if result.success and result.markdown: if result.success and result.markdown and result.markdown.fit_markdown:
results_all.append({ results_all.append({
"url": original_url, "url": original_url,
"markdown": result.markdown, "markdown": result.markdown.fit_markdown,
"html": result.html, # Always use raw HTML for code extraction "html": result.html, # Always use raw HTML for code extraction
}) })
depth_successful += 1 depth_successful += 1