Removing junk from sitemap and full site (recursive) crawls (#711)
* Removing junk from sitemap and full site (recursive) crawls
* Small typo fix for result.markdown
This commit is contained in:
parent
c3be65322b
commit
b1085a53df
@ -74,10 +74,11 @@ class CrawlingService:
|
||||
self.url_handler = URLHandler()
|
||||
self.site_config = SiteConfig()
|
||||
self.markdown_generator = self.site_config.get_markdown_generator()
|
||||
self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()
|
||||
|
||||
# Initialize strategies
|
||||
self.batch_strategy = BatchCrawlStrategy(crawler, self.markdown_generator)
|
||||
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.markdown_generator)
|
||||
self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator)
|
||||
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator)
|
||||
self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator)
|
||||
self.sitemap_strategy = SitemapCrawlStrategy()
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ Site Configuration Helper
|
||||
Handles site-specific configurations and detection.
|
||||
"""
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
from ....config.logfire_config import get_logger
|
||||
|
||||
@ -96,3 +97,33 @@ class SiteConfig:
|
||||
"code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
|
||||
}
|
||||
)
|
||||
|
||||
@staticmethod
def get_link_pruning_markdown_generator():
    """
    Create a DefaultMarkdownGenerator configured with a PruningContentFilter.

    The filter is built with a fixed threshold of 0.2; the generator reads
    from the "html" content source and receives the option set below.

    Returns:
        Configured markdown generator
    """

    def _language_from_class(el):
        # Drop the "language-" prefix from the element's class attribute;
        # a falsy element yields an empty string.
        return el.get('class', '').replace('language-', '') if el else ''

    generator_options = {
        "mark_code": True,  # Mark code blocks properly
        "handle_code_in_pre": True,  # Handle <pre><code> tags
        "body_width": 0,  # No line wrapping
        "skip_internal_links": True,  # Cuts down on noise
        "include_raw_html": False,  # Keep HTML out of the markdown
        "escape": False,  # Leave special chars in code unescaped
        "decode_unicode": True,  # Decode unicode characters
        "strip_empty_lines": False,  # Keep empty lines inside code
        # NOTE(review): custom option; only effective if the generator supports it
        "preserve_code_formatting": True,
        "code_language_callback": _language_from_class,
    }

    content_pruner = PruningContentFilter(threshold=0.2, threshold_type="fixed")

    return DefaultMarkdownGenerator(
        content_source="html",  # per the original author: raw HTML, to preserve code blocks
        content_filter=content_pruner,
        options=generator_options,
    )
|
||||
|
||||
@ -231,12 +231,12 @@ class BatchCrawlStrategy:
|
||||
raise
|
||||
|
||||
processed += 1
|
||||
if result.success and result.markdown:
|
||||
if result.success and result.markdown and result.markdown.fit_markdown:
|
||||
# Map back to original URL
|
||||
original_url = url_mapping.get(result.url, result.url)
|
||||
successful_results.append({
|
||||
"url": original_url,
|
||||
"markdown": result.markdown,
|
||||
"markdown": result.markdown.fit_markdown,
|
||||
"html": result.html, # Use raw HTML
|
||||
})
|
||||
else:
|
||||
|
||||
@ -276,10 +276,10 @@ class RecursiveCrawlStrategy:
|
||||
visited.add(norm_url)
|
||||
total_processed += 1
|
||||
|
||||
if result.success and result.markdown:
|
||||
if result.success and result.markdown and result.markdown.fit_markdown:
|
||||
results_all.append({
|
||||
"url": original_url,
|
||||
"markdown": result.markdown,
|
||||
"markdown": result.markdown.fit_markdown,
|
||||
"html": result.html, # Always use raw HTML for code extraction
|
||||
})
|
||||
depth_successful += 1
|
||||
|
||||
Loading…
Reference in New Issue
Block a user