Removing junk from sitemap and full site (recursive) crawls (#711)
* Removing junk from sitemap and full site (recursive) crawls
* Small typo fix for result.markdown
This commit is contained in:
parent
c3be65322b
commit
b1085a53df
@ -74,10 +74,11 @@ class CrawlingService:
|
|||||||
self.url_handler = URLHandler()
|
self.url_handler = URLHandler()
|
||||||
self.site_config = SiteConfig()
|
self.site_config = SiteConfig()
|
||||||
self.markdown_generator = self.site_config.get_markdown_generator()
|
self.markdown_generator = self.site_config.get_markdown_generator()
|
||||||
|
self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()
|
||||||
|
|
||||||
# Initialize strategies
|
# Initialize strategies
|
||||||
self.batch_strategy = BatchCrawlStrategy(crawler, self.markdown_generator)
|
self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator)
|
||||||
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.markdown_generator)
|
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator)
|
||||||
self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator)
|
self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator)
|
||||||
self.sitemap_strategy = SitemapCrawlStrategy()
|
self.sitemap_strategy = SitemapCrawlStrategy()
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,7 @@ Site Configuration Helper
|
|||||||
Handles site-specific configurations and detection.
|
Handles site-specific configurations and detection.
|
||||||
"""
|
"""
|
||||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
|
||||||
from ....config.logfire_config import get_logger
|
from ....config.logfire_config import get_logger
|
||||||
|
|
||||||
@ -96,3 +97,33 @@ class SiteConfig:
|
|||||||
"code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
|
"code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_link_pruning_markdown_generator():
|
||||||
|
"""
|
||||||
|
Get the link-pruning markdown generator used by the batch and recursive crawl strategies to clean up crawled pages.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Configured markdown generator
|
||||||
|
"""
|
||||||
|
prune_filter = PruningContentFilter(
|
||||||
|
threshold=0.2,
|
||||||
|
threshold_type="fixed"
|
||||||
|
)
|
||||||
|
|
||||||
|
return DefaultMarkdownGenerator(
|
||||||
|
content_source="html", # Use raw HTML to preserve code blocks
|
||||||
|
content_filter=prune_filter,
|
||||||
|
options={
|
||||||
|
"mark_code": True, # Mark code blocks properly
|
||||||
|
"handle_code_in_pre": True, # Handle <pre><code> tags
|
||||||
|
"body_width": 0, # No line wrapping
|
||||||
|
"skip_internal_links": True, # Add to reduce noise
|
||||||
|
"include_raw_html": False, # Prevent HTML in markdown
|
||||||
|
"escape": False, # Don't escape special chars in code
|
||||||
|
"decode_unicode": True, # Decode unicode characters
|
||||||
|
"strip_empty_lines": False, # Preserve empty lines in code
|
||||||
|
"preserve_code_formatting": True, # Custom option if supported
|
||||||
|
"code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|||||||
@ -231,12 +231,12 @@ class BatchCrawlStrategy:
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
processed += 1
|
processed += 1
|
||||||
if result.success and result.markdown:
|
if result.success and result.markdown and result.markdown.fit_markdown:
|
||||||
# Map back to original URL
|
# Map back to original URL
|
||||||
original_url = url_mapping.get(result.url, result.url)
|
original_url = url_mapping.get(result.url, result.url)
|
||||||
successful_results.append({
|
successful_results.append({
|
||||||
"url": original_url,
|
"url": original_url,
|
||||||
"markdown": result.markdown,
|
"markdown": result.markdown.fit_markdown,
|
||||||
"html": result.html, # Use raw HTML
|
"html": result.html, # Use raw HTML
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -276,10 +276,10 @@ class RecursiveCrawlStrategy:
|
|||||||
visited.add(norm_url)
|
visited.add(norm_url)
|
||||||
total_processed += 1
|
total_processed += 1
|
||||||
|
|
||||||
if result.success and result.markdown:
|
if result.success and result.markdown and result.markdown.fit_markdown:
|
||||||
results_all.append({
|
results_all.append({
|
||||||
"url": original_url,
|
"url": original_url,
|
||||||
"markdown": result.markdown,
|
"markdown": result.markdown.fit_markdown,
|
||||||
"html": result.html, # Always use raw HTML for code extraction
|
"html": result.html, # Always use raw HTML for code extraction
|
||||||
})
|
})
|
||||||
depth_successful += 1
|
depth_successful += 1
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user