From b1085a53dfd4c2a7cd3793bab5c827a9073baa27 Mon Sep 17 00:00:00 2001 From: Cole Medin Date: Sat, 20 Sep 2025 12:58:42 -0500 Subject: [PATCH] Removing junk from sitemap and full site (recursive) crawls (#711) * Removing junk from sitemap and full site (recursive) crawls * Small typo fix for result.markdown --- .../services/crawling/crawling_service.py | 5 +-- .../services/crawling/helpers/site_config.py | 31 +++++++++++++++++++ .../services/crawling/strategies/batch.py | 4 +-- .../services/crawling/strategies/recursive.py | 4 +-- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index e05cd6e..55cd6d9 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -74,10 +74,11 @@ class CrawlingService: self.url_handler = URLHandler() self.site_config = SiteConfig() self.markdown_generator = self.site_config.get_markdown_generator() + self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator() # Initialize strategies - self.batch_strategy = BatchCrawlStrategy(crawler, self.markdown_generator) - self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.markdown_generator) + self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator) + self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator) self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator) self.sitemap_strategy = SitemapCrawlStrategy() diff --git a/python/src/server/services/crawling/helpers/site_config.py b/python/src/server/services/crawling/helpers/site_config.py index e7ea9f7..846fe45 100644 --- a/python/src/server/services/crawling/helpers/site_config.py +++ b/python/src/server/services/crawling/helpers/site_config.py @@ -4,6 +4,7 @@ Site Configuration Helper Handles 
site-specific configurations and detection. """ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter from ....config.logfire_config import get_logger @@ -96,3 +97,33 @@ class SiteConfig: "code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else '' } ) + + @staticmethod + def get_link_pruning_markdown_generator(): + """ + Get markdown generator for the recursive crawling strategy that cleans up pages crawled. + + Returns: + Configured markdown generator + """ + prune_filter = PruningContentFilter( + threshold=0.2, + threshold_type="fixed" + ) + + return DefaultMarkdownGenerator( + content_source="html", # Use raw HTML to preserve code blocks + content_filter=prune_filter, + options={ + "mark_code": True, # Mark code blocks properly + "handle_code_in_pre": True, # Handle
 <pre> tags
+                "body_width": 0,            # No line wrapping
+                "skip_internal_links": True,  # Add to reduce noise
+                "include_raw_html": False,    # Prevent HTML in markdown
+                "escape": False,             # Don't escape special chars in code
+                "decode_unicode": True,      # Decode unicode characters
+                "strip_empty_lines": False,  # Preserve empty lines in code
+                "preserve_code_formatting": True,  # Custom option if supported
+                "code_language_callback": lambda el: el.get('class', '').replace('language-', '') if el else ''
+            }
+        )
diff --git a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py
index 2834d55..1457fdc 100644
--- a/python/src/server/services/crawling/strategies/batch.py
+++ b/python/src/server/services/crawling/strategies/batch.py
@@ -231,12 +231,12 @@ class BatchCrawlStrategy:
                         raise
 
                 processed += 1
-                if result.success and result.markdown:
+                if result.success and result.markdown and result.markdown.fit_markdown:
                     # Map back to original URL
                     original_url = url_mapping.get(result.url, result.url)
                     successful_results.append({
                         "url": original_url,
-                        "markdown": result.markdown,
+                        "markdown": result.markdown.fit_markdown,
                         "html": result.html,  # Use raw HTML
                     })
                 else:
diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py
index 436902e..d13b51d 100644
--- a/python/src/server/services/crawling/strategies/recursive.py
+++ b/python/src/server/services/crawling/strategies/recursive.py
@@ -276,10 +276,10 @@ class RecursiveCrawlStrategy:
                     visited.add(norm_url)
                     total_processed += 1
 
-                    if result.success and result.markdown:
+                    if result.success and result.markdown and result.markdown.fit_markdown:
                         results_all.append({
                             "url": original_url,
-                            "markdown": result.markdown,
+                            "markdown": result.markdown.fit_markdown,
                             "html": result.html,  # Always use raw HTML for code extraction
                         })
                         depth_successful += 1