Merge pull request #218 from coleam00/fix/filter-binary-files-from-crawl
Fix crawler attempting to navigate to binary files
This commit is contained in:
commit
8743c059bb
@ -48,6 +48,54 @@ class URLHandler:
|
||||
logger.warning(f"Error checking if URL is text file: {e}")
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def is_binary_file(url: str) -> bool:
|
||||
"""
|
||||
Check if a URL points to a binary file that shouldn't be crawled.
|
||||
|
||||
Args:
|
||||
url: URL to check
|
||||
|
||||
Returns:
|
||||
True if URL is a binary file, False otherwise
|
||||
"""
|
||||
try:
|
||||
# Remove query parameters and fragments for cleaner extension checking
|
||||
parsed = urlparse(url)
|
||||
path = parsed.path.lower()
|
||||
|
||||
# Comprehensive list of binary and non-HTML file extensions
|
||||
binary_extensions = {
|
||||
# Archives
|
||||
'.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
|
||||
# Executables and installers
|
||||
'.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
|
||||
# Documents (non-HTML)
|
||||
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
|
||||
# Images
|
||||
'.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
|
||||
# Audio/Video
|
||||
'.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
|
||||
# Data files
|
||||
'.csv', '.sql', '.db', '.sqlite',
|
||||
# Binary data
|
||||
'.iso', '.img', '.bin', '.dat',
|
||||
# Development files (usually not meant to be crawled as pages)
|
||||
'.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
|
||||
}
|
||||
|
||||
# Check if the path ends with any binary extension
|
||||
for ext in binary_extensions:
|
||||
if path.endswith(ext):
|
||||
logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
|
||||
return True
|
||||
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"Error checking if URL is binary file: {e}")
|
||||
# In case of error, don't skip the URL (safer to attempt crawl than miss content)
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def transform_github_url(url: str) -> str:
|
||||
"""
|
||||
|
||||
@ -10,6 +10,7 @@ from urllib.parse import urldefrag
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
|
||||
from ....config.logfire_config import get_logger
|
||||
from ...credential_service import credential_service
|
||||
from ..helpers.url_handler import URLHandler
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
|
||||
"""
|
||||
self.crawler = crawler
|
||||
self.markdown_generator = markdown_generator
|
||||
self.url_handler = URLHandler()
|
||||
|
||||
async def crawl_recursive_with_progress(
|
||||
self,
|
||||
@ -195,8 +197,11 @@ class RecursiveCrawlStrategy:
|
||||
# Find internal links for next depth
|
||||
for link in result.links.get("internal", []):
|
||||
next_url = normalize_url(link["href"])
|
||||
if next_url not in visited:
|
||||
# Skip binary files and already visited URLs
|
||||
if next_url not in visited and not self.url_handler.is_binary_file(next_url):
|
||||
next_level_urls.add(next_url)
|
||||
elif self.url_handler.is_binary_file(next_url):
|
||||
logger.debug(f"Skipping binary file from crawl queue: {next_url}")
|
||||
else:
|
||||
logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
|
||||
|
||||
|
||||
125
python/tests/test_url_handler.py
Normal file
125
python/tests/test_url_handler.py
Normal file
@ -0,0 +1,125 @@
|
||||
"""Unit tests for URLHandler class."""
|
||||
import pytest
|
||||
from src.server.services.crawling.helpers.url_handler import URLHandler
|
||||
|
||||
|
||||
class TestURLHandler:
    """Test suite for URLHandler class."""

    def _assert_all_binary(self, urls):
        # Shared helper: every URL in *urls* must be classified as binary.
        handler = URLHandler()
        for url in urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_archives(self):
        """Test detection of archive file formats."""
        self._assert_all_binary([
            "https://example.com/file.zip",
            "https://example.com/archive.tar.gz",
            "https://example.com/compressed.rar",
            "https://example.com/package.7z",
            "https://example.com/backup.tgz",
        ])

    def test_is_binary_file_executables(self):
        """Test detection of executable and installer files."""
        self._assert_all_binary([
            "https://example.com/setup.exe",
            "https://example.com/installer.dmg",
            "https://example.com/package.deb",
            "https://example.com/app.msi",
            "https://example.com/program.appimage",
        ])

    def test_is_binary_file_documents(self):
        """Test detection of document files."""
        self._assert_all_binary([
            "https://example.com/document.pdf",
            "https://example.com/report.docx",
            "https://example.com/spreadsheet.xlsx",
            "https://example.com/presentation.pptx",
        ])

    def test_is_binary_file_media(self):
        """Test detection of image and media files."""
        image_urls = [
            "https://example.com/photo.jpg",
            "https://example.com/image.png",
            "https://example.com/icon.svg",
            "https://example.com/favicon.ico",
        ]
        av_urls = [
            "https://example.com/song.mp3",
            "https://example.com/video.mp4",
            "https://example.com/movie.mkv",
        ]
        self._assert_all_binary(image_urls + av_urls)

    def test_is_binary_file_case_insensitive(self):
        """Test that detection is case-insensitive."""
        self._assert_all_binary([
            "https://example.com/FILE.ZIP",
            "https://example.com/Document.PDF",
            "https://example.com/Image.PNG",
        ])

    def test_is_binary_file_with_query_params(self):
        """Test that query parameters don't affect detection."""
        self._assert_all_binary([
            "https://example.com/file.zip?version=1.0",
            "https://example.com/document.pdf?download=true",
            "https://example.com/image.png#section",
        ])

    def test_is_binary_file_html_pages(self):
        """Test that HTML pages are not detected as binary."""
        handler = URLHandler()
        crawlable_urls = [
            "https://example.com/",
            "https://example.com/index.html",
            "https://example.com/page",
            "https://example.com/blog/post",
            "https://example.com/about.htm",
            "https://example.com/contact.php",
        ]
        for url in crawlable_urls:
            assert handler.is_binary_file(url) is False

    def test_is_binary_file_edge_cases(self):
        """Test edge cases and special scenarios."""
        handler = URLHandler()

        # Periods in the path that are not file extensions must not trip
        # the detector; JS files might be crawlable.
        assert handler.is_binary_file("https://example.com/v1.0/api") is False
        assert handler.is_binary_file("https://example.com/jquery.min.js") is False

        # Real-world example from the error
        assert handler.is_binary_file("https://docs.crawl4ai.com/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip") is True

    def test_is_sitemap(self):
        """Test sitemap detection."""
        handler = URLHandler()

        sitemap_urls = [
            "https://example.com/sitemap.xml",
            "https://example.com/path/sitemap.xml",
            "https://example.com/sitemap/index.xml",
        ]
        for url in sitemap_urls:
            assert handler.is_sitemap(url) is True
        assert handler.is_sitemap("https://example.com/regular-page") is False

    def test_is_txt(self):
        """Test text file detection."""
        handler = URLHandler()

        assert handler.is_txt("https://example.com/robots.txt") is True
        assert handler.is_txt("https://example.com/readme.txt") is True
        assert handler.is_txt("https://example.com/file.pdf") is False

    def test_transform_github_url(self):
        """Test GitHub URL transformation."""
        handler = URLHandler()

        # Blob URLs become raw URLs; everything else passes through untouched.
        cases = [
            ("https://github.com/owner/repo/blob/main/file.py",
             "https://raw.githubusercontent.com/owner/repo/main/file.py"),
            ("https://github.com/owner/repo",
             "https://github.com/owner/repo"),
            ("https://example.com/file",
             "https://example.com/file"),
        ]
        for original, expected in cases:
            assert handler.transform_github_url(original) == expected
|
||||
Loading…
Reference in New Issue
Block a user