diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index aa44998..d66a2a8 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -48,6 +48,54 @@ class URLHandler: logger.warning(f"Error checking if URL is text file: {e}") return False + @staticmethod + def is_binary_file(url: str) -> bool: + """ + Check if a URL points to a binary file that shouldn't be crawled. + + Args: + url: URL to check + + Returns: + True if URL is a binary file, False otherwise + """ + try: + # Remove query parameters and fragments for cleaner extension checking + parsed = urlparse(url) + path = parsed.path.lower() + + # Comprehensive list of binary and non-HTML file extensions + binary_extensions = { + # Archives + '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz', + # Executables and installers + '.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage', + # Documents (non-HTML) + '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', + # Images + '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff', + # Audio/Video + '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac', + # Data files + '.csv', '.sql', '.db', '.sqlite', + # Binary data + '.iso', '.img', '.bin', '.dat', + # Development files (usually not meant to be crawled as pages) + '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib' + } + + # Check if the path ends with any binary extension + for ext in binary_extensions: + if path.endswith(ext): + logger.debug(f"Skipping binary file: {url} (matched extension: {ext})") + return True + + return False + except Exception as e: + logger.warning(f"Error checking if URL is binary file: {e}") + # In case of error, don't skip the URL (safer to attempt crawl than miss content) + return False + @staticmethod def 
class TestURLHandler:
    """Table-driven checks for the URL classification helpers on URLHandler."""

    def test_is_binary_file_archives(self):
        """Archive formats are reported as binary."""
        url_handler = URLHandler()

        archive_urls = (
            "https://example.com/file.zip",
            "https://example.com/archive.tar.gz",
            "https://example.com/compressed.rar",
            "https://example.com/package.7z",
            "https://example.com/backup.tgz",
        )
        for archive_url in archive_urls:
            assert url_handler.is_binary_file(archive_url) is True

    def test_is_binary_file_executables(self):
        """Executables and installers are reported as binary."""
        url_handler = URLHandler()

        installer_urls = (
            "https://example.com/setup.exe",
            "https://example.com/installer.dmg",
            "https://example.com/package.deb",
            "https://example.com/app.msi",
            "https://example.com/program.appimage",
        )
        for installer_url in installer_urls:
            assert url_handler.is_binary_file(installer_url) is True

    def test_is_binary_file_documents(self):
        """Office-style and PDF documents are reported as binary."""
        url_handler = URLHandler()

        document_urls = (
            "https://example.com/document.pdf",
            "https://example.com/report.docx",
            "https://example.com/spreadsheet.xlsx",
            "https://example.com/presentation.pptx",
        )
        for document_url in document_urls:
            assert url_handler.is_binary_file(document_url) is True

    def test_is_binary_file_media(self):
        """Image, audio and video files are reported as binary."""
        url_handler = URLHandler()

        image_urls = (
            "https://example.com/photo.jpg",
            "https://example.com/image.png",
            "https://example.com/icon.svg",
            "https://example.com/favicon.ico",
        )
        audio_video_urls = (
            "https://example.com/song.mp3",
            "https://example.com/video.mp4",
            "https://example.com/movie.mkv",
        )
        for media_url in image_urls + audio_video_urls:
            assert url_handler.is_binary_file(media_url) is True

    def test_is_binary_file_case_insensitive(self):
        """Upper- and mixed-case extensions are still detected."""
        url_handler = URLHandler()

        mixed_case_urls = (
            "https://example.com/FILE.ZIP",
            "https://example.com/Document.PDF",
            "https://example.com/Image.PNG",
        )
        for mixed_case_url in mixed_case_urls:
            assert url_handler.is_binary_file(mixed_case_url) is True

    def test_is_binary_file_with_query_params(self):
        """Query strings and fragments do not hide the extension."""
        url_handler = URLHandler()

        decorated_urls = (
            "https://example.com/file.zip?version=1.0",
            "https://example.com/document.pdf?download=true",
            "https://example.com/image.png#section",
        )
        for decorated_url in decorated_urls:
            assert url_handler.is_binary_file(decorated_url) is True

    def test_is_binary_file_html_pages(self):
        """Ordinary page URLs are not reported as binary."""
        url_handler = URLHandler()

        page_urls = (
            "https://example.com/",
            "https://example.com/index.html",
            "https://example.com/page",
            "https://example.com/blog/post",
            "https://example.com/about.htm",
            "https://example.com/contact.php",
        )
        for page_url in page_urls:
            assert url_handler.is_binary_file(page_url) is False

    def test_is_binary_file_edge_cases(self):
        """Dots inside the path and a real-world download URL."""
        url_handler = URLHandler()

        # Periods in the path that are not binary file extensions
        dotted_non_binary_urls = (
            "https://example.com/v1.0/api",
            "https://example.com/jquery.min.js",  # JS files might be crawlable
        )
        for dotted_url in dotted_non_binary_urls:
            assert url_handler.is_binary_file(dotted_url) is False

        # Real-world example from the error
        real_world_zip = "https://docs.crawl4ai.com/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip"
        assert url_handler.is_binary_file(real_world_zip) is True

    def test_is_sitemap(self):
        """Sitemap URLs are recognised; regular pages are not."""
        url_handler = URLHandler()

        sitemap_cases = (
            ("https://example.com/sitemap.xml", True),
            ("https://example.com/path/sitemap.xml", True),
            ("https://example.com/sitemap/index.xml", True),
            ("https://example.com/regular-page", False),
        )
        for candidate_url, expected in sitemap_cases:
            assert url_handler.is_sitemap(candidate_url) is expected

    def test_is_txt(self):
        """Only .txt URLs count as text files."""
        url_handler = URLHandler()

        txt_cases = (
            ("https://example.com/robots.txt", True),
            ("https://example.com/readme.txt", True),
            ("https://example.com/file.pdf", False),
        )
        for candidate_url, expected in txt_cases:
            assert url_handler.is_txt(candidate_url) is expected

    def test_transform_github_url(self):
        """GitHub blob URLs become raw URLs; everything else is returned unchanged."""
        url_handler = URLHandler()

        rewrite_cases = (
            # Blob URL is rewritten to its raw.githubusercontent.com equivalent
            (
                "https://github.com/owner/repo/blob/main/file.py",
                "https://raw.githubusercontent.com/owner/repo/main/file.py",
            ),
            # Non-blob GitHub URL is left untouched
            ("https://github.com/owner/repo", "https://github.com/owner/repo"),
            # Non-GitHub URL is left untouched
            ("https://example.com/file", "https://example.com/file"),
        )
        for source_url, wanted in rewrite_cases:
            assert url_handler.transform_github_url(source_url) == wanted