Merge pull request #218 from coleam00/fix/filter-binary-files-from-crawl

Fix crawler attempting to navigate to binary files
This commit is contained in:
Wirasm 2025-08-16 00:39:17 +03:00 committed by GitHub
commit 8743c059bb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 179 additions and 1 deletions

View File

@ -48,6 +48,54 @@ class URLHandler:
logger.warning(f"Error checking if URL is text file: {e}")
return False
@staticmethod
def is_binary_file(url: str) -> bool:
    """
    Decide whether a URL targets a binary or otherwise non-HTML resource.

    Only the path component of the URL is examined, so query strings and
    fragments never influence the outcome, and the comparison is
    case-insensitive.

    Args:
        url: URL to classify

    Returns:
        True when the URL path ends with one of the known binary
        extensions; False otherwise, including when inspecting the URL
        raises an error.
    """
    # Suffixes that identify binary and non-HTML resources. No entry is a
    # suffix of another, so at most one of them can match a given path.
    skipped_suffixes = (
        # Archives
        '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
        # Executables and installers
        '.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
        # Documents (non-HTML)
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
        # Images
        '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
        # Audio/Video
        '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
        # Data files
        '.csv', '.sql', '.db', '.sqlite',
        # Binary data
        '.iso', '.img', '.bin', '.dat',
        # Development files (usually not meant to be crawled as pages)
        '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib',
    )
    try:
        # Working on the parsed path drops the query string and fragment.
        lowered_path = urlparse(url).path.lower()
        ext = next(
            (suffix for suffix in skipped_suffixes if lowered_path.endswith(suffix)),
            None,
        )
        if ext is None:
            return False
        logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
        return True
    except Exception as e:
        # Best effort: on any failure keep the URL, since attempting a crawl
        # is safer than silently missing content.
        logger.warning(f"Error checking if URL is binary file: {e}")
        return False
@staticmethod
def transform_github_url(url: str) -> str:
"""

View File

@ -10,6 +10,7 @@ from urllib.parse import urldefrag
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
from ....config.logfire_config import get_logger
from ...credential_service import credential_service
from ..helpers.url_handler import URLHandler
logger = get_logger(__name__)
@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
"""
self.crawler = crawler
self.markdown_generator = markdown_generator
self.url_handler = URLHandler()
async def crawl_recursive_with_progress(
self,
@ -195,8 +197,11 @@ class RecursiveCrawlStrategy:
# Find internal links for next depth
for link in result.links.get("internal", []):
next_url = normalize_url(link["href"])
if next_url not in visited:
# Skip binary files and already visited URLs
if next_url not in visited and not self.url_handler.is_binary_file(next_url):
next_level_urls.add(next_url)
elif self.url_handler.is_binary_file(next_url):
logger.debug(f"Skipping binary file from crawl queue: {next_url}")
else:
logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")

View File

@ -0,0 +1,125 @@
"""Unit tests for URLHandler class."""
import pytest
from src.server.services.crawling.helpers.url_handler import URLHandler
class TestURLHandler:
    """Checks for URLHandler's URL classification and rewriting helpers."""

    def test_is_binary_file_archives(self):
        """Archive downloads are reported as binary."""
        handler = URLHandler()
        # Should detect various archive formats
        archive_urls = (
            "https://example.com/file.zip",
            "https://example.com/archive.tar.gz",
            "https://example.com/compressed.rar",
            "https://example.com/package.7z",
            "https://example.com/backup.tgz",
        )
        for url in archive_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_executables(self):
        """Executables and installers are reported as binary."""
        handler = URLHandler()
        installer_urls = (
            "https://example.com/setup.exe",
            "https://example.com/installer.dmg",
            "https://example.com/package.deb",
            "https://example.com/app.msi",
            "https://example.com/program.appimage",
        )
        for url in installer_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_documents(self):
        """Office-style and PDF documents are reported as binary."""
        handler = URLHandler()
        document_urls = (
            "https://example.com/document.pdf",
            "https://example.com/report.docx",
            "https://example.com/spreadsheet.xlsx",
            "https://example.com/presentation.pptx",
        )
        for url in document_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_media(self):
        """Image, audio and video files are reported as binary."""
        handler = URLHandler()
        media_urls = (
            # Images
            "https://example.com/photo.jpg",
            "https://example.com/image.png",
            "https://example.com/icon.svg",
            "https://example.com/favicon.ico",
            # Audio/Video
            "https://example.com/song.mp3",
            "https://example.com/video.mp4",
            "https://example.com/movie.mkv",
        )
        for url in media_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_case_insensitive(self):
        """Upper- and mixed-case extensions are still recognised."""
        handler = URLHandler()
        mixed_case_urls = (
            "https://example.com/FILE.ZIP",
            "https://example.com/Document.PDF",
            "https://example.com/Image.PNG",
        )
        for url in mixed_case_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_with_query_params(self):
        """Query strings and fragments do not hide the extension."""
        handler = URLHandler()
        decorated_urls = (
            "https://example.com/file.zip?version=1.0",
            "https://example.com/document.pdf?download=true",
            "https://example.com/image.png#section",
        )
        for url in decorated_urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_html_pages(self):
        """Ordinary page URLs are not reported as binary."""
        handler = URLHandler()
        # Regular HTML pages should not be detected as binary
        page_urls = (
            "https://example.com/",
            "https://example.com/index.html",
            "https://example.com/page",
            "https://example.com/blog/post",
            "https://example.com/about.htm",
            "https://example.com/contact.php",
        )
        for url in page_urls:
            assert handler.is_binary_file(url) is False

    def test_is_binary_file_edge_cases(self):
        """Dotted paths, JS assets and the originally failing URL."""
        handler = URLHandler()
        expectations = (
            # URLs with periods in path but not file extensions
            ("https://example.com/v1.0/api", False),
            # JS files might be crawlable
            ("https://example.com/jquery.min.js", False),
            # Real-world example from the error
            (
                "https://docs.crawl4ai.com/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip",
                True,
            ),
        )
        for url, expected in expectations:
            assert handler.is_binary_file(url) is expected

    def test_is_sitemap(self):
        """Sitemap URLs are recognised; a regular page is not."""
        handler = URLHandler()
        expectations = (
            ("https://example.com/sitemap.xml", True),
            ("https://example.com/path/sitemap.xml", True),
            ("https://example.com/sitemap/index.xml", True),
            ("https://example.com/regular-page", False),
        )
        for url, expected in expectations:
            assert handler.is_sitemap(url) is expected

    def test_is_txt(self):
        """.txt URLs are recognised as text files; a PDF is not."""
        handler = URLHandler()
        expectations = (
            ("https://example.com/robots.txt", True),
            ("https://example.com/readme.txt", True),
            ("https://example.com/file.pdf", False),
        )
        for url, expected in expectations:
            assert handler.is_txt(url) is expected

    def test_transform_github_url(self):
        """GitHub blob URLs become raw URLs; other URLs pass through unchanged."""
        handler = URLHandler()
        rewrites = (
            # Should transform GitHub blob URLs to raw URLs
            (
                "https://github.com/owner/repo/blob/main/file.py",
                "https://raw.githubusercontent.com/owner/repo/main/file.py",
            ),
            # Should not transform non-blob URLs
            ("https://github.com/owner/repo", "https://github.com/owner/repo"),
            # Should not transform non-GitHub URLs
            ("https://example.com/file", "https://example.com/file"),
        )
        for source_url, expected_url in rewrites:
            assert handler.transform_github_url(source_url) == expected_url