Merge pull request #218 from coleam00/fix/filter-binary-files-from-crawl
Fix crawler attempting to navigate to binary files
This commit is contained in:
commit
8743c059bb
@ -48,6 +48,54 @@ class URLHandler:
|
||||
logger.warning(f"Error checking if URL is text file: {e}")
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def is_binary_file(url: str) -> bool:
|
||||
"""
|
||||
Check if a URL points to a binary file that shouldn't be crawled.
|
||||
|
||||
Args:
|
||||
url: URL to check
|
||||
|
||||
Returns:
|
||||
True if URL is a binary file, False otherwise
|
||||
"""
|
||||
try:
|
||||
# Remove query parameters and fragments for cleaner extension checking
|
||||
parsed = urlparse(url)
|
||||
path = parsed.path.lower()
|
||||
|
||||
# Comprehensive list of binary and non-HTML file extensions
|
||||
binary_extensions = {
|
||||
# Archives
|
||||
'.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
|
||||
# Executables and installers
|
||||
'.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
|
||||
# Documents (non-HTML)
|
||||
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
|
||||
# Images
|
||||
'.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
|
||||
# Audio/Video
|
||||
'.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
|
||||
# Data files
|
||||
'.csv', '.sql', '.db', '.sqlite',
|
||||
# Binary data
|
||||
'.iso', '.img', '.bin', '.dat',
|
||||
# Development files (usually not meant to be crawled as pages)
|
||||
'.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
|
||||
}
|
||||
|
||||
# Check if the path ends with any binary extension
|
||||
for ext in binary_extensions:
|
||||
if path.endswith(ext):
|
||||
logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
|
||||
return True
|
||||
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.warning(f"Error checking if URL is binary file: {e}")
|
||||
# In case of error, don't skip the URL (safer to attempt crawl than miss content)
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def transform_github_url(url: str) -> str:
|
||||
"""
|
||||
|
||||
@ -10,6 +10,7 @@ from urllib.parse import urldefrag
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
|
||||
from ....config.logfire_config import get_logger
|
||||
from ...credential_service import credential_service
|
||||
from ..helpers.url_handler import URLHandler
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
|
||||
"""
|
||||
self.crawler = crawler
|
||||
self.markdown_generator = markdown_generator
|
||||
self.url_handler = URLHandler()
|
||||
|
||||
async def crawl_recursive_with_progress(
|
||||
self,
|
||||
@ -195,8 +197,11 @@ class RecursiveCrawlStrategy:
|
||||
# Find internal links for next depth
|
||||
for link in result.links.get("internal", []):
|
||||
next_url = normalize_url(link["href"])
|
||||
if next_url not in visited:
|
||||
# Skip binary files and already visited URLs
|
||||
if next_url not in visited and not self.url_handler.is_binary_file(next_url):
|
||||
next_level_urls.add(next_url)
|
||||
elif self.url_handler.is_binary_file(next_url):
|
||||
logger.debug(f"Skipping binary file from crawl queue: {next_url}")
|
||||
else:
|
||||
logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
|
||||
|
||||
|
||||
125
python/tests/test_url_handler.py
Normal file
125
python/tests/test_url_handler.py
Normal file
@ -0,0 +1,125 @@
|
||||
"""Unit tests for URLHandler class."""
|
||||
import pytest
|
||||
from src.server.services.crawling.helpers.url_handler import URLHandler
|
||||
|
||||
|
||||
class TestURLHandler:
    """Test suite for URLHandler class."""

    def _assert_all_binary(self, urls):
        # Shared helper: every URL in *urls* must be classified as binary.
        handler = URLHandler()
        for url in urls:
            assert handler.is_binary_file(url) is True

    def test_is_binary_file_archives(self):
        """Test detection of archive file formats."""
        self._assert_all_binary([
            "https://example.com/file.zip",
            "https://example.com/archive.tar.gz",
            "https://example.com/compressed.rar",
            "https://example.com/package.7z",
            "https://example.com/backup.tgz",
        ])

    def test_is_binary_file_executables(self):
        """Test detection of executable and installer files."""
        self._assert_all_binary([
            "https://example.com/setup.exe",
            "https://example.com/installer.dmg",
            "https://example.com/package.deb",
            "https://example.com/app.msi",
            "https://example.com/program.appimage",
        ])

    def test_is_binary_file_documents(self):
        """Test detection of document files."""
        self._assert_all_binary([
            "https://example.com/document.pdf",
            "https://example.com/report.docx",
            "https://example.com/spreadsheet.xlsx",
            "https://example.com/presentation.pptx",
        ])

    def test_is_binary_file_media(self):
        """Test detection of image and media files."""
        image_urls = [
            "https://example.com/photo.jpg",
            "https://example.com/image.png",
            "https://example.com/icon.svg",
            "https://example.com/favicon.ico",
        ]
        av_urls = [
            "https://example.com/song.mp3",
            "https://example.com/video.mp4",
            "https://example.com/movie.mkv",
        ]
        self._assert_all_binary(image_urls + av_urls)

    def test_is_binary_file_case_insensitive(self):
        """Test that detection is case-insensitive."""
        self._assert_all_binary([
            "https://example.com/FILE.ZIP",
            "https://example.com/Document.PDF",
            "https://example.com/Image.PNG",
        ])

    def test_is_binary_file_with_query_params(self):
        """Test that query parameters don't affect detection."""
        self._assert_all_binary([
            "https://example.com/file.zip?version=1.0",
            "https://example.com/document.pdf?download=true",
            "https://example.com/image.png#section",
        ])

    def test_is_binary_file_html_pages(self):
        """Test that HTML pages are not detected as binary."""
        handler = URLHandler()
        crawlable_urls = [
            "https://example.com/",
            "https://example.com/index.html",
            "https://example.com/page",
            "https://example.com/blog/post",
            "https://example.com/about.htm",
            "https://example.com/contact.php",
        ]
        for url in crawlable_urls:
            assert handler.is_binary_file(url) is False

    def test_is_binary_file_edge_cases(self):
        """Test edge cases and special scenarios."""
        handler = URLHandler()

        # Periods in the path that are not file extensions must not trip
        # the detector; JS files might be crawlable.
        assert handler.is_binary_file("https://example.com/v1.0/api") is False
        assert handler.is_binary_file("https://example.com/jquery.min.js") is False

        # Real-world example from the error
        assert handler.is_binary_file("https://docs.crawl4ai.com/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip") is True

    def test_is_sitemap(self):
        """Test sitemap detection."""
        handler = URLHandler()

        sitemap_urls = [
            "https://example.com/sitemap.xml",
            "https://example.com/path/sitemap.xml",
            "https://example.com/sitemap/index.xml",
        ]
        for url in sitemap_urls:
            assert handler.is_sitemap(url) is True
        assert handler.is_sitemap("https://example.com/regular-page") is False

    def test_is_txt(self):
        """Test text file detection."""
        handler = URLHandler()

        assert handler.is_txt("https://example.com/robots.txt") is True
        assert handler.is_txt("https://example.com/readme.txt") is True
        assert handler.is_txt("https://example.com/file.pdf") is False

    def test_transform_github_url(self):
        """Test GitHub URL transformation."""
        handler = URLHandler()

        # Blob URLs become raw URLs; everything else passes through untouched.
        cases = [
            ("https://github.com/owner/repo/blob/main/file.py",
             "https://raw.githubusercontent.com/owner/repo/main/file.py"),
            ("https://github.com/owner/repo",
             "https://github.com/owner/repo"),
            ("https://example.com/file",
             "https://example.com/file"),
        ]
        for original, expected in cases:
            assert handler.transform_github_url(original) == expected
|
||||
Loading…
Reference in New Issue
Block a user