Fix crawler attempting to navigate to binary files
- Add is_binary_file() method to URLHandler to detect 40+ binary extensions
- Update RecursiveCrawlStrategy to filter binary URLs before they enter the crawl queue
- Add comprehensive unit tests for binary file detection
- Prevents net::ERR_ABORTED errors when the crawler encounters ZIP, PDF, etc.

This fixes the issue where the crawler treated binary file URLs (such as .zip downloads) as navigable web pages, causing errors in crawl4ai.
commit 8157670936
parent ad1b8bf70f
src/server/services/crawling/helpers/url_handler.py

@@ -48,6 +48,54 @@ class URLHandler:
             logger.warning(f"Error checking if URL is text file: {e}")
             return False
 
+    @staticmethod
+    def is_binary_file(url: str) -> bool:
+        """
+        Check if a URL points to a binary file that shouldn't be crawled.
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a binary file, False otherwise
+        """
+        try:
+            # Remove query parameters and fragments for cleaner extension checking
+            parsed = urlparse(url)
+            path = parsed.path.lower()
+
+            # Comprehensive list of binary and non-HTML file extensions
+            binary_extensions = {
+                # Archives
+                '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
+                # Executables and installers
+                '.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
+                # Documents (non-HTML)
+                '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
+                # Images
+                '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
+                # Audio/Video
+                '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
+                # Data files
+                '.csv', '.sql', '.db', '.sqlite',
+                # Binary data
+                '.iso', '.img', '.bin', '.dat',
+                # Development files (usually not meant to be crawled as pages)
+                '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
+            }
+
+            # Check if the path ends with any binary extension
+            for ext in binary_extensions:
+                if path.endswith(ext):
+                    logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
+                    return True
+
+            return False
+        except Exception as e:
+            logger.warning(f"Error checking if URL is binary file: {e}")
+            # In case of error, don't skip the URL (safer to attempt crawl than miss content)
+            return False
+
     @staticmethod
     def transform_github_url(url: str) -> str:
         """
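For reference, a minimal standalone sketch of the same extension check (hypothetical helper name, abbreviated extension set) shows why the method parses the URL before matching: urlparse() strips the query string and fragment, so a download link like file.zip?version=1.0 still matches on .zip.

from urllib.parse import urlparse

def _has_binary_extension(url: str) -> bool:
    # urlparse separates the path from query/fragment, so
    # "file.zip?version=1.0" still ends with ".zip" here
    path = urlparse(url).path.lower()
    return path.endswith((".zip", ".pdf", ".png"))  # abbreviated set for illustration

print(_has_binary_extension("https://example.com/file.zip?version=1.0"))  # True
print(_has_binary_extension("https://example.com/blog/post"))             # False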
In the RecursiveCrawlStrategy module:

@@ -10,6 +10,7 @@ from urllib.parse import urldefrag
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
 from ....config.logfire_config import get_logger
 from ...credential_service import credential_service
+from ..helpers.url_handler import URLHandler
 
 logger = get_logger(__name__)
 
@@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
         """
         self.crawler = crawler
         self.markdown_generator = markdown_generator
+        self.url_handler = URLHandler()
 
     async def crawl_recursive_with_progress(
         self,
@@ -190,8 +192,11 @@ class RecursiveCrawlStrategy:
                 # Find internal links for next depth
                 for link in result.links.get("internal", []):
                     next_url = normalize_url(link["href"])
-                    if next_url not in visited:
+                    # Skip binary files and already visited URLs
+                    if next_url not in visited and not self.url_handler.is_binary_file(next_url):
                         next_level_urls.add(next_url)
+                    elif self.url_handler.is_binary_file(next_url):
+                        logger.debug(f"Skipping binary file from crawl queue: {next_url}")
             else:
                 logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
 
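A condensed sketch of the queue-filtering behavior (standalone form with hypothetical sample data; the real code runs inside crawl_recursive_with_progress and also normalizes URLs first) illustrates what changes at this step: binary links never enter next_level_urls.

from src.server.services.crawling.helpers.url_handler import URLHandler

visited = {"https://example.com/seen"}
links = [
    {"href": "https://example.com/page"},      # crawlable -> queued
    {"href": "https://example.com/file.zip"},  # binary -> skipped
    {"href": "https://example.com/seen"},      # already visited -> skipped
]

next_level_urls = set()
for link in links:
    next_url = link["href"]
    # is_binary_file is a @staticmethod, so it can be called on the class
    if next_url not in visited and not URLHandler.is_binary_file(next_url):
        next_level_urls.add(next_url)

assert next_level_urls == {"https://example.com/page"}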
python/tests/test_url_handler.py (new file, 125 lines)

@@ -0,0 +1,125 @@
"""Unit tests for URLHandler class."""
import pytest
from src.server.services.crawling.helpers.url_handler import URLHandler


class TestURLHandler:
    """Test suite for URLHandler class."""

    def test_is_binary_file_archives(self):
        """Test detection of archive file formats."""
        handler = URLHandler()

        # Should detect various archive formats
        assert handler.is_binary_file("https://example.com/file.zip") is True
        assert handler.is_binary_file("https://example.com/archive.tar.gz") is True
        assert handler.is_binary_file("https://example.com/compressed.rar") is True
        assert handler.is_binary_file("https://example.com/package.7z") is True
        assert handler.is_binary_file("https://example.com/backup.tgz") is True

    def test_is_binary_file_executables(self):
        """Test detection of executable and installer files."""
        handler = URLHandler()

        assert handler.is_binary_file("https://example.com/setup.exe") is True
        assert handler.is_binary_file("https://example.com/installer.dmg") is True
        assert handler.is_binary_file("https://example.com/package.deb") is True
        assert handler.is_binary_file("https://example.com/app.msi") is True
        assert handler.is_binary_file("https://example.com/program.appimage") is True

    def test_is_binary_file_documents(self):
        """Test detection of document files."""
        handler = URLHandler()

        assert handler.is_binary_file("https://example.com/document.pdf") is True
        assert handler.is_binary_file("https://example.com/report.docx") is True
        assert handler.is_binary_file("https://example.com/spreadsheet.xlsx") is True
        assert handler.is_binary_file("https://example.com/presentation.pptx") is True

    def test_is_binary_file_media(self):
        """Test detection of image and media files."""
        handler = URLHandler()

        # Images
        assert handler.is_binary_file("https://example.com/photo.jpg") is True
        assert handler.is_binary_file("https://example.com/image.png") is True
        assert handler.is_binary_file("https://example.com/icon.svg") is True
        assert handler.is_binary_file("https://example.com/favicon.ico") is True

        # Audio/Video
        assert handler.is_binary_file("https://example.com/song.mp3") is True
        assert handler.is_binary_file("https://example.com/video.mp4") is True
        assert handler.is_binary_file("https://example.com/movie.mkv") is True

    def test_is_binary_file_case_insensitive(self):
        """Test that detection is case-insensitive."""
        handler = URLHandler()

        assert handler.is_binary_file("https://example.com/FILE.ZIP") is True
        assert handler.is_binary_file("https://example.com/Document.PDF") is True
        assert handler.is_binary_file("https://example.com/Image.PNG") is True

    def test_is_binary_file_with_query_params(self):
        """Test that query parameters don't affect detection."""
        handler = URLHandler()

        assert handler.is_binary_file("https://example.com/file.zip?version=1.0") is True
        assert handler.is_binary_file("https://example.com/document.pdf?download=true") is True
        assert handler.is_binary_file("https://example.com/image.png#section") is True

    def test_is_binary_file_html_pages(self):
        """Test that HTML pages are not detected as binary."""
        handler = URLHandler()

        # Regular HTML pages should not be detected as binary
        assert handler.is_binary_file("https://example.com/") is False
        assert handler.is_binary_file("https://example.com/index.html") is False
        assert handler.is_binary_file("https://example.com/page") is False
        assert handler.is_binary_file("https://example.com/blog/post") is False
        assert handler.is_binary_file("https://example.com/about.htm") is False
        assert handler.is_binary_file("https://example.com/contact.php") is False

    def test_is_binary_file_edge_cases(self):
        """Test edge cases and special scenarios."""
        handler = URLHandler()

        # URLs with periods in path but not file extensions
        assert handler.is_binary_file("https://example.com/v1.0/api") is False
        assert handler.is_binary_file("https://example.com/jquery.min.js") is False  # JS files might be crawlable

        # Real-world example from the error
        assert handler.is_binary_file("https://docs.crawl4ai.com/apps/crawl4ai-assistant/crawl4ai-assistant-v1.3.0.zip") is True

    def test_is_sitemap(self):
        """Test sitemap detection."""
        handler = URLHandler()

        assert handler.is_sitemap("https://example.com/sitemap.xml") is True
        assert handler.is_sitemap("https://example.com/path/sitemap.xml") is True
        assert handler.is_sitemap("https://example.com/sitemap/index.xml") is True
        assert handler.is_sitemap("https://example.com/regular-page") is False

    def test_is_txt(self):
        """Test text file detection."""
        handler = URLHandler()

        assert handler.is_txt("https://example.com/robots.txt") is True
        assert handler.is_txt("https://example.com/readme.txt") is True
        assert handler.is_txt("https://example.com/file.pdf") is False

    def test_transform_github_url(self):
        """Test GitHub URL transformation."""
        handler = URLHandler()

        # Should transform GitHub blob URLs to raw URLs
        original = "https://github.com/owner/repo/blob/main/file.py"
        expected = "https://raw.githubusercontent.com/owner/repo/main/file.py"
        assert handler.transform_github_url(original) == expected

        # Should not transform non-blob URLs
        non_blob = "https://github.com/owner/repo"
        assert handler.transform_github_url(non_blob) == non_blob

        # Should not transform non-GitHub URLs
        other = "https://example.com/file"
        assert handler.transform_github_url(other) == other
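Assuming the repository's python/ directory is the working directory (so that the src package resolves on the import path), the new suite should run with plain pytest, e.g.:

cd python && pytest tests/test_url_handler.py -v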