From 8792a1b0dd4f761f55aefa2a7f7ff9527a75b722 Mon Sep 17 00:00:00 2001 From: Rasmus Widing Date: Wed, 20 Aug 2025 21:55:58 +0300 Subject: [PATCH] Fix crawler timeout for JavaScript-heavy documentation sites Remove wait_for='body' selector from documentation site crawling config. The body element exists immediately in HTML, causing unnecessary timeouts for JavaScript-rendered content. Now relies on domcontentloaded event and delay_before_return_html for proper JavaScript execution. --- python/src/server/services/crawling/strategies/batch.py | 1 - python/src/server/services/crawling/strategies/recursive.py | 1 - 2 files changed, 2 deletions(-) diff --git a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py index d97b0bc..17932ac 100644 --- a/python/src/server/services/crawling/strategies/batch.py +++ b/python/src/server/services/crawling/strategies/batch.py @@ -91,7 +91,6 @@ class BatchCrawlStrategy: cache_mode=CacheMode.BYPASS, stream=True, # Enable streaming for faster parallel processing markdown_generator=self.markdown_generator, - wait_for="body", # Simple selector for batch wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"), page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")), delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")), diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py index d6eb5b3..d258143 100644 --- a/python/src/server/services/crawling/strategies/recursive.py +++ b/python/src/server/services/crawling/strategies/recursive.py @@ -94,7 +94,6 @@ class RecursiveCrawlStrategy: cache_mode=CacheMode.BYPASS, stream=True, # Enable streaming for faster parallel processing markdown_generator=self.markdown_generator, - wait_for='body', wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"), page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")), delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),