From e98f52aa57a571281541679d67da2f641d1b5dfd Mon Sep 17 00:00:00 2001
From: Rasmus Widing
Date: Fri, 15 Aug 2025 16:02:00 +0300
Subject: [PATCH] Address code review feedback: improve error handling and
 documentation

- Implement fail-fast error handling for configuration errors
- Distinguish between critical config errors (fail) and network issues (use defaults)
- Add detailed error logging with stack traces for debugging
- Document new crawler settings in .env.example
- Add inline comments explaining safe defaults

Critical configuration errors (ValueError, KeyError, TypeError) now fail fast
as per alpha principles, while transient errors still fall back to safe
defaults with prominent error logging.
---
 .env.example                                          |  7 ++++++-
 .../src/server/services/crawling/strategies/batch.py | 11 ++++++++---
 .../server/services/crawling/strategies/recursive.py | 11 ++++++++---
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/.env.example b/.env.example
index f2cf73e..981f1da 100644
--- a/.env.example
+++ b/.env.example
@@ -33,4 +33,9 @@ EMBEDDING_DIMENSIONS=1536
 # - OPENAI_API_KEY (encrypted)
 # - MODEL_CHOICE
 # - TRANSPORT settings
-# - RAG strategy flags (USE_CONTEXTUAL_EMBEDDINGS, USE_HYBRID_SEARCH, etc.)
\ No newline at end of file
+# - RAG strategy flags (USE_CONTEXTUAL_EMBEDDINGS, USE_HYBRID_SEARCH, etc.)
+# - Crawler settings:
+#   * CRAWL_MAX_CONCURRENT (default: 10) - Max concurrent pages per crawl operation
+#   * CRAWL_BATCH_SIZE (default: 50) - URLs processed per batch
+#   * MEMORY_THRESHOLD_PERCENT (default: 80) - Memory % before throttling
+#   * DISPATCHER_CHECK_INTERVAL (default: 0.5) - Memory check interval in seconds
\ No newline at end of file
diff --git a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py
index a17124d..d97b0bc 100644
--- a/python/src/server/services/crawling/strategies/batch.py
+++ b/python/src/server/services/crawling/strategies/batch.py
@@ -59,7 +59,7 @@ class BatchCrawlStrategy:
             await progress_callback("error", 0, "Crawler not available")
             return []
 
-        # Load settings from database first
+        # Load settings from database - fail fast on configuration errors
         try:
             settings = await credential_service.get_credentials_by_category("rag_strategy")
             batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
@@ -67,11 +67,16 @@ class BatchCrawlStrategy:
                 max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
             memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
             check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
+        except (ValueError, KeyError, TypeError) as e:
+            # Critical configuration errors should fail fast in alpha
+            logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
+            raise ValueError(f"Failed to load crawler configuration: {e}")
         except Exception as e:
-            logger.warning(f"Failed to load crawl settings: {e}, using defaults")
+            # For non-critical errors (e.g., network issues), use defaults but log prominently
+            logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
             batch_size = 50
             if max_concurrent is None:
-                max_concurrent = 10
+                max_concurrent = 10  # Safe default to prevent memory issues
             memory_threshold = 80.0
             check_interval = 0.5
             settings = {}  # Empty dict for defaults
diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py
index 675c97f..8b9cf93 100644
--- a/python/src/server/services/crawling/strategies/recursive.py
+++ b/python/src/server/services/crawling/strategies/recursive.py
@@ -61,7 +61,7 @@ class RecursiveCrawlStrategy:
             await progress_callback('error', 0, 'Crawler not available')
             return []
 
-        # Load settings from database
+        # Load settings from database - fail fast on configuration errors
         try:
             settings = await credential_service.get_credentials_by_category("rag_strategy")
             batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
@@ -69,11 +69,16 @@ class RecursiveCrawlStrategy:
                 max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
             memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
             check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
+        except (ValueError, KeyError, TypeError) as e:
+            # Critical configuration errors should fail fast in alpha
+            logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
+            raise ValueError(f"Failed to load crawler configuration: {e}")
         except Exception as e:
-            logger.warning(f"Failed to load crawl settings: {e}, using defaults")
+            # For non-critical errors (e.g., network issues), use defaults but log prominently
+            logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
             batch_size = 50
             if max_concurrent is None:
-                max_concurrent = 10
+                max_concurrent = 10  # Safe default to prevent memory issues
             memory_threshold = 80.0
             check_interval = 0.5
             settings = {}  # Empty dict for defaults
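
Reviewer note (outside the diff): the sketch below restates the error-handling split from the two hunks above as a single, self-contained example. The names fetch_settings and load_crawl_settings are hypothetical stand-ins for illustration only; the real code calls credential_service.get_credentials_by_category("rag_strategy") inline within each strategy, exactly as shown in the hunks.

    # Minimal sketch of the fail-fast vs. fall-back split, assuming a
    # hypothetical async settings source in place of credential_service.
    import asyncio
    import logging

    logger = logging.getLogger(__name__)


    async def fetch_settings() -> dict[str, str]:
        """Hypothetical stand-in for the credential service lookup."""
        return {"CRAWL_BATCH_SIZE": "50", "CRAWL_MAX_CONCURRENT": "10"}


    async def load_crawl_settings(max_concurrent: int | None = None) -> dict:
        try:
            settings = await fetch_settings()
            batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
            if max_concurrent is None:
                max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
            memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
            check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
        except (ValueError, KeyError, TypeError) as e:
            # Malformed configuration: fail fast rather than crawl with bad limits.
            logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
            raise ValueError(f"Failed to load crawler configuration: {e}")
        except Exception as e:
            # Transient failure (e.g. settings store unreachable): fall back to
            # safe defaults, but log at ERROR level so it is not silently missed.
            logger.error(f"Failed to load crawl settings: {e}, using defaults", exc_info=True)
            batch_size = 50
            if max_concurrent is None:
                max_concurrent = 10  # conservative default to limit memory use
            memory_threshold = 80.0
            check_interval = 0.5
        return {
            "batch_size": batch_size,
            "max_concurrent": max_concurrent,
            "memory_threshold": memory_threshold,
            "check_interval": check_interval,
        }


    if __name__ == "__main__":
        print(asyncio.run(load_crawl_settings()))

The point of the split is that a ValueError, KeyError, or TypeError means the stored configuration itself is wrong and retrying will not help, so the crawl aborts immediately; any other exception is treated as transient and the defaults keep the crawler usable while the failure is still surfaced in the logs with a stack trace.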