Address code review feedback: improve error handling and documentation

- Implement fail-fast error handling for configuration errors
- Distinguish between critical config errors (fail) and network issues (use defaults)
- Add detailed error logging with stack traces for debugging
- Document new crawler settings in .env.example
- Add inline comments explaining safe defaults

Critical configuration errors (ValueError, KeyError, TypeError) now fail fast
per alpha principles, while transient errors (e.g. the settings database being
unreachable) still fall back to safe defaults with prominent error logging.
Rasmus Widing 2025-08-15 16:02:00 +03:00
parent aab0721f0c
commit e98f52aa57
3 changed files with 22 additions and 7 deletions
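The commit message's distinction between critical and transient errors comes down to two except branches around the settings lookup. Below is a minimal, synchronous sketch of that pattern for a single setting; the real strategy code in the diffs is async and reads from credential_service, and fetch_settings here is only a stand-in for that lookup.

import logging

logger = logging.getLogger(__name__)

def load_batch_size(fetch_settings) -> int:
    try:
        settings = fetch_settings()  # stand-in for the database-backed settings lookup
        return int(settings.get("CRAWL_BATCH_SIZE", "50"))
    except (ValueError, KeyError, TypeError) as e:
        # Malformed configuration: fail fast instead of crawling with the wrong limits
        logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
        raise ValueError(f"Failed to load crawler configuration: {e}")
    except Exception as e:
        # Transient failure (e.g. the settings store is unreachable): log loudly, use the safe default
        logger.error(f"Failed to load crawl settings: {e}, using defaults", exc_info=True)
        return 50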


@@ -33,4 +33,9 @@ EMBEDDING_DIMENSIONS=1536
# - OPENAI_API_KEY (encrypted)
# - MODEL_CHOICE
# - TRANSPORT settings
# - RAG strategy flags (USE_CONTEXTUAL_EMBEDDINGS, USE_HYBRID_SEARCH, etc.)
# - Crawler settings:
# * CRAWL_MAX_CONCURRENT (default: 10) - Max concurrent pages per crawl operation
# * CRAWL_BATCH_SIZE (default: 50) - URLs processed per batch
# * MEMORY_THRESHOLD_PERCENT (default: 80) - Memory % before throttling
# * DISPATCHER_CHECK_INTERVAL (default: 0.5) - Memory check interval in seconds
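In the strategy code below, these values come back from the credential store's rag_strategy category as strings and are coerced with int()/float(), which is why a malformed value surfaces as a ValueError. A small illustration, using a hypothetical returned dict:

# Hypothetical result of credential_service.get_credentials_by_category("rag_strategy")
settings = {
    "CRAWL_MAX_CONCURRENT": "10",
    "CRAWL_BATCH_SIZE": "fifty",  # malformed on purpose
    "MEMORY_THRESHOLD_PERCENT": "80",
    "DISPATCHER_CHECK_INTERVAL": "0.5",
}

max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))  # -> 10
batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))          # -> ValueError: fail fast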


@@ -59,7 +59,7 @@ class BatchCrawlStrategy:
await progress_callback("error", 0, "Crawler not available")
return []
# Load settings from database first
# Load settings from database - fail fast on configuration errors
try:
settings = await credential_service.get_credentials_by_category("rag_strategy")
batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
@@ -67,11 +67,16 @@ class BatchCrawlStrategy:
max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
except (ValueError, KeyError, TypeError) as e:
# Critical configuration errors should fail fast in alpha
logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
raise ValueError(f"Failed to load crawler configuration: {e}")
except Exception as e:
logger.warning(f"Failed to load crawl settings: {e}, using defaults")
# For non-critical errors (e.g., network issues), use defaults but log prominently
logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
batch_size = 50
if max_concurrent is None:
max_concurrent = 10
max_concurrent = 10 # Safe default to prevent memory issues
memory_threshold = 80.0
check_interval = 0.5
settings = {} # Empty dict for defaults
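With this change, configuration problems propagate out of the strategy as a ValueError instead of being silently absorbed, so calling code needs to expect the new failure mode. A hedged sketch of a caller, where crawl() is a placeholder for whatever entry point the strategy actually exposes (not shown in this diff):

import logging

logger = logging.getLogger(__name__)

async def run_strategy(strategy, urls, progress_callback):
    try:
        # crawl() is a placeholder name; the real method signature is not part of this diff
        return await strategy.crawl(urls, progress_callback)
    except ValueError as e:
        # Invalid crawler configuration: report and stop rather than crawl with bad limits
        logger.error(f"Crawl aborted due to invalid configuration: {e}", exc_info=True)
        await progress_callback("error", 0, f"Invalid crawler configuration: {e}")
        raise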


@@ -61,7 +61,7 @@ class RecursiveCrawlStrategy:
await progress_callback('error', 0, 'Crawler not available')
return []
# Load settings from database
# Load settings from database - fail fast on configuration errors
try:
settings = await credential_service.get_credentials_by_category("rag_strategy")
batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
@@ -69,11 +69,16 @@ class RecursiveCrawlStrategy:
max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
except (ValueError, KeyError, TypeError) as e:
# Critical configuration errors should fail fast in alpha
logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
raise ValueError(f"Failed to load crawler configuration: {e}")
except Exception as e:
logger.warning(f"Failed to load crawl settings: {e}, using defaults")
# For non-critical errors (e.g., network issues), use defaults but log prominently
logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
batch_size = 50
if max_concurrent is None:
max_concurrent = 10
max_concurrent = 10 # Safe default to prevent memory issues
memory_threshold = 80.0
check_interval = 0.5
settings = {} # Empty dict for defaults
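The settings-loading block is now duplicated verbatim between BatchCrawlStrategy and RecursiveCrawlStrategy. A possible follow-up, sketched here rather than included in this commit, would be a shared helper that keeps the fail-fast/fallback policy in one place:

import logging

logger = logging.getLogger(__name__)

# Safe defaults mirroring the values documented in .env.example
CRAWL_DEFAULTS = {
    "batch_size": 50,
    "max_concurrent": 10,
    "memory_threshold": 80.0,
    "check_interval": 0.5,
}

async def load_crawler_settings(credential_service) -> dict:
    try:
        settings = await credential_service.get_credentials_by_category("rag_strategy")
        return {
            "batch_size": int(settings.get("CRAWL_BATCH_SIZE", "50")),
            "max_concurrent": int(settings.get("CRAWL_MAX_CONCURRENT", "10")),
            "memory_threshold": float(settings.get("MEMORY_THRESHOLD_PERCENT", "80")),
            "check_interval": float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5")),
        }
    except (ValueError, KeyError, TypeError) as e:
        # Malformed configuration: fail fast, matching both strategies
        logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
        raise ValueError(f"Failed to load crawler configuration: {e}")
    except Exception as e:
        # Transient failure: fall back to safe defaults, but log prominently
        logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
        return dict(CRAWL_DEFAULTS)

Each strategy would then unpack the returned dict, so the error-handling policy stays consistent if it changes again.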