Address code review feedback: improve error handling and documentation

- Implement fail-fast error handling for configuration errors
- Distinguish between critical config errors (fail) and network issues (use defaults)
- Add detailed error logging with stack traces for debugging
- Document new crawler settings in .env.example
- Add inline comments explaining safe defaults

Critical configuration errors (ValueError, KeyError, TypeError) now fail fast
per alpha principles, while transient errors (e.g. the settings database being
unreachable) still fall back to safe defaults with prominent error logging.
Rasmus Widing 2025-08-15 16:02:00 +03:00
parent aab0721f0c
commit e98f52aa57
3 changed files with 22 additions and 7 deletions
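The commit message's distinction between critical and transient errors comes down to two except branches around the settings lookup. Below is a minimal, synchronous sketch of that pattern for a single setting; the real strategy code in the diffs is async and reads from credential_service, and fetch_settings here is only a stand-in for that lookup.

import logging

logger = logging.getLogger(__name__)

def load_batch_size(fetch_settings) -> int:
    try:
        settings = fetch_settings()  # stand-in for the database-backed settings lookup
        return int(settings.get("CRAWL_BATCH_SIZE", "50"))
    except (ValueError, KeyError, TypeError) as e:
        # Malformed configuration: fail fast instead of crawling with the wrong limits
        logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
        raise ValueError(f"Failed to load crawler configuration: {e}")
    except Exception as e:
        # Transient failure (e.g. the settings store is unreachable): log loudly, use the safe default
        logger.error(f"Failed to load crawl settings: {e}, using defaults", exc_info=True)
        return 50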


@@ -33,4 +33,9 @@ EMBEDDING_DIMENSIONS=1536
# - OPENAI_API_KEY (encrypted)
# - MODEL_CHOICE
# - TRANSPORT settings
# - RAG strategy flags (USE_CONTEXTUAL_EMBEDDINGS, USE_HYBRID_SEARCH, etc.)
# - Crawler settings:
# * CRAWL_MAX_CONCURRENT (default: 10) - Max concurrent pages per crawl operation
# * CRAWL_BATCH_SIZE (default: 50) - URLs processed per batch
# * MEMORY_THRESHOLD_PERCENT (default: 80) - Memory % before throttling
# * DISPATCHER_CHECK_INTERVAL (default: 0.5) - Memory check interval in seconds
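In the strategy code below, these values come back from the credential store's rag_strategy category as strings and are coerced with int()/float(), which is why a malformed value surfaces as a ValueError. A small illustration, using a hypothetical returned dict:

# Hypothetical result of credential_service.get_credentials_by_category("rag_strategy")
settings = {
    "CRAWL_MAX_CONCURRENT": "10",
    "CRAWL_BATCH_SIZE": "fifty",  # malformed on purpose
    "MEMORY_THRESHOLD_PERCENT": "80",
    "DISPATCHER_CHECK_INTERVAL": "0.5",
}

max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))  # -> 10
batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))          # -> ValueError: fail fast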


@@ -59,7 +59,7 @@ class BatchCrawlStrategy:
await progress_callback("error", 0, "Crawler not available")
return []
# Load settings from database first
# Load settings from database - fail fast on configuration errors
try:
settings = await credential_service.get_credentials_by_category("rag_strategy")
batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
@@ -67,11 +67,16 @@ class BatchCrawlStrategy:
max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
except (ValueError, KeyError, TypeError) as e:
# Critical configuration errors should fail fast in alpha
logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
raise ValueError(f"Failed to load crawler configuration: {e}")
except Exception as e:
logger.warning(f"Failed to load crawl settings: {e}, using defaults")
# For non-critical errors (e.g., network issues), use defaults but log prominently
logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
batch_size = 50
if max_concurrent is None:
max_concurrent = 10
max_concurrent = 10 # Safe default to prevent memory issues
memory_threshold = 80.0
check_interval = 0.5
settings = {} # Empty dict for defaults
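With this change, configuration problems propagate out of the strategy as a ValueError instead of being silently absorbed, so calling code needs to expect the new failure mode. A hedged sketch of a caller, where crawl() is a placeholder for whatever entry point the strategy actually exposes (not shown in this diff):

import logging

logger = logging.getLogger(__name__)

async def run_strategy(strategy, urls, progress_callback):
    try:
        # crawl() is a placeholder name; the real method signature is not part of this diff
        return await strategy.crawl(urls, progress_callback)
    except ValueError as e:
        # Invalid crawler configuration: report and stop rather than crawl with bad limits
        logger.error(f"Crawl aborted due to invalid configuration: {e}", exc_info=True)
        await progress_callback("error", 0, f"Invalid crawler configuration: {e}")
        raise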


@@ -61,7 +61,7 @@ class RecursiveCrawlStrategy:
await progress_callback('error', 0, 'Crawler not available')
return []
# Load settings from database
# Load settings from database - fail fast on configuration errors
try:
settings = await credential_service.get_credentials_by_category("rag_strategy")
batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
@@ -69,11 +69,16 @@ class RecursiveCrawlStrategy:
max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
except (ValueError, KeyError, TypeError) as e:
# Critical configuration errors should fail fast in alpha
logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
raise ValueError(f"Failed to load crawler configuration: {e}")
except Exception as e:
logger.warning(f"Failed to load crawl settings: {e}, using defaults")
# For non-critical errors (e.g., network issues), use defaults but log prominently
logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
batch_size = 50
if max_concurrent is None:
max_concurrent = 10
max_concurrent = 10 # Safe default to prevent memory issues
memory_threshold = 80.0
check_interval = 0.5
settings = {} # Empty dict for defaults
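The settings-loading block is now duplicated verbatim between BatchCrawlStrategy and RecursiveCrawlStrategy. A possible follow-up, sketched here rather than included in this commit, would be a shared helper that keeps the fail-fast/fallback policy in one place:

import logging

logger = logging.getLogger(__name__)

# Safe defaults mirroring the values documented in .env.example
CRAWL_DEFAULTS = {
    "batch_size": 50,
    "max_concurrent": 10,
    "memory_threshold": 80.0,
    "check_interval": 0.5,
}

async def load_crawler_settings(credential_service) -> dict:
    try:
        settings = await credential_service.get_credentials_by_category("rag_strategy")
        return {
            "batch_size": int(settings.get("CRAWL_BATCH_SIZE", "50")),
            "max_concurrent": int(settings.get("CRAWL_MAX_CONCURRENT", "10")),
            "memory_threshold": float(settings.get("MEMORY_THRESHOLD_PERCENT", "80")),
            "check_interval": float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5")),
        }
    except (ValueError, KeyError, TypeError) as e:
        # Malformed configuration: fail fast, matching both strategies
        logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
        raise ValueError(f"Failed to load crawler configuration: {e}")
    except Exception as e:
        # Transient failure: fall back to safe defaults, but log prominently
        logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
        return dict(CRAWL_DEFAULTS)

Each strategy would then unpack the returned dict, so the error-handling policy stays consistent if it changes again.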