Hotfix - crawls hanging after embedding rate limiting

2025-08-30 08:20:02 -05:00 · 2025-08-30 08:20:02 -05:00 · 96c84b5cf2
commit 96c84b5cf2
parent 3e204b0be1
3 changed files with 89 additions and 23 deletions
--- a/python/src/server/services/embeddings/embedding_service.py
+++ b/python/src/server/services/embeddings/embedding_service.py
@ -205,8 +205,25 @@ async def create_embeddings_batch(
                        batch_tokens = sum(len(text.split()) for text in batch) * 1.3
                        total_tokens_used += batch_tokens

+                        # Create rate limit progress callback if we have a progress callback
+                        rate_limit_callback = None
+                        if progress_callback or websocket:
+                            async def rate_limit_callback(data: dict):
+                                # Send heartbeat during rate limit wait
+                                if progress_callback:
+                                    processed = result.success_count + result.failure_count
+                                    message = f"Rate limited: {data.get('message', 'Waiting...')}"
+                                    await progress_callback(message, (processed / len(texts)) * 100)
+                                
+                                if websocket:
+                                    await websocket.send_json({
+                                        "type": "rate_limit_wait",
+                                        "message": data.get("message", "Rate limited, waiting..."),
+                                        "remaining_seconds": data.get("remaining_seconds", 0)
+                                    })
+
                        # Rate limit each batch
-                        async with threading_service.rate_limited_operation(batch_tokens):
+                        async with threading_service.rate_limited_operation(batch_tokens, rate_limit_callback):
                            retry_count = 0
                            max_retries = 3

--- a/python/src/server/services/storage/document_storage_service.py
+++ b/python/src/server/services/storage/document_storage_service.py
@ -233,9 +233,23 @@ async def add_documents_to_supabase(
                # If not using contextual embeddings, use original contents
                contextual_contents = batch_contents

-            # Create embeddings for the batch - no progress reporting
-            # Don't pass websocket to avoid Socket.IO issues
-            result = await create_embeddings_batch(contextual_contents, provider=provider)
+            # Create embeddings for the batch with rate limit progress support
+            # Create a wrapper for progress callback to handle rate limiting updates
+            async def embedding_progress_wrapper(message: str, percentage: float):
+                # Forward rate limiting messages to the main progress callback
+                if progress_callback and "rate limit" in message.lower():
+                    await progress_callback(
+                        message,
+                        current_percentage,  # Use current batch progress
+                        {"batch": batch_num, "type": "rate_limit_wait"}
+                    )
+            
+            # Pass progress callback for rate limiting updates
+            result = await create_embeddings_batch(
+                contextual_contents,
+                provider=provider,
+                progress_callback=embedding_progress_wrapper if progress_callback else None
+            )

            # Log any failures
            if result.has_failures:
--- a/python/src/server/services/threading_service.py
+++ b/python/src/server/services/threading_service.py
@ -84,16 +84,30 @@ class RateLimiter:
        self.semaphore = asyncio.Semaphore(config.max_concurrent)
        self._lock = asyncio.Lock()

-    async def acquire(self, estimated_tokens: int = 8000) -> bool:
-        """Acquire permission to make API call with token awareness"""
-        async with self._lock:
-            now = time.time()
+    async def acquire(self, estimated_tokens: int = 8000, progress_callback: Callable | None = None) -> bool:
+        """Acquire permission to make API call with token awareness
+        
+        Args:
+            estimated_tokens: Estimated number of tokens for the operation
+            progress_callback: Optional async callback for progress updates during wait
+        """
+        while True:  # Loop instead of recursion to avoid stack overflow
+            wait_time_to_sleep = None
+            
+            async with self._lock:
+                now = time.time()

-            # Clean old entries
-            self._clean_old_entries(now)
+                # Clean old entries
+                self._clean_old_entries(now)

-            # Check if we can make the request
-            if not self._can_make_request(estimated_tokens):
+                # Check if we can make the request
+                if self._can_make_request(estimated_tokens):
+                    # Record the request
+                    self.request_times.append(now)
+                    self.token_usage.append((now, estimated_tokens))
+                    return True
+                
+                # Calculate wait time if we can't make the request
                wait_time = self._calculate_wait_time(estimated_tokens)
                if wait_time > 0:
                    logfire_logger.info(
@ -103,14 +117,30 @@ class RateLimiter:
                            "current_usage": self._get_current_usage(),
                        }
                    )
-                    await asyncio.sleep(wait_time)
-                    return await self.acquire(estimated_tokens)
-                return False
-
-            # Record the request
-            self.request_times.append(now)
-            self.token_usage.append((now, estimated_tokens))
-            return True
+                    wait_time_to_sleep = wait_time
+                else:
+                    return False
+            
+            # Sleep outside the lock to avoid deadlock
+            if wait_time_to_sleep is not None:
+                # For long waits, break into smaller chunks with progress updates
+                if wait_time_to_sleep > 5 and progress_callback:
+                    chunks = int(wait_time_to_sleep / 5)  # 5 second chunks
+                    for i in range(chunks):
+                        await asyncio.sleep(5)
+                        remaining = wait_time_to_sleep - (i + 1) * 5
+                        if progress_callback:
+                            await progress_callback({
+                                "type": "rate_limit_wait",
+                                "remaining_seconds": max(0, remaining),
+                                "message": f"waiting {max(0, remaining):.1f}s more..."
+                            })
+                    # Sleep any remaining time
+                    if wait_time_to_sleep % 5 > 0:
+                        await asyncio.sleep(wait_time_to_sleep % 5)
+                else:
+                    await asyncio.sleep(wait_time_to_sleep)
+                # Continue the loop to try again

    def _can_make_request(self, estimated_tokens: int) -> bool:
        """Check if request can be made within limits"""
@ -510,10 +540,15 @@ class ThreadingService:
        logfire_logger.info("Threading service stopped")

    @asynccontextmanager
-    async def rate_limited_operation(self, estimated_tokens: int = 8000):
-        """Context manager for rate-limited operations"""
+    async def rate_limited_operation(self, estimated_tokens: int = 8000, progress_callback: Callable | None = None):
+        """Context manager for rate-limited operations
+        
+        Args:
+            estimated_tokens: Estimated number of tokens for the operation
+            progress_callback: Optional async callback for progress updates during wait
+        """
        async with self.rate_limiter.semaphore:
-            can_proceed = await self.rate_limiter.acquire(estimated_tokens)
+            can_proceed = await self.rate_limiter.acquire(estimated_tokens, progress_callback)
            if not can_proceed:
                raise Exception("Rate limit exceeded")