From 81720674132f512f98ce70c7e873c99d0b3f27b3 Mon Sep 17 00:00:00 2001
From: Josh
Date: Sat, 6 Sep 2025 14:33:00 -0500
Subject: [PATCH] Fix/(llms.txt) not crawling links inside of file (#437)

* Fixed llms.txt / llms-full.txt / llms.md etc. so they are finally crawled. The crawler now intelligently determines whether there are links in the llms.txt and crawls them as it should. Fully tested, everything works!

* Updated CodeRabbit's suggestion - resolved.

* Refined per CodeRabbit's suggestions, take 2; should be the final take. Didn't add the max-link parameter suggestion, though.

* Third time's the charm: added the nitpicky thing from CodeRabbit. CodeRabbit makes me crave nicotine.

* Fixed progress bar accuracy and OpenAI API compatibility issues.

  Changes made:
  1. Progress bar fix: llms.txt crawling progress jumped to 90% and then regressed to 45%; fixed by adjusting the batch-crawling progress ranges (20-30% instead of 40-90%) and using consistent ProgressMapper ranges.
  2. OpenAI API compatibility: added robust fallback logic in the contextual embedding service to handle newer models (GPT-5) that require max_completion_tokens instead of max_tokens and don't support custom temperature values.

  Files modified:
  - src/server/services/crawling/crawling_service.py - fixed progress ranges
  - src/server/services/crawling/progress_mapper.py - restored original stage ranges
  - src/server/services/embeddings/contextual_embedding_service.py - added fallback API logic

  Result:
  - The progress bar now progresses smoothly through 0-30% (crawling), 35-80% (storage), and on to 100%.
  - Automatic compatibility with both older (GPT-4.1-nano) and newer (GPT-5-nano) OpenAI models.
  - Eliminates "max_tokens not supported" and "temperature not supported" errors.

* Removed the GPT-5 handling since that's a separate issue and doesn't pertain to this PR. Definitely recommend looking at it, though, since GPT-5-nano is considered a reasoning model, doesn't use max_tokens, and requires a different output. Also removed my upsert fix from document storage since that's not part of this exact issue and I have another PR open for it. Checked with CodeRabbit in my IDE: no issues, no nitpicks. Should be good? It might flag the UPSERT logic not being in here; oh well, that has nothing to do with this PR and was submitted in the last revision by mistake. Everything's tested and good to go!

* Fixed the llms-full.txt crawling issue: the crawler now crawls just that page when crawling llms-full.txt. Fixed the crawl showing 100% when multiple URLs are present and crawling hasn't finished. Also fixed a styling issue in CrawlingProgressCard.tsx: when batching code examples, the batching progress bar would sometimes glitch out of the UI; it no longer does.

* Fixed a few things so it will work with the current branch!

* Added some enhancements to UI rendering as well and other little misc.
fixes from code rabbit --------- Co-authored-by: Chillbruhhh Co-authored-by: Claude Code --- .../services/crawling/crawling_service.py | 87 +++++++- .../services/crawling/helpers/url_handler.py | 191 +++++++++++++++++- .../contextual_embedding_service.py | 2 +- .../services/storage/code_storage_service.py | 7 + .../storage/document_storage_service.py | 17 +- 5 files changed, 289 insertions(+), 15 deletions(-) diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index cddb331..e85c1fa 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -526,6 +526,40 @@ class CrawlingService: f"Unregistered orchestration service on error | progress_id={self.progress_id}" ) + def _is_self_link(self, link: str, base_url: str) -> bool: + """ + Check if a link is a self-referential link to the base URL. + Handles query parameters, fragments, trailing slashes, and normalizes + scheme/host/ports for accurate comparison. + + Args: + link: The link to check + base_url: The base URL to compare against + + Returns: + True if the link is self-referential, False otherwise + """ + try: + from urllib.parse import urlparse + + def _core(u: str) -> str: + p = urlparse(u) + scheme = (p.scheme or "http").lower() + host = (p.hostname or "").lower() + port = p.port + if (scheme == "http" and port in (None, 80)) or (scheme == "https" and port in (None, 443)): + port_part = "" + else: + port_part = f":{port}" if port else "" + path = p.path.rstrip("/") + return f"{scheme}://{host}{port_part}{path}" + + return _core(link) == _core(base_url) + except Exception as e: + logger.warning(f"Error checking if link is self-referential: {e}", exc_info=True) + # Fallback to simple string comparison + return link.rstrip('/') == base_url.rstrip('/') + async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple: """ Detect URL type and perform appropriate crawling. @@ -536,8 +570,8 @@ class CrawlingService: crawl_results = [] crawl_type = None - if self.url_handler.is_txt(url): - # Handle text files + if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url): + # Handle text files crawl_type = "llms-txt" if "llms" in url.lower() else "text_file" if self.progress_tracker: await self.progress_tracker.update( @@ -550,9 +584,54 @@ class CrawlingService: crawl_results = await self.crawl_markdown_file( url, progress_callback=await self._create_crawl_progress_callback("crawling"), - start_progress=10, - end_progress=20, + start_progress=5, + end_progress=10, ) + # Check if this is a link collection file and extract links + if crawl_results and len(crawl_results) > 0: + content = crawl_results[0].get('markdown', '') + if self.url_handler.is_link_collection_file(url, content): + # Extract links from the content + extracted_links = self.url_handler.extract_markdown_links(content, url) + + # Filter out self-referential links to avoid redundant crawling + if extracted_links: + original_count = len(extracted_links) + extracted_links = [ + link for link in extracted_links + if not self._is_self_link(link, url) + ] + self_filtered_count = original_count - len(extracted_links) + if self_filtered_count > 0: + logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links") + + # Filter out binary files (PDFs, images, archives, etc.) 
to avoid wasteful crawling + if extracted_links: + original_count = len(extracted_links) + extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)] + filtered_count = original_count - len(extracted_links) + if filtered_count > 0: + logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links") + + if extracted_links: + # Crawl the extracted links using batch crawling + logger.info(f"Crawling {len(extracted_links)} extracted links from {url}") + batch_results = await self.crawl_batch_with_progress( + extracted_links, + max_concurrent=request.get('max_concurrent'), # None -> use DB settings + progress_callback=await self._create_crawl_progress_callback("crawling"), + start_progress=10, + end_progress=20, + ) + + # Combine original text file results with batch results + crawl_results.extend(batch_results) + crawl_type = "link_collection_with_crawled_links" + + logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)") + else: + logger.info(f"No valid links found in link collection file: {url}") + logger.info(f"Text file crawling completed: {len(crawl_results)} results") elif self.url_handler.is_sitemap(url): # Handle sitemaps diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index 19d9535..33c75c5 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -6,7 +6,8 @@ Handles URL transformations and validations. import hashlib import re -from urllib.parse import urlparse +from urllib.parse import urlparse, urljoin +from typing import List, Optional from ....config.logfire_config import get_logger @@ -32,6 +33,26 @@ class URLHandler: except Exception as e: logger.warning(f"Error checking if URL is sitemap: {e}") return False + + @staticmethod + def is_markdown(url: str) -> bool: + """ + Check if a URL points to a markdown file (.md, .mdx, .markdown). + + Args: + url: URL to check + + Returns: + True if URL is a markdown file, False otherwise + """ + try: + parsed = urlparse(url) + # Normalize to lowercase and ignore query/fragment + path = parsed.path.lower() + return path.endswith(('.md', '.mdx', '.markdown')) + except Exception as e: + logger.warning(f"Error checking if URL is markdown file: {e}", exc_info=True) + return False @staticmethod def is_txt(url: str) -> bool: @@ -45,9 +66,11 @@ class URLHandler: True if URL is a text file, False otherwise """ try: - return url.endswith(".txt") + parsed = urlparse(url) + # Normalize to lowercase and ignore query/fragment + return parsed.path.lower().endswith('.txt') except Exception as e: - logger.warning(f"Error checking if URL is text file: {e}") + logger.warning(f"Error checking if URL is text file: {e}", exc_info=True) return False @staticmethod @@ -240,7 +263,7 @@ class URLHandler: return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16] except Exception as e: - # Redact sensitive query params from error logs + # Redacted sensitive query params from error logs try: redacted = url.split("?", 1)[0] if "?" 
in url else url except Exception: @@ -251,6 +274,166 @@ class URLHandler: # Fallback: use a hash of the error message + url to still get something unique fallback = f"error_{redacted}_{str(e)}" return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16] + + @staticmethod + def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]: + """ + Extract markdown-style links from text content. + + Args: + content: Text content to extract links from + base_url: Base URL to resolve relative links against + + Returns: + List of absolute URLs found in the content + """ + try: + if not content: + return [] + + # Ultimate URL pattern with comprehensive format support: + # 1) [text](url) - markdown links + # 2) - autolinks + # 3) https://... - bare URLs with protocol + # 4) //example.com - protocol-relative URLs + # 5) www.example.com - scheme-less www URLs + combined_pattern = re.compile( + r'\[(?P[^\]]*)\]\((?P[^)]+)\)' # named: md + r'|<\s*(?Phttps?://[^>\s]+)\s*>' # named: auto + r'|(?Phttps?://[^\s<>()\[\]"]+)' # named: bare + r'|(?P//[^\s<>()\[\]"]+)' # named: protocol-relative + r'|(?Pwww\.[^\s<>()\[\]"]+)' # named: www.* without scheme + ) + + def _clean_url(u: str) -> str: + # Trim whitespace and comprehensive trailing punctuation + # Also remove invisible Unicode characters that can break URLs + import unicodedata + cleaned = u.strip().rstrip('.,;:)]>') + # Remove invisible/control characters but keep valid URL characters + cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc')) + return cleaned + + urls = [] + for match in re.finditer(combined_pattern, content): + url = ( + match.group('md') + or match.group('auto') + or match.group('bare') + or match.group('proto') + or match.group('www') + ) + if not url: + continue + url = _clean_url(url) + + # Skip empty URLs, anchors, and mailto links + if not url or url.startswith('#') or url.startswith('mailto:'): + continue + + # Normalize all URL formats to https:// + if url.startswith('//'): + url = f'https:{url}' + elif url.startswith('www.'): + url = f'https://{url}' + + # Convert relative URLs to absolute if base_url provided + if base_url and not url.startswith(('http://', 'https://')): + try: + url = urljoin(base_url, url) + except Exception as e: + logger.warning(f"Failed to resolve relative URL {url} with base {base_url}: {e}") + continue + + # Only include HTTP/HTTPS URLs + if url.startswith(('http://', 'https://')): + urls.append(url) + + # Remove duplicates while preserving order + seen = set() + unique_urls = [] + for url in urls: + if url not in seen: + seen.add(url) + unique_urls.append(url) + + logger.info(f"Extracted {len(unique_urls)} unique links from content") + return unique_urls + + except Exception as e: + logger.error(f"Error extracting markdown links: {e}", exc_info=True) + return [] + + @staticmethod + def is_link_collection_file(url: str, content: Optional[str] = None) -> bool: + """ + Check if a URL/file appears to be a link collection file like llms.txt. 
+ + Args: + url: URL to check + content: Optional content to analyze for link density + + Returns: + True if file appears to be a link collection, False otherwise + """ + try: + # Extract filename from URL + parsed = urlparse(url) + filename = parsed.path.split('/')[-1].lower() + + # Check for specific link collection filenames + # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links + link_collection_patterns = [ + # .txt variants - files that typically contain lists of links + 'llms.txt', 'links.txt', 'resources.txt', 'references.txt', + # .md/.mdx/.markdown variants + 'llms.md', 'links.md', 'resources.md', 'references.md', + 'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx', + 'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown', + ] + + # Direct filename match + if filename in link_collection_patterns: + logger.info(f"Detected link collection file by filename: {filename}") + return True + + # Pattern-based detection for variations, but exclude "full" variants + # Only match files that are likely link collections, not complete content files + if filename.endswith(('.txt', '.md', '.mdx', '.markdown')): + # Exclude files with "full" in the name - these typically contain complete content, not just links + if 'full' not in filename: + # Match files that start with common link collection prefixes + base_patterns = ['llms', 'links', 'resources', 'references'] + if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns): + logger.info(f"Detected potential link collection file: {filename}") + return True + + # Content-based detection if content is provided + if content: + # Never treat "full" variants as link collections to preserve single-page behavior + if 'full' in filename: + logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}") + return False + # Reuse extractor to avoid regex divergence and maintain consistency + extracted_links = URLHandler.extract_markdown_links(content, url) + total_links = len(extracted_links) + + # Calculate link density (links per 100 characters) + content_length = len(content.strip()) + if content_length > 0: + link_density = (total_links * 100) / content_length + + # If more than 2% of content is links, likely a link collection + if link_density > 2.0 and total_links > 3: + logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%") + return True + + return False + + except Exception as e: + logger.warning(f"Error checking if file is link collection: {e}", exc_info=True) + return False + @staticmethod def extract_display_name(url: str) -> str: diff --git a/python/src/server/services/embeddings/contextual_embedding_service.py b/python/src/server/services/embeddings/contextual_embedding_service.py index 7469d5a..e72d81a 100644 --- a/python/src/server/services/embeddings/contextual_embedding_service.py +++ b/python/src/server/services/embeddings/contextual_embedding_service.py @@ -219,4 +219,4 @@ async def generate_contextual_embeddings_batch( except Exception as e: search_logger.error(f"Error in contextual embedding batch: {e}") # Return non-contextual for all chunks - return [(chunk, False) for chunk in chunks] + return [(chunk, False) for chunk in chunks] \ No newline at end of file diff --git a/python/src/server/services/storage/code_storage_service.py b/python/src/server/services/storage/code_storage_service.py index 
cacc7d7..e987939 100644 --- a/python/src/server/services/storage/code_storage_service.py +++ b/python/src/server/services/storage/code_storage_service.py @@ -955,6 +955,10 @@ async def add_code_examples_to_supabase( "status": "code_storage", "percentage": progress_percentage, "log": f"Stored batch {batch_num}/{total_batches} of code examples", + # Stage-specific batch fields to prevent contamination with document storage + "code_current_batch": batch_num, + "code_total_batches": total_batches, + # Keep generic fields for backward compatibility "batch_number": batch_num, "total_batches": total_batches, }) @@ -966,4 +970,7 @@ async def add_code_examples_to_supabase( "percentage": 100, "log": f"Code storage completed. Stored {total_items} code examples.", "total_items": total_items, + # Keep final batch info for code storage completion + "code_total_batches": (total_items + batch_size - 1) // batch_size, + "code_current_batch": (total_items + batch_size - 1) // batch_size, }) diff --git a/python/src/server/services/storage/document_storage_service.py b/python/src/server/services/storage/document_storage_service.py index 4fc07a1..392394e 100644 --- a/python/src/server/services/storage/document_storage_service.py +++ b/python/src/server/services/storage/document_storage_service.py @@ -349,6 +349,11 @@ async def add_documents_to_supabase( # Simple batch completion info batch_info = { + # Stage-specific batch fields to prevent contamination with code examples + "document_completed_batches": completed_batches, + "document_total_batches": total_batches, + "document_current_batch": batch_num, + # Keep generic fields for backward compatibility "completed_batches": completed_batches, "total_batches": total_batches, "current_batch": batch_num, @@ -401,12 +406,12 @@ async def add_documents_to_supabase( "document_storage", 100, # Ensure we report 100% f"Document storage completed: {len(contents)} chunks stored in {total_batches} batches", - completed_batches=total_batches, - total_batches=total_batches, - current_batch=total_batches, - chunks_processed=len(contents), - # DON'T send 'status': 'completed' - that's for the orchestration service only! - ) + completed_batches=total_batches, + total_batches=total_batches, + current_batch=total_batches, + chunks_processed=len(contents), + # DON'T send 'status': 'completed' - that's for the orchestration service only! + ) except Exception as e: search_logger.warning(f"Progress callback failed during completion: {e}. Storage still successful.")
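Not part of the patch above: a standalone restatement of the normalization that the new CrawlingService._is_self_link helper applies, just to show which extracted links count as self-referential and get skipped before batch crawling. The example URLs are made up.

from urllib.parse import urlparse

def core(u: str) -> str:
    # Mirrors the patch: lowercase scheme/host, drop default ports, strip trailing slash,
    # and keep only the path (query and fragment are ignored).
    p = urlparse(u)
    scheme = (p.scheme or "http").lower()
    host = (p.hostname or "").lower()
    port = p.port
    if (scheme == "http" and port in (None, 80)) or (scheme == "https" and port in (None, 443)):
        port_part = ""
    else:
        port_part = f":{port}" if port else ""
    return f"{scheme}://{host}{port_part}{p.path.rstrip('/')}"

base = "https://example.com/llms.txt"
print(core("https://EXAMPLE.com:443/llms.txt/") == core(base))  # True: case, default port, trailing slash ignored
print(core("https://example.com/llms.txt#top") == core(base))   # True: fragments/queries are dropped
print(core("http://example.com/llms.txt") == core(base))        # False: different scheme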
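A minimal usage sketch for the new URLHandler helpers. The import path is inferred from the file layout in this diff and may need adjusting in the real repo; the sample content is invented.

from src.server.services.crawling.helpers.url_handler import URLHandler

sample = (
    "# Example llms.txt\n"
    "- [Docs](https://example.com/docs)\n"
    "- [API reference](/api/reference)\n"
    "- <https://example.com/changelog>\n"
    "- see also www.example.com/blog\n"
)

links = URLHandler.extract_markdown_links(sample, base_url="https://example.com/llms.txt")
print(links)
# Expected per the regex and normalization above: four absolute https URLs, with the relative
# /api/reference resolved against the base URL and duplicates removed in order.

print(URLHandler.is_link_collection_file("https://example.com/llms.txt", sample))       # True: filename match
print(URLHandler.is_link_collection_file("https://example.com/llms-full.txt", sample))  # False: "full" variants stay single-page

When the filename alone isn't conclusive, the content heuristic decides: with the patch's formula, 25 extracted links in 1,000 characters of content gives a density of (25 * 100) / 1000 = 2.5, which clears the 2.0 threshold and the >3 link minimum, so the file is treated as a link collection.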
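The commit message describes fallback logic for newer models that want max_completion_tokens and the default temperature; that change was later pulled out of this PR as a separate issue. A minimal sketch of the kind of fallback described, assuming a recent version of the official openai Python client; the model name and token limit are placeholders.

from openai import AsyncOpenAI, BadRequestError

client = AsyncOpenAI()

async def complete_with_fallback(messages, model="gpt-4.1-nano", max_tokens=200):
    try:
        # Older chat models accept max_tokens and a custom temperature.
        return await client.chat.completions.create(
            model=model, messages=messages, max_tokens=max_tokens, temperature=0.3
        )
    except BadRequestError as e:
        msg = str(e)
        if "max_tokens" in msg or "temperature" in msg:
            # Reasoning-style models: retry with max_completion_tokens and no temperature override.
            return await client.chat.completions.create(
                model=model, messages=messages, max_completion_tokens=max_tokens
            )
        raise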
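A hypothetical consumer-side helper (not in this patch) illustrating why the stage-specific batch fields matter: the code-example and document-storage stages no longer overwrite each other's generic batch counters in a shared progress payload, while the generic keys remain as a fallback.

def batch_fields(progress: dict, stage: str) -> tuple:
    # Prefer the stage-specific keys added in this patch, falling back to the generic
    # keys each stage already emitted for backward compatibility.
    if stage == "code_storage":
        current = progress.get("code_current_batch", progress.get("batch_number"))
        total = progress.get("code_total_batches", progress.get("total_batches"))
    else:  # document storage
        current = progress.get("document_current_batch", progress.get("current_batch"))
        total = progress.get("document_total_batches", progress.get("total_batches"))
    return current, total

update = {"status": "code_storage", "code_current_batch": 2, "code_total_batches": 5,
          "batch_number": 2, "total_batches": 5}
print(batch_fields(update, "code_storage"))  # (2, 5)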