Fix: llms.txt not crawling links inside the file (#437)

* Fixed the llms.txt / llms-full.txt / llms.md etc. handling so these files are finally crawled properly. The crawler now intelligently determines whether an llms.txt contains links and crawls them as it should. Fully tested; everything works!
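
  A minimal sketch of the new behavior, using the URLHandler helpers added in the diff below (the sample llms.txt content and the import path are made up for illustration):

    # Illustration only: sample content is made up and the import path is assumed.
    from src.server.services.crawling.helpers.url_handler import URLHandler  # path assumed

    llms_txt = """
    # Example Docs
    - [Quickstart](https://example.com/docs/quickstart)
    - [API Reference](https://example.com/docs/api)
    See also <https://example.com/changelog> and www.example.com/blog
    """

    base_url = "https://example.com/llms.txt"
    if URLHandler.is_link_collection_file(base_url, llms_txt):
        links = URLHandler.extract_markdown_links(llms_txt, base_url)
        # Markdown links, autolinks, and www.* URLs are normalized to absolute
        # https URLs, which the crawler then feeds into crawl_batch_with_progress().
        print(links)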

* Updated per CodeRabbit's suggestion - resolved.

* Refined per CodeRabbit's suggestions, take 2; this should be the final take. Did not add the max-link parameter suggestion, though.

* Third time's the charm: addressed a nitpicky point from CodeRabbit. CodeRabbit makes me crave nicotine.

* Fixed progress bar accuracy and OpenAI API compatibility issues

  Changes Made:

  1. Progress Bar Fix: Fixed llms.txt crawling progress jumping to 90% and then regressing to 45% by adjusting the batch crawling progress ranges (20-30% instead of 40-90%) and using consistent ProgressMapper ranges (see the sketch after this list)
  2. OpenAI API Compatibility: Added robust fallback logic in contextual embedding service to handle newer models (GPT-5) that require max_completion_tokens instead of max_tokens and don't support custom temperature values
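
  As a rough illustration of the progress-range fix in item 1: the ProgressMapper internals are not reproduced here, so this is a hypothetical sketch of linear stage-to-overall mapping, not the project's actual implementation.

    # Hypothetical sketch: map a stage-local 0-100% into its slice of the overall bar.
    def map_stage_progress(stage_pct: float, start: float, end: float) -> float:
        stage_pct = max(0.0, min(100.0, stage_pct))
        return start + (end - start) * (stage_pct / 100.0)

    # With the old 40-90% crawling range, a finished crawl pushed the bar to 90%,
    # and the storage stage (resuming around 45%) appeared to regress it.
    # With the narrower 20-30% range, the overall bar only moves forward:
    print(map_stage_progress(100, 20, 30))  # 30.0 -> storage then continues from 35%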

  Files Modified:

  - src/server/services/crawling/crawling_service.py - Fixed progress ranges
  - src/server/services/crawling/progress_mapper.py - Restored original stage ranges
  - src/server/services/embeddings/contextual_embedding_service.py - Added fallback API logic

  Result:

  - Progress bar now smoothly progresses 0→30% (crawling) → 35-80% (storage) → 100%
  - Automatic compatibility with both old (GPT-4.1-nano) and new (GPT-5-nano) OpenAI models
  - Eliminates "max_tokens not supported" and "temperature not supported" errors

* Removed the GPT-5 handling since that is a separate issue and does not pertain to this PR. I definitely recommend looking into it, though, since gpt-5-nano is considered a reasoning model, does not use max_tokens, and requires a different output parameter. Also removed my upsert fix from document storage since it is not part of this exact issue and I have another PR open for it. Checked with CodeRabbit in my IDE: no issues, no nitpicks. Should be good? It might flag the UPSERT logic not being in here; oh well, that has nothing to do with this PR and was only included in the last revision by mistake. Everything is tested and good to go!
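
  For reference only, a rough sketch of the kind of fallback the removed change implemented (this is not code from this PR; the model name, token limit, and error-message checks are placeholders/assumptions):

    # Hypothetical sketch: retry without max_tokens / custom temperature when a
    # newer model rejects the legacy parameters.
    from openai import OpenAI

    client = OpenAI()

    def complete(prompt: str, model: str = "gpt-5-nano") -> str:
        try:
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=200,
                temperature=0.2,
            )
        except Exception as e:
            # Assumed error shape: reasoning models reject max_tokens / custom temperature.
            if "max_tokens" in str(e) or "temperature" in str(e):
                resp = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                    max_completion_tokens=200,
                )
            else:
                raise
        return resp.choices[0].message.content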

* Fixed the llms-full.txt crawling issue: the crawler now crawls just that page when crawling llms-full.txt. Fixed the crawl showing 100% when multiple URLs are present and crawling has not finished. Also fixed a styling issue in CrawlingProgressCard.tsx: when batching code examples, the batching progress bar would sometimes glitch out of the UI; it no longer does that.
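
  A quick sketch of the filename distinction behind this, using the URLHandler helper shown in the diff below (import path assumed):

    from src.server.services.crawling.helpers.url_handler import URLHandler  # path assumed

    URLHandler.is_link_collection_file("https://example.com/llms.txt")       # True: filename matches a link-collection pattern
    URLHandler.is_link_collection_file("https://example.com/llms-full.txt")  # False: 'full' variants keep single-page behavior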

* Fixed a few things so it will work with the current branch!

* Added some enhancements to UI rendering as well, and other little misc fixes from CodeRabbit.

---------

Co-authored-by: Chillbruhhh <joshchesser97@gmail.com>
Co-authored-by: Claude Code <claude@anthropic.com>
Josh 2025-09-06 14:33:00 -05:00 committed by GitHub
parent 52ee5e2c2a
commit 8172067413
5 changed files with 289 additions and 15 deletions


@@ -526,6 +526,40 @@ class CrawlingService:
f"Unregistered orchestration service on error | progress_id={self.progress_id}"
)
def _is_self_link(self, link: str, base_url: str) -> bool:
"""
Check if a link is a self-referential link to the base URL.
Handles query parameters, fragments, trailing slashes, and normalizes
scheme/host/ports for accurate comparison.
Args:
link: The link to check
base_url: The base URL to compare against
Returns:
True if the link is self-referential, False otherwise
"""
try:
from urllib.parse import urlparse
def _core(u: str) -> str:
p = urlparse(u)
scheme = (p.scheme or "http").lower()
host = (p.hostname or "").lower()
port = p.port
if (scheme == "http" and port in (None, 80)) or (scheme == "https" and port in (None, 443)):
port_part = ""
else:
port_part = f":{port}" if port else ""
path = p.path.rstrip("/")
return f"{scheme}://{host}{port_part}{path}"
return _core(link) == _core(base_url)
except Exception as e:
logger.warning(f"Error checking if link is self-referential: {e}", exc_info=True)
# Fallback to simple string comparison
return link.rstrip('/') == base_url.rstrip('/')
async def _crawl_by_url_type(self, url: str, request: dict[str, Any]) -> tuple:
"""
Detect URL type and perform appropriate crawling.
@@ -536,7 +570,7 @@ class CrawlingService:
crawl_results = []
crawl_type = None
if self.url_handler.is_txt(url):
if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url):
# Handle text files
crawl_type = "llms-txt" if "llms" in url.lower() else "text_file"
if self.progress_tracker:
@@ -550,9 +584,54 @@ class CrawlingService:
crawl_results = await self.crawl_markdown_file(
url,
progress_callback=await self._create_crawl_progress_callback("crawling"),
start_progress=10,
end_progress=20,
start_progress=5,
end_progress=10,
)
# Check if this is a link collection file and extract links
if crawl_results and len(crawl_results) > 0:
content = crawl_results[0].get('markdown', '')
if self.url_handler.is_link_collection_file(url, content):
# Extract links from the content
extracted_links = self.url_handler.extract_markdown_links(content, url)
# Filter out self-referential links to avoid redundant crawling
if extracted_links:
original_count = len(extracted_links)
extracted_links = [
link for link in extracted_links
if not self._is_self_link(link, url)
]
self_filtered_count = original_count - len(extracted_links)
if self_filtered_count > 0:
logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")
# Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
if extracted_links:
original_count = len(extracted_links)
extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)]
filtered_count = original_count - len(extracted_links)
if filtered_count > 0:
logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")
if extracted_links:
# Crawl the extracted links using batch crawling
logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
batch_results = await self.crawl_batch_with_progress(
extracted_links,
max_concurrent=request.get('max_concurrent'), # None -> use DB settings
progress_callback=await self._create_crawl_progress_callback("crawling"),
start_progress=10,
end_progress=20,
)
# Combine original text file results with batch results
crawl_results.extend(batch_results)
crawl_type = "link_collection_with_crawled_links"
logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
else:
logger.info(f"No valid links found in link collection file: {url}")
logger.info(f"Text file crawling completed: {len(crawl_results)} results")
elif self.url_handler.is_sitemap(url):
# Handle sitemaps


@@ -6,7 +6,8 @@ Handles URL transformations and validations.
import hashlib
import re
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
from typing import List, Optional
from ....config.logfire_config import get_logger
@@ -33,6 +34,26 @@ class URLHandler:
logger.warning(f"Error checking if URL is sitemap: {e}")
return False
@staticmethod
def is_markdown(url: str) -> bool:
"""
Check if a URL points to a markdown file (.md, .mdx, .markdown).
Args:
url: URL to check
Returns:
True if URL is a markdown file, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
return path.endswith(('.md', '.mdx', '.markdown'))
except Exception as e:
logger.warning(f"Error checking if URL is markdown file: {e}", exc_info=True)
return False
@staticmethod
def is_txt(url: str) -> bool:
"""
@@ -45,9 +66,11 @@ class URLHandler:
True if URL is a text file, False otherwise
"""
try:
return url.endswith(".txt")
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
return parsed.path.lower().endswith('.txt')
except Exception as e:
logger.warning(f"Error checking if URL is text file: {e}")
logger.warning(f"Error checking if URL is text file: {e}", exc_info=True)
return False
@staticmethod
@@ -240,7 +263,7 @@ class URLHandler:
return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
except Exception as e:
# Redact sensitive query params from error logs
# Redacted sensitive query params from error logs
try:
redacted = url.split("?", 1)[0] if "?" in url else url
except Exception:
@@ -252,6 +275,166 @@ class URLHandler:
fallback = f"error_{redacted}_{str(e)}"
return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]
@staticmethod
def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
"""
Extract markdown-style links from text content.
Args:
content: Text content to extract links from
base_url: Base URL to resolve relative links against
Returns:
List of absolute URLs found in the content
"""
try:
if not content:
return []
# Ultimate URL pattern with comprehensive format support:
# 1) [text](url) - markdown links
# 2) <https://...> - autolinks
# 3) https://... - bare URLs with protocol
# 4) //example.com - protocol-relative URLs
# 5) www.example.com - scheme-less www URLs
combined_pattern = re.compile(
r'\[(?P<text>[^\]]*)\]\((?P<md>[^)]+)\)' # named: md
r'|<\s*(?P<auto>https?://[^>\s]+)\s*>' # named: auto
r'|(?P<bare>https?://[^\s<>()\[\]"]+)' # named: bare
r'|(?P<proto>//[^\s<>()\[\]"]+)' # named: protocol-relative
r'|(?P<www>www\.[^\s<>()\[\]"]+)' # named: www.* without scheme
)
def _clean_url(u: str) -> str:
# Trim whitespace and comprehensive trailing punctuation
# Also remove invisible Unicode characters that can break URLs
import unicodedata
cleaned = u.strip().rstrip('.,;:)]>')
# Remove invisible/control characters but keep valid URL characters
cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc'))
return cleaned
urls = []
for match in re.finditer(combined_pattern, content):
url = (
match.group('md')
or match.group('auto')
or match.group('bare')
or match.group('proto')
or match.group('www')
)
if not url:
continue
url = _clean_url(url)
# Skip empty URLs, anchors, and mailto links
if not url or url.startswith('#') or url.startswith('mailto:'):
continue
# Normalize all URL formats to https://
if url.startswith('//'):
url = f'https:{url}'
elif url.startswith('www.'):
url = f'https://{url}'
# Convert relative URLs to absolute if base_url provided
if base_url and not url.startswith(('http://', 'https://')):
try:
url = urljoin(base_url, url)
except Exception as e:
logger.warning(f"Failed to resolve relative URL {url} with base {base_url}: {e}")
continue
# Only include HTTP/HTTPS URLs
if url.startswith(('http://', 'https://')):
urls.append(url)
# Remove duplicates while preserving order
seen = set()
unique_urls = []
for url in urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)
logger.info(f"Extracted {len(unique_urls)} unique links from content")
return unique_urls
except Exception as e:
logger.error(f"Error extracting markdown links: {e}", exc_info=True)
return []
@staticmethod
def is_link_collection_file(url: str, content: Optional[str] = None) -> bool:
"""
Check if a URL/file appears to be a link collection file like llms.txt.
Args:
url: URL to check
content: Optional content to analyze for link density
Returns:
True if file appears to be a link collection, False otherwise
"""
try:
# Extract filename from URL
parsed = urlparse(url)
filename = parsed.path.split('/')[-1].lower()
# Check for specific link collection filenames
# Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
link_collection_patterns = [
# .txt variants - files that typically contain lists of links
'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
# .md/.mdx/.markdown variants
'llms.md', 'links.md', 'resources.md', 'references.md',
'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
]
# Direct filename match
if filename in link_collection_patterns:
logger.info(f"Detected link collection file by filename: {filename}")
return True
# Pattern-based detection for variations, but exclude "full" variants
# Only match files that are likely link collections, not complete content files
if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
# Exclude files with "full" in the name - these typically contain complete content, not just links
if 'full' not in filename:
# Match files that start with common link collection prefixes
base_patterns = ['llms', 'links', 'resources', 'references']
if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
logger.info(f"Detected potential link collection file: {filename}")
return True
# Content-based detection if content is provided
if content:
# Never treat "full" variants as link collections to preserve single-page behavior
if 'full' in filename:
logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
return False
# Reuse extractor to avoid regex divergence and maintain consistency
extracted_links = URLHandler.extract_markdown_links(content, url)
total_links = len(extracted_links)
# Calculate link density (links per 100 characters)
content_length = len(content.strip())
if content_length > 0:
link_density = (total_links * 100) / content_length
# If more than 2% of content is links, likely a link collection
if link_density > 2.0 and total_links > 3:
logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
return True
return False
except Exception as e:
logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
return False
@staticmethod
def extract_display_name(url: str) -> str:
"""


@@ -955,6 +955,10 @@ async def add_code_examples_to_supabase(
"status": "code_storage",
"percentage": progress_percentage,
"log": f"Stored batch {batch_num}/{total_batches} of code examples",
# Stage-specific batch fields to prevent contamination with document storage
"code_current_batch": batch_num,
"code_total_batches": total_batches,
# Keep generic fields for backward compatibility
"batch_number": batch_num,
"total_batches": total_batches,
})
@@ -966,4 +970,7 @@ async def add_code_examples_to_supabase(
"percentage": 100,
"log": f"Code storage completed. Stored {total_items} code examples.",
"total_items": total_items,
# Keep final batch info for code storage completion
"code_total_batches": (total_items + batch_size - 1) // batch_size,
"code_current_batch": (total_items + batch_size - 1) // batch_size,
})


@@ -349,6 +349,11 @@ async def add_documents_to_supabase(
# Simple batch completion info
batch_info = {
# Stage-specific batch fields to prevent contamination with code examples
"document_completed_batches": completed_batches,
"document_total_batches": total_batches,
"document_current_batch": batch_num,
# Keep generic fields for backward compatibility
"completed_batches": completed_batches,
"total_batches": total_batches,
"current_batch": batch_num,
@@ -401,12 +406,12 @@ async def add_documents_to_supabase(
"document_storage",
100, # Ensure we report 100%
f"Document storage completed: {len(contents)} chunks stored in {total_batches} batches",
completed_batches=total_batches,
total_batches=total_batches,
current_batch=total_batches,
chunks_processed=len(contents),
# DON'T send 'status': 'completed' - that's for the orchestration service only!
)
completed_batches=total_batches,
total_batches=total_batches,
current_batch=total_batches,
chunks_processed=len(contents),
# DON'T send 'status': 'completed' - that's for the orchestration service only!
)
except Exception as e:
search_logger.warning(f"Progress callback failed during completion: {e}. Storage still successful.")