From cadda22d222394670e9cc06091002698831bbd23 Mon Sep 17 00:00:00 2001 From: DIY Smart Code Date: Sat, 6 Sep 2025 12:27:17 +0200 Subject: [PATCH] feat: Document Browser with Domain Filtering (Updated Architecture) (#564) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Add DocumentBrowser with domain filtering (updated for latest architecture) - Add DocumentBrowser component with two-column layout - Add domain filtering and search functionality - Add chunks API endpoint for browsing document content - Add clickable page count badge to open browser - Integrate with latest HTTP polling architecture - Add service method for fetching chunks with domain filtering - Compatible with new modular component structure 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude * fix: Apply CodeRabbit suggestions for domain filtering and API reliability - Preserve subdomains in domain extraction (docs.anthropic.com vs anthropic.com) - Add deterministic ordering to API queries for stable chunk lists - Use case-insensitive domain filtering with ilike - Add explicit Supabase error handling to prevent silent failures 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude * Update document browser branch for main branch compatibility - Add TanStack Query package dependencies - Add getKnowledgeItemChunks service method for DocumentBrowser - Add minimal feature components for build compatibility - Ensure document browser functionality works with latest architecture - Maintain clickable page count badges and document browsing modal Document browser is now ready for use with modernized Archon codebase. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --------- Co-authored-by: Claude --- archon-ui-main/package-lock.json | 78 +++-- .../knowledge-base/DocumentBrowser.tsx | 319 ++++++++++++++++++ .../knowledge-base/KnowledgeItemCard.tsx | 22 +- .../src/pages/KnowledgeBasePage.tsx | 23 ++ .../src/services/knowledgeBaseService.ts | 30 ++ python/src/server/api_routes/knowledge_api.py | 49 +++ 6 files changed, 487 insertions(+), 34 deletions(-) create mode 100644 archon-ui-main/src/components/knowledge-base/DocumentBrowser.tsx diff --git a/archon-ui-main/package-lock.json b/archon-ui-main/package-lock.json index cb8e8f3..7d36713 100644 --- a/archon-ui-main/package-lock.json +++ b/archon-ui-main/package-lock.json @@ -3735,9 +3735,9 @@ "license": "MIT" }, "node_modules/@tanstack/query-core": { - "version": "5.85.7", - "resolved": "https://registry.npmjs.org/@tanstack/query-core/-/query-core-5.85.7.tgz", - "integrity": "sha512-FLT3EtuTbXBmOrDku4bI80Eivmjn/o/Zc1lVEd/6yzR8UAUSnDwYiwghCZvLqHyGSN5mO35ux1yPGMFYBFRSwA==", + "version": "5.87.0", + "resolved": "https://registry.npmjs.org/@tanstack/query-core/-/query-core-5.87.0.tgz", + "integrity": "sha512-gRZig2csRl71i/HEAHlE9TOmMqKKs9WkMAqIUlzagH+sNtgjvqxwaVo2HmfNGe+iDWUak0ratSkiRv0m/Y8ijg==", "license": "MIT", "funding": { "type": "github", @@ -3745,9 +3745,9 @@ } }, "node_modules/@tanstack/query-devtools": { - "version": "5.84.0", - "resolved": "https://registry.npmjs.org/@tanstack/query-devtools/-/query-devtools-5.84.0.tgz", - "integrity": "sha512-fbF3n+z1rqhvd9EoGp5knHkv3p5B2Zml1yNRjh7sNXklngYI5RVIWUrUjZ1RIcEoscarUb0+bOvIs5x9dwzOXQ==", + "version": "5.86.0", + "resolved": "https://registry.npmjs.org/@tanstack/query-devtools/-/query-devtools-5.86.0.tgz", + "integrity": "sha512-/JDw9BP80eambEK/EsDMGAcsL2VFT+8F5KCOwierjPU7QP8Wt1GT32yJpn3qOinBM8/zS3Jy36+F0GiyJp411A==", "license": "MIT", "funding": { "type": "github", @@ -3755,12 +3755,12 @@ } }, "node_modules/@tanstack/react-query": { - "version": "5.85.8", - "resolved": "https://registry.npmjs.org/@tanstack/react-query/-/react-query-5.85.8.tgz", - "integrity": "sha512-r3rW55STAO03EJg5mrCVIJvaEK3oeHme5u7QovuRFIKRbEgTzTv2DPdenX46X+x56LsU3ree1N4rzI/+gJ7KEA==", + "version": "5.87.0", + "resolved": "https://registry.npmjs.org/@tanstack/react-query/-/react-query-5.87.0.tgz", + "integrity": "sha512-3uRCGHo7KWHl6h7ptzLd5CbrjTQP5Q/37aC1cueClkSN4t/OaNFmfGolgs1AoA0kFjP/OZxTY2ytQoifyJzpWQ==", "license": "MIT", "dependencies": { - "@tanstack/query-core": "5.85.7" + "@tanstack/query-core": "5.87.0" }, "funding": { "type": "github", @@ -3771,19 +3771,19 @@ } }, "node_modules/@tanstack/react-query-devtools": { - "version": "5.85.8", - "resolved": "https://registry.npmjs.org/@tanstack/react-query-devtools/-/react-query-devtools-5.85.8.tgz", - "integrity": "sha512-83SXqRpmVlRMpaj32veez/8ohjY7O4VQIYDqW91b4i9AQjiYgE24FbBfR/SOL8b5MfKhHMZkD+BQSpCh9jY06w==", + "version": "5.87.0", + "resolved": "https://registry.npmjs.org/@tanstack/react-query-devtools/-/react-query-devtools-5.87.0.tgz", + "integrity": "sha512-OeOSKsPyLcTVLdn391iNeRqYFEmpYJrY9t+FjKpaC6ql0SyRu2XT3mKYJIfYczhMMlwOIlbJkNaifBveertV8Q==", "license": "MIT", "dependencies": { - "@tanstack/query-devtools": "5.84.0" + "@tanstack/query-devtools": "5.86.0" }, "funding": { "type": "github", "url": "https://github.com/sponsors/tannerlinsley" }, "peerDependencies": { - "@tanstack/react-query": "^5.85.8", + "@tanstack/react-query": "^5.87.0", "react": "^18 || ^19" } }, @@ -4572,9 +4572,9 @@ "license": "MIT" }, "node_modules/acorn": { - "version": "8.15.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", - "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", + "version": "8.14.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.14.1.tgz", + "integrity": "sha512-OvQ/2pUDKmgfCg++xsTX1wGxfTaszcHVcTctW4UJB4hibJx2HXxxO5UmVgyjMa+ZDsiaf5wWLXYpRWMmBI0QHg==", "license": "MIT", "bin": { "acorn": "bin/acorn" @@ -6004,7 +6004,20 @@ "eslint": ">=8.40" } }, - "node_modules/eslint-scope": { + "node_modules/eslint-visitor-keys": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz", + "integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/eslint/node_modules/eslint-scope": { "version": "7.2.2", "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-7.2.2.tgz", "integrity": "sha512-dOt21O7lTMhDM+X9mB4GX+DZrZtCUJPL/wlcTqxyrx5IvO0IYtILdtrQGQp+8n5S0gwSVmOf9NQrjMOgfQZlIg==", @@ -6021,17 +6034,14 @@ "url": "https://opencollective.com/eslint" } }, - "node_modules/eslint-visitor-keys": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz", - "integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==", + "node_modules/eslint/node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", "dev": true, - "license": "Apache-2.0", + "license": "BSD-2-Clause", "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" + "node": ">=4.0" } }, "node_modules/eslint/node_modules/globals": { @@ -6096,6 +6106,16 @@ "node": ">=0.10" } }, + "node_modules/esquery/node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=4.0" + } + }, "node_modules/esrecurse": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", @@ -6109,7 +6129,7 @@ "node": ">=4.0" } }, - "node_modules/estraverse": { + "node_modules/esrecurse/node_modules/estraverse": { "version": "5.3.0", "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", diff --git a/archon-ui-main/src/components/knowledge-base/DocumentBrowser.tsx b/archon-ui-main/src/components/knowledge-base/DocumentBrowser.tsx new file mode 100644 index 0000000..4373cc0 --- /dev/null +++ b/archon-ui-main/src/components/knowledge-base/DocumentBrowser.tsx @@ -0,0 +1,319 @@ +import React, { useState, useEffect, useMemo } from 'react'; +import { createPortal } from 'react-dom'; +import { Search, Filter, FileText, Globe, X } from 'lucide-react'; +import { motion, AnimatePresence } from 'framer-motion'; +import { Badge } from '../ui/Badge'; +import { Button } from '../ui/Button'; +import { knowledgeBaseService } from '../../services/knowledgeBaseService'; + +interface DocumentChunk { + id: string; + source_id: string; + content: string; + metadata?: any; + url?: string; +} + +interface DocumentBrowserProps { + sourceId: string; + isOpen: boolean; + onClose: () => void; +} + +const extractDomain = (url: string): string => { + try { + const urlObj = new URL(url); + const hostname = urlObj.hostname; + + // Remove 'www.' prefix if present + const withoutWww = hostname.startsWith('www.') ? hostname.slice(4) : hostname; + + // Keep full hostname (minus 'www.') to preserve subdomain-level filtering + return withoutWww; + } catch { + return url; // Return original if URL parsing fails + } +}; + +export const DocumentBrowser: React.FC = ({ + sourceId, + isOpen, + onClose, +}) => { + const [chunks, setChunks] = useState([]); + const [loading, setLoading] = useState(true); + const [searchQuery, setSearchQuery] = useState(''); + const [selectedDomain, setSelectedDomain] = useState('all'); + const [selectedChunkId, setSelectedChunkId] = useState(null); + const [error, setError] = useState(null); + + // Extract unique domains from chunks + const domains = useMemo(() => { + const domainSet = new Set(); + chunks.forEach(chunk => { + if (chunk.url) { + domainSet.add(extractDomain(chunk.url)); + } + }); + return Array.from(domainSet).sort(); + }, [chunks]); + + // Filter chunks based on search and domain + const filteredChunks = useMemo(() => { + return chunks.filter(chunk => { + // Search filter + const searchLower = searchQuery.toLowerCase(); + const searchMatch = !searchQuery || + chunk.content.toLowerCase().includes(searchLower) || + chunk.url?.toLowerCase().includes(searchLower); + + // Domain filter + const domainMatch = selectedDomain === 'all' || + (chunk.url && extractDomain(chunk.url) === selectedDomain); + + return searchMatch && domainMatch; + }); + }, [chunks, searchQuery, selectedDomain]); + + // Get selected chunk + const selectedChunk = useMemo(() => { + return filteredChunks.find(chunk => chunk.id === selectedChunkId) || filteredChunks[0]; + }, [filteredChunks, selectedChunkId]); + + // Load chunks when component opens + useEffect(() => { + if (isOpen && sourceId) { + loadChunks(); + } + }, [isOpen, sourceId]); + + const loadChunks = async () => { + try { + setLoading(true); + setError(null); + + const response = await knowledgeBaseService.getKnowledgeItemChunks(sourceId); + + if (response.success) { + setChunks(response.chunks); + // Auto-select first chunk if none selected + if (response.chunks.length > 0 && !selectedChunkId) { + setSelectedChunkId(response.chunks[0].id); + } + } else { + setError('Failed to load document chunks'); + } + } catch (error) { + console.error('Failed to load chunks:', error); + setError(error instanceof Error ? error.message : 'Failed to load document chunks'); + } finally { + setLoading(false); + } + }; + + const loadChunksWithDomainFilter = async (domain: string) => { + try { + setLoading(true); + setError(null); + + const domainFilter = domain === 'all' ? undefined : domain; + const response = await knowledgeBaseService.getKnowledgeItemChunks(sourceId, domainFilter); + + if (response.success) { + setChunks(response.chunks); + } else { + setError('Failed to load document chunks'); + } + } catch (error) { + console.error('Failed to load chunks with domain filter:', error); + setError(error instanceof Error ? error.message : 'Failed to load document chunks'); + } finally { + setLoading(false); + } + }; + + const handleDomainChange = (domain: string) => { + setSelectedDomain(domain); + // Note: We could reload with server-side filtering, but for now we'll do client-side filtering + // loadChunksWithDomainFilter(domain); + }; + + if (!isOpen) return null; + + return createPortal( + + e.stopPropagation()} + > + {/* Blue accent line at the top */} +
+ + {/* Sidebar */} +
+ {/* Sidebar Header */} +
+
+

+ Document Chunks ({(filteredChunks || []).length}) +

+
+ + {/* Search */} +
+ + setSearchQuery(e.target.value)} + className="w-full pl-10 pr-3 py-2 bg-gray-900/70 border border-gray-800 rounded-lg text-sm text-gray-300 placeholder-gray-600 focus:outline-none focus:border-blue-500/50 focus:ring-1 focus:ring-blue-500/20 transition-all" + /> +
+ + {/* Domain Filter */} +
+ + +
+
+ + {/* Document List */} +
+ {filteredChunks.length === 0 ? ( +
+ No documents found +
+ ) : ( + filteredChunks.map((chunk, index) => ( + + )) + )} +
+
+ + {/* Main Content Area */} +
+ {/* Header */} +
+
+

+ {selectedChunk ? `Document Chunk` : 'Document Browser'} +

+ {selectedChunk?.url && ( + + + {extractDomain(selectedChunk.url)} + + )} +
+ +
+ + {/* Content */} +
+ {loading ? ( +
+
+
+

Loading document chunks...

+
+
+ ) : !selectedChunk || filteredChunks.length === 0 ? ( +
+
+ +

Select a document chunk to view content

+
+
+ ) : ( +
+
+
+ {selectedChunk.url && ( +
+ {selectedChunk.url} +
+ )} + +
+
+ {selectedChunk.content || 'No content available'} +
+
+ + {selectedChunk.metadata && ( +
+
+ + View Metadata + +
+                            {JSON.stringify(selectedChunk.metadata, null, 2)}
+                          
+
+
+ )} +
+
+
+ )} +
+
+
+
, + document.body + ); +}; \ No newline at end of file diff --git a/archon-ui-main/src/components/knowledge-base/KnowledgeItemCard.tsx b/archon-ui-main/src/components/knowledge-base/KnowledgeItemCard.tsx index 349b6d2..0c6589d 100644 --- a/archon-ui-main/src/components/knowledge-base/KnowledgeItemCard.tsx +++ b/archon-ui-main/src/components/knowledge-base/KnowledgeItemCard.tsx @@ -129,6 +129,7 @@ interface KnowledgeItemCardProps { onDelete: (sourceId: string) => void; onUpdate?: () => void; onRefresh?: (sourceId: string) => void; + onBrowseDocuments?: (sourceId: string) => void; isSelectionMode?: boolean; isSelected?: boolean; onToggleSelection?: (event: React.MouseEvent) => void; @@ -139,6 +140,7 @@ export const KnowledgeItemCard = ({ onDelete, onUpdate, onRefresh, + onBrowseDocuments, isSelectionMode = false, isSelected = false, onToggleSelection @@ -454,13 +456,20 @@ export const KnowledgeItemCard = ({ )} - {/* Page count - orange neon container */} + {/* Page count - orange neon container (clickable for document browser) */}
{ + e.stopPropagation(); + if (onBrowseDocuments) { + onBrowseDocuments(item.source_id); + } + }} onMouseEnter={() => setShowPageTooltip(true)} onMouseLeave={() => setShowPageTooltip(false)} + title="Click to browse document chunks" > -
+
{Math.ceil( @@ -471,10 +480,13 @@ export const KnowledgeItemCard = ({ {/* Page count tooltip - positioned relative to the badge */} {showPageTooltip && (
-
- {(item.metadata.word_count || 0).toLocaleString()} words +
+ Click to Browse Documents
+
+ {(item.metadata.word_count || 0).toLocaleString()} words +
= {Math.ceil((item.metadata.word_count || 0) / 250).toLocaleString()} pages
diff --git a/archon-ui-main/src/pages/KnowledgeBasePage.tsx b/archon-ui-main/src/pages/KnowledgeBasePage.tsx index 7861ce2..9b1c96d 100644 --- a/archon-ui-main/src/pages/KnowledgeBasePage.tsx +++ b/archon-ui-main/src/pages/KnowledgeBasePage.tsx @@ -16,6 +16,7 @@ import { KnowledgeGridSkeleton, KnowledgeTableSkeleton } from '../components/kno import { GroupCreationModal } from '../components/knowledge-base/GroupCreationModal'; import { AddKnowledgeModal } from '../components/knowledge-base/AddKnowledgeModal'; import { CrawlingTab } from '../components/knowledge-base/CrawlingTab'; +import { DocumentBrowser } from '../components/knowledge-base/DocumentBrowser'; interface GroupedKnowledgeItem { id: string; @@ -53,6 +54,10 @@ export const KnowledgeBasePage = () => { const [isSelectionMode, setIsSelectionMode] = useState(false); const [lastSelectedIndex, setLastSelectedIndex] = useState(null); + // Document browser state + const [documentBrowserSourceId, setDocumentBrowserSourceId] = useState(null); + const [isDocumentBrowserOpen, setIsDocumentBrowserOpen] = useState(false); + const { showToast } = useToast(); // Load knowledge items @@ -267,6 +272,11 @@ export const KnowledgeBasePage = () => { const handleAddKnowledge = () => { setIsAddModalOpen(true); }; + + const handleBrowseDocuments = (sourceId: string) => { + setDocumentBrowserSourceId(sourceId); + setIsDocumentBrowserOpen(true); + }; const toggleSelectionMode = () => { setIsSelectionMode(!isSelectionMode); @@ -749,6 +759,7 @@ export const KnowledgeBasePage = () => { onDelete={handleDeleteItem} onUpdate={loadKnowledgeItems} onRefresh={handleRefreshItem} + onBrowseDocuments={handleBrowseDocuments} isSelectionMode={isSelectionMode} isSelected={selectedItems.has(item.id)} onToggleSelection={(e) => toggleItemSelection(item.id, index, e)} @@ -790,6 +801,18 @@ export const KnowledgeBasePage = () => { }} /> )} + + {/* Document Browser Modal */} + {isDocumentBrowserOpen && documentBrowserSourceId && ( + { + setIsDocumentBrowserOpen(false); + setDocumentBrowserSourceId(null); + }} + /> + )}
); }; \ No newline at end of file diff --git a/archon-ui-main/src/services/knowledgeBaseService.ts b/archon-ui-main/src/services/knowledgeBaseService.ts index 2a672e9..10ab752 100644 --- a/archon-ui-main/src/services/knowledgeBaseService.ts +++ b/archon-ui-main/src/services/knowledgeBaseService.ts @@ -205,6 +205,35 @@ class KnowledgeBaseService { }) } + /** + * Get document chunks for a knowledge item with optional domain filtering + */ + async getKnowledgeItemChunks(sourceId: string, domainFilter?: string) { + console.log('📄 [KnowledgeBase] Getting chunks for:', sourceId, 'domainFilter:', domainFilter); + + const params = new URLSearchParams(); + if (domainFilter) { + params.append('domain_filter', domainFilter); + } + + const queryString = params.toString(); + const endpoint = `/knowledge-items/${sourceId}/chunks${queryString ? `?${queryString}` : ''}`; + + return apiRequest<{ + success: boolean; + source_id: string; + domain_filter?: string; + chunks: Array<{ + id: string; + source_id: string; + content: string; + metadata?: any; + url?: string; + }>; + count: number; + }>(endpoint); + } + /** * Upload a document to the knowledge base with progress tracking */ @@ -295,6 +324,7 @@ class KnowledgeBaseService { count: number }>(`/knowledge-items/${sourceId}/code-examples`); } + } // Export singleton instance diff --git a/python/src/server/api_routes/knowledge_api.py b/python/src/server/api_routes/knowledge_api.py index 073eb7e..a443b89 100644 --- a/python/src/server/api_routes/knowledge_api.py +++ b/python/src/server/api_routes/knowledge_api.py @@ -237,6 +237,55 @@ async def delete_knowledge_item(source_id: str): raise HTTPException(status_code=500, detail={"error": str(e)}) +@router.get("/knowledge-items/{source_id}/chunks") +async def get_knowledge_item_chunks(source_id: str, domain_filter: str | None = None): + """Get all document chunks for a specific knowledge item with optional domain filtering.""" + try: + safe_logfire_info(f"Fetching chunks for source_id: {source_id}, domain_filter: {domain_filter}") + + # Query document chunks with content for this specific source + supabase = get_supabase_client() + + # Build the query + query = supabase.from_("archon_crawled_pages").select( + "id, source_id, content, metadata, url" + ) + query = query.eq("source_id", source_id) + + # Apply domain filtering if provided + if domain_filter: + # Case-insensitive URL match + query = query.ilike("url", f"%{domain_filter}%") + + # Deterministic ordering (URL then id) + query = query.order("url", desc=False).order("id", desc=False) + + result = query.execute() + if getattr(result, "error", None): + safe_logfire_error( + f"Supabase query error | source_id={source_id} | error={result.error}" + ) + raise HTTPException(status_code=500, detail={"error": str(result.error)}) + + chunks = result.data if result.data else [] + + safe_logfire_info(f"Found {len(chunks)} chunks for {source_id}") + + return { + "success": True, + "source_id": source_id, + "domain_filter": domain_filter, + "chunks": chunks, + "count": len(chunks), + } + + except Exception as e: + safe_logfire_error( + f"Failed to fetch chunks | error={str(e)} | source_id={source_id}" + ) + raise HTTPException(status_code=500, detail={"error": str(e)}) + + @router.get("/knowledge-items/{source_id}/code-examples") async def get_knowledge_item_code_examples(source_id: str): """Get all code examples for a specific knowledge item."""