-- ===================================================== -- Add Hybrid Search with ts_vector Support -- ===================================================== -- This migration adds efficient text search capabilities using PostgreSQL's -- full-text search features (ts_vector) to enable better keyword matching -- in hybrid search operations. -- ===================================================== -- Enable required extensions (pg_trgm for fuzzy matching) CREATE EXTENSION IF NOT EXISTS pg_trgm; -- ===================================================== -- SECTION 1: ADD TEXT SEARCH COLUMNS AND INDEXES -- ===================================================== -- Add ts_vector columns for full-text search if they don't exist ALTER TABLE archon_crawled_pages ADD COLUMN IF NOT EXISTS content_search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', content)) STORED; ALTER TABLE archon_code_examples ADD COLUMN IF NOT EXISTS content_search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', content || ' ' || COALESCE(summary, ''))) STORED; -- Create GIN indexes for fast text search CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_search ON archon_crawled_pages USING GIN (content_search_vector); CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_search ON archon_code_examples USING GIN (content_search_vector); -- Create trigram indexes for fuzzy matching (useful for typos and partial matches) CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_trgm ON archon_crawled_pages USING GIN (content gin_trgm_ops); CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_trgm ON archon_code_examples USING GIN (content gin_trgm_ops); CREATE INDEX IF NOT EXISTS idx_archon_code_examples_summary_trgm ON archon_code_examples USING GIN (summary gin_trgm_ops); -- ===================================================== -- SECTION 2: HYBRID SEARCH FUNCTIONS -- ===================================================== -- Hybrid search function for archon_crawled_pages CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages( query_embedding vector(1536), query_text TEXT, match_count INT DEFAULT 10, filter JSONB DEFAULT '{}'::jsonb, source_filter TEXT DEFAULT NULL ) RETURNS TABLE ( id BIGINT, url VARCHAR, chunk_number INTEGER, content TEXT, metadata JSONB, source_id TEXT, similarity FLOAT, match_type TEXT ) LANGUAGE plpgsql AS $$ DECLARE max_vector_results INT; max_text_results INT; BEGIN -- Calculate how many results to fetch from each search type max_vector_results := match_count; max_text_results := match_count; RETURN QUERY WITH vector_results AS ( -- Vector similarity search SELECT cp.id, cp.url, cp.chunk_number, cp.content, cp.metadata, cp.source_id, 1 - (cp.embedding <=> query_embedding) AS vector_sim FROM archon_crawled_pages cp WHERE cp.metadata @> filter AND (source_filter IS NULL OR cp.source_id = source_filter) AND cp.embedding IS NOT NULL ORDER BY cp.embedding <=> query_embedding LIMIT max_vector_results ), text_results AS ( -- Full-text search with ranking SELECT cp.id, cp.url, cp.chunk_number, cp.content, cp.metadata, cp.source_id, ts_rank_cd(cp.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim FROM archon_crawled_pages cp WHERE cp.metadata @> filter AND (source_filter IS NULL OR cp.source_id = source_filter) AND cp.content_search_vector @@ plainto_tsquery('english', query_text) ORDER BY text_sim DESC LIMIT max_text_results ), combined_results AS ( -- Combine results from both searches SELECT COALESCE(v.id, t.id) AS id, COALESCE(v.url, t.url) AS url, COALESCE(v.chunk_number, t.chunk_number) AS chunk_number, COALESCE(v.content, t.content) AS content, COALESCE(v.metadata, t.metadata) AS metadata, COALESCE(v.source_id, t.source_id) AS source_id, -- Use vector similarity if available, otherwise text similarity COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity, -- Determine match type CASE WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid' WHEN v.id IS NOT NULL THEN 'vector' ELSE 'keyword' END AS match_type FROM vector_results v FULL OUTER JOIN text_results t ON v.id = t.id ) SELECT * FROM combined_results ORDER BY similarity DESC LIMIT match_count; END; $$; -- Hybrid search function for archon_code_examples CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples( query_embedding vector(1536), query_text TEXT, match_count INT DEFAULT 10, filter JSONB DEFAULT '{}'::jsonb, source_filter TEXT DEFAULT NULL ) RETURNS TABLE ( id BIGINT, url VARCHAR, chunk_number INTEGER, content TEXT, summary TEXT, metadata JSONB, source_id TEXT, similarity FLOAT, match_type TEXT ) LANGUAGE plpgsql AS $$ DECLARE max_vector_results INT; max_text_results INT; BEGIN -- Calculate how many results to fetch from each search type max_vector_results := match_count; max_text_results := match_count; RETURN QUERY WITH vector_results AS ( -- Vector similarity search SELECT ce.id, ce.url, ce.chunk_number, ce.content, ce.summary, ce.metadata, ce.source_id, 1 - (ce.embedding <=> query_embedding) AS vector_sim FROM archon_code_examples ce WHERE ce.metadata @> filter AND (source_filter IS NULL OR ce.source_id = source_filter) AND ce.embedding IS NOT NULL ORDER BY ce.embedding <=> query_embedding LIMIT max_vector_results ), text_results AS ( -- Full-text search with ranking (searches both content and summary) SELECT ce.id, ce.url, ce.chunk_number, ce.content, ce.summary, ce.metadata, ce.source_id, ts_rank_cd(ce.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim FROM archon_code_examples ce WHERE ce.metadata @> filter AND (source_filter IS NULL OR ce.source_id = source_filter) AND ce.content_search_vector @@ plainto_tsquery('english', query_text) ORDER BY text_sim DESC LIMIT max_text_results ), combined_results AS ( -- Combine results from both searches SELECT COALESCE(v.id, t.id) AS id, COALESCE(v.url, t.url) AS url, COALESCE(v.chunk_number, t.chunk_number) AS chunk_number, COALESCE(v.content, t.content) AS content, COALESCE(v.summary, t.summary) AS summary, COALESCE(v.metadata, t.metadata) AS metadata, COALESCE(v.source_id, t.source_id) AS source_id, -- Use vector similarity if available, otherwise text similarity COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity, -- Determine match type CASE WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid' WHEN v.id IS NOT NULL THEN 'vector' ELSE 'keyword' END AS match_type FROM vector_results v FULL OUTER JOIN text_results t ON v.id = t.id ) SELECT * FROM combined_results ORDER BY similarity DESC LIMIT match_count; END; $$; -- ===================================================== -- SECTION 3: UPDATE EXISTING DATA -- ===================================================== -- Force regeneration of search vectors for existing data -- This is handled automatically by the GENERATED ALWAYS AS columns -- Add comment to document the new functionality COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Performs hybrid search combining vector similarity and full-text search with configurable weighting'; COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Performs hybrid search on code examples combining vector similarity and full-text search'; -- ===================================================== -- MIGRATION COMPLETE -- ===================================================== -- Hybrid search with ts_vector is now available! -- The search vectors will be automatically maintained -- as data is inserted or updated. -- =====================================================