Archon/migration/add_hybrid_search_tsvector.sql

237 lines
8.7 KiB
PL/PgSQL

-- =====================================================
-- Add Hybrid Search with ts_vector Support
-- =====================================================
-- This migration adds efficient text search capabilities using PostgreSQL's
-- full-text search features (ts_vector) to enable better keyword matching
-- in hybrid search operations.
-- =====================================================
-- Enable required extensions (pg_trgm for fuzzy matching)
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- =====================================================
-- SECTION 1: ADD TEXT SEARCH COLUMNS AND INDEXES
-- =====================================================
-- Add ts_vector columns for full-text search if they don't exist
ALTER TABLE archon_crawled_pages
ADD COLUMN IF NOT EXISTS content_search_vector tsvector
GENERATED ALWAYS AS (to_tsvector('english', content)) STORED;
ALTER TABLE archon_code_examples
ADD COLUMN IF NOT EXISTS content_search_vector tsvector
GENERATED ALWAYS AS (to_tsvector('english', content || ' ' || COALESCE(summary, ''))) STORED;
-- Create GIN indexes for fast text search
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_search ON archon_crawled_pages USING GIN (content_search_vector);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_search ON archon_code_examples USING GIN (content_search_vector);
-- Create trigram indexes for fuzzy matching (useful for typos and partial matches)
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_trgm ON archon_crawled_pages USING GIN (content gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_trgm ON archon_code_examples USING GIN (content gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_summary_trgm ON archon_code_examples USING GIN (summary gin_trgm_ops);
-- =====================================================
-- SECTION 2: HYBRID SEARCH FUNCTIONS
-- =====================================================
-- Hybrid search function for archon_crawled_pages
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
query_embedding vector(1536),
query_text TEXT,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT,
match_type TEXT
)
LANGUAGE plpgsql
AS $$
DECLARE
max_vector_results INT;
max_text_results INT;
BEGIN
-- Calculate how many results to fetch from each search type
max_vector_results := match_count;
max_text_results := match_count;
RETURN QUERY
WITH vector_results AS (
-- Vector similarity search
SELECT
cp.id,
cp.url,
cp.chunk_number,
cp.content,
cp.metadata,
cp.source_id,
1 - (cp.embedding <=> query_embedding) AS vector_sim
FROM archon_crawled_pages cp
WHERE cp.metadata @> filter
AND (source_filter IS NULL OR cp.source_id = source_filter)
AND cp.embedding IS NOT NULL
ORDER BY cp.embedding <=> query_embedding
LIMIT max_vector_results
),
text_results AS (
-- Full-text search with ranking
SELECT
cp.id,
cp.url,
cp.chunk_number,
cp.content,
cp.metadata,
cp.source_id,
ts_rank_cd(cp.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
FROM archon_crawled_pages cp
WHERE cp.metadata @> filter
AND (source_filter IS NULL OR cp.source_id = source_filter)
AND cp.content_search_vector @@ plainto_tsquery('english', query_text)
ORDER BY text_sim DESC
LIMIT max_text_results
),
combined_results AS (
-- Combine results from both searches
SELECT
COALESCE(v.id, t.id) AS id,
COALESCE(v.url, t.url) AS url,
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
COALESCE(v.content, t.content) AS content,
COALESCE(v.metadata, t.metadata) AS metadata,
COALESCE(v.source_id, t.source_id) AS source_id,
-- Use vector similarity if available, otherwise text similarity
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
-- Determine match type
CASE
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
WHEN v.id IS NOT NULL THEN 'vector'
ELSE 'keyword'
END AS match_type
FROM vector_results v
FULL OUTER JOIN text_results t ON v.id = t.id
)
SELECT * FROM combined_results
ORDER BY similarity DESC
LIMIT match_count;
END;
$$;
-- Hybrid search function for archon_code_examples
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
query_embedding vector(1536),
query_text TEXT,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT,
match_type TEXT
)
LANGUAGE plpgsql
AS $$
DECLARE
max_vector_results INT;
max_text_results INT;
BEGIN
-- Calculate how many results to fetch from each search type
max_vector_results := match_count;
max_text_results := match_count;
RETURN QUERY
WITH vector_results AS (
-- Vector similarity search
SELECT
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
1 - (ce.embedding <=> query_embedding) AS vector_sim
FROM archon_code_examples ce
WHERE ce.metadata @> filter
AND (source_filter IS NULL OR ce.source_id = source_filter)
AND ce.embedding IS NOT NULL
ORDER BY ce.embedding <=> query_embedding
LIMIT max_vector_results
),
text_results AS (
-- Full-text search with ranking (searches both content and summary)
SELECT
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
ts_rank_cd(ce.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
FROM archon_code_examples ce
WHERE ce.metadata @> filter
AND (source_filter IS NULL OR ce.source_id = source_filter)
AND ce.content_search_vector @@ plainto_tsquery('english', query_text)
ORDER BY text_sim DESC
LIMIT max_text_results
),
combined_results AS (
-- Combine results from both searches
SELECT
COALESCE(v.id, t.id) AS id,
COALESCE(v.url, t.url) AS url,
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
COALESCE(v.content, t.content) AS content,
COALESCE(v.summary, t.summary) AS summary,
COALESCE(v.metadata, t.metadata) AS metadata,
COALESCE(v.source_id, t.source_id) AS source_id,
-- Use vector similarity if available, otherwise text similarity
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
-- Determine match type
CASE
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
WHEN v.id IS NOT NULL THEN 'vector'
ELSE 'keyword'
END AS match_type
FROM vector_results v
FULL OUTER JOIN text_results t ON v.id = t.id
)
SELECT * FROM combined_results
ORDER BY similarity DESC
LIMIT match_count;
END;
$$;
-- =====================================================
-- SECTION 3: UPDATE EXISTING DATA
-- =====================================================
-- Force regeneration of search vectors for existing data
-- This is handled automatically by the GENERATED ALWAYS AS columns
-- Add comment to document the new functionality
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Performs hybrid search combining vector similarity and full-text search with configurable weighting';
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
-- Hybrid search with ts_vector is now available!
-- The search vectors will be automatically maintained
-- as data is inserted or updated.
-- =====================================================