Archon/migration/add_hybrid_search_tsvector.sql
John C Fitzpatrick 85bd6bc012
Fix multi-dimensional vector hybrid search functions (#681)
Fixes critical bug where hybrid search functions referenced non-existent
cp.embedding and ce.embedding columns instead of dimension-specific columns.

Changes:
- Add new multi-dimensional hybrid search functions with dynamic column selection
- Maintain backward compatibility with existing legacy functions
- Support all embedding dimensions: 384, 768, 1024, 1536, 3072
- Proper error handling for unsupported dimensions

Resolves: #675 - RAG queries now work with multi-dimensional embeddings

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
2025-09-18 10:06:10 -07:00

328 lines
12 KiB
PL/PgSQL

-- =====================================================
-- Add Hybrid Search with ts_vector Support
-- =====================================================
-- This migration adds efficient text search capabilities using PostgreSQL's
-- full-text search features (ts_vector) to enable better keyword matching
-- in hybrid search operations.
-- =====================================================
-- Enable required extensions (pg_trgm for fuzzy matching)
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- =====================================================
-- SECTION 1: ADD TEXT SEARCH COLUMNS AND INDEXES
-- =====================================================
-- Add ts_vector columns for full-text search if they don't exist
ALTER TABLE archon_crawled_pages
ADD COLUMN IF NOT EXISTS content_search_vector tsvector
GENERATED ALWAYS AS (to_tsvector('english', content)) STORED;
ALTER TABLE archon_code_examples
ADD COLUMN IF NOT EXISTS content_search_vector tsvector
GENERATED ALWAYS AS (to_tsvector('english', content || ' ' || COALESCE(summary, ''))) STORED;
-- Create GIN indexes for fast text search
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_search ON archon_crawled_pages USING GIN (content_search_vector);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_search ON archon_code_examples USING GIN (content_search_vector);
-- Create trigram indexes for fuzzy matching (useful for typos and partial matches)
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_trgm ON archon_crawled_pages USING GIN (content gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_trgm ON archon_code_examples USING GIN (content gin_trgm_ops);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_summary_trgm ON archon_code_examples USING GIN (summary gin_trgm_ops);
-- =====================================================
-- SECTION 2: HYBRID SEARCH FUNCTIONS
-- =====================================================
-- Multi-dimensional hybrid search function for archon_crawled_pages
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages_multi(
query_embedding VECTOR,
embedding_dimension INTEGER,
query_text TEXT,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT,
match_type TEXT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
max_vector_results INT;
max_text_results INT;
sql_query TEXT;
embedding_column TEXT;
BEGIN
-- Determine which embedding column to use based on dimension
CASE embedding_dimension
WHEN 384 THEN embedding_column := 'embedding_384';
WHEN 768 THEN embedding_column := 'embedding_768';
WHEN 1024 THEN embedding_column := 'embedding_1024';
WHEN 1536 THEN embedding_column := 'embedding_1536';
WHEN 3072 THEN embedding_column := 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
END CASE;
-- Calculate how many results to fetch from each search type
max_vector_results := match_count;
max_text_results := match_count;
-- Build dynamic query with proper embedding column
sql_query := format('
WITH vector_results AS (
-- Vector similarity search
SELECT
cp.id,
cp.url,
cp.chunk_number,
cp.content,
cp.metadata,
cp.source_id,
1 - (cp.%I <=> $1) AS vector_sim
FROM archon_crawled_pages cp
WHERE cp.metadata @> $4
AND ($5 IS NULL OR cp.source_id = $5)
AND cp.%I IS NOT NULL
ORDER BY cp.%I <=> $1
LIMIT $2
),
text_results AS (
-- Full-text search with ranking
SELECT
cp.id,
cp.url,
cp.chunk_number,
cp.content,
cp.metadata,
cp.source_id,
ts_rank_cd(cp.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
FROM archon_crawled_pages cp
WHERE cp.metadata @> $4
AND ($5 IS NULL OR cp.source_id = $5)
AND cp.content_search_vector @@ plainto_tsquery(''english'', $6)
ORDER BY text_sim DESC
LIMIT $3
),
combined_results AS (
-- Combine results from both searches
SELECT
COALESCE(v.id, t.id) AS id,
COALESCE(v.url, t.url) AS url,
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
COALESCE(v.content, t.content) AS content,
COALESCE(v.metadata, t.metadata) AS metadata,
COALESCE(v.source_id, t.source_id) AS source_id,
-- Use vector similarity if available, otherwise text similarity
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
-- Determine match type
CASE
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
WHEN v.id IS NOT NULL THEN ''vector''
ELSE ''keyword''
END AS match_type
FROM vector_results v
FULL OUTER JOIN text_results t ON v.id = t.id
)
SELECT * FROM combined_results
ORDER BY similarity DESC
LIMIT $2',
embedding_column, embedding_column, embedding_column);
-- Execute dynamic query
RETURN QUERY EXECUTE sql_query USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
END;
$$;
-- Legacy compatibility function (defaults to 1536D)
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
query_embedding vector(1536),
query_text TEXT,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT,
match_type TEXT
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY SELECT * FROM hybrid_search_archon_crawled_pages_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
END;
$$;
-- Multi-dimensional hybrid search function for archon_code_examples
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples_multi(
query_embedding VECTOR,
embedding_dimension INTEGER,
query_text TEXT,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT,
match_type TEXT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
max_vector_results INT;
max_text_results INT;
sql_query TEXT;
embedding_column TEXT;
BEGIN
-- Determine which embedding column to use based on dimension
CASE embedding_dimension
WHEN 384 THEN embedding_column := 'embedding_384';
WHEN 768 THEN embedding_column := 'embedding_768';
WHEN 1024 THEN embedding_column := 'embedding_1024';
WHEN 1536 THEN embedding_column := 'embedding_1536';
WHEN 3072 THEN embedding_column := 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
END CASE;
-- Calculate how many results to fetch from each search type
max_vector_results := match_count;
max_text_results := match_count;
-- Build dynamic query with proper embedding column
sql_query := format('
WITH vector_results AS (
-- Vector similarity search
SELECT
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
1 - (ce.%I <=> $1) AS vector_sim
FROM archon_code_examples ce
WHERE ce.metadata @> $4
AND ($5 IS NULL OR ce.source_id = $5)
AND ce.%I IS NOT NULL
ORDER BY ce.%I <=> $1
LIMIT $2
),
text_results AS (
-- Full-text search with ranking (searches both content and summary)
SELECT
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
ts_rank_cd(ce.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
FROM archon_code_examples ce
WHERE ce.metadata @> $4
AND ($5 IS NULL OR ce.source_id = $5)
AND ce.content_search_vector @@ plainto_tsquery(''english'', $6)
ORDER BY text_sim DESC
LIMIT $3
),
combined_results AS (
-- Combine results from both searches
SELECT
COALESCE(v.id, t.id) AS id,
COALESCE(v.url, t.url) AS url,
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
COALESCE(v.content, t.content) AS content,
COALESCE(v.summary, t.summary) AS summary,
COALESCE(v.metadata, t.metadata) AS metadata,
COALESCE(v.source_id, t.source_id) AS source_id,
-- Use vector similarity if available, otherwise text similarity
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
-- Determine match type
CASE
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
WHEN v.id IS NOT NULL THEN ''vector''
ELSE ''keyword''
END AS match_type
FROM vector_results v
FULL OUTER JOIN text_results t ON v.id = t.id
)
SELECT * FROM combined_results
ORDER BY similarity DESC
LIMIT $2',
embedding_column, embedding_column, embedding_column);
-- Execute dynamic query
RETURN QUERY EXECUTE sql_query USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
END;
$$;
-- Legacy compatibility function (defaults to 1536D)
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
query_embedding vector(1536),
query_text TEXT,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT,
match_type TEXT
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY SELECT * FROM hybrid_search_archon_code_examples_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
END;
$$;
-- =====================================================
-- SECTION 3: UPDATE EXISTING DATA
-- =====================================================
-- Force regeneration of search vectors for existing data
-- This is handled automatically by the GENERATED ALWAYS AS columns
-- Add comments to document the new functionality
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages_multi IS 'Multi-dimensional hybrid search combining vector similarity and full-text search with configurable embedding dimensions';
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Legacy hybrid search function for backward compatibility (uses 1536D embeddings)';
COMMENT ON FUNCTION hybrid_search_archon_code_examples_multi IS 'Multi-dimensional hybrid search on code examples with configurable embedding dimensions';
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Legacy hybrid search function for code examples (uses 1536D embeddings)';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
-- Hybrid search with ts_vector is now available!
-- The search vectors will be automatically maintained
-- as data is inserted or updated.
-- =====================================================