Fix multi-dimensional vector hybrid search functions (#681)
Fixes critical bug where hybrid search functions referenced non-existent cp.embedding and ce.embedding columns instead of dimension-specific columns. Changes: - Add new multi-dimensional hybrid search functions with dynamic column selection - Maintain backward compatibility with existing legacy functions - Support all embedding dimensions: 384, 768, 1024, 1536, 3072 - Proper error handling for unsupported dimensions Resolves: #675 - RAG queries now work with multi-dimensional embeddings 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
parent
9ffca825ff
commit
85bd6bc012
@ -35,7 +35,115 @@ CREATE INDEX IF NOT EXISTS idx_archon_code_examples_summary_trgm ON archon_code_
|
||||
-- SECTION 2: HYBRID SEARCH FUNCTIONS
|
||||
-- =====================================================
|
||||
|
||||
-- Hybrid search function for archon_crawled_pages
|
||||
-- Multi-dimensional hybrid search function for archon_crawled_pages
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages_multi(
|
||||
query_embedding VECTOR,
|
||||
embedding_dimension INTEGER,
|
||||
query_text TEXT,
|
||||
match_count INT DEFAULT 10,
|
||||
filter JSONB DEFAULT '{}'::jsonb,
|
||||
source_filter TEXT DEFAULT NULL
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id BIGINT,
|
||||
url VARCHAR,
|
||||
chunk_number INTEGER,
|
||||
content TEXT,
|
||||
metadata JSONB,
|
||||
source_id TEXT,
|
||||
similarity FLOAT,
|
||||
match_type TEXT
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
#variable_conflict use_column
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
sql_query TEXT;
|
||||
embedding_column TEXT;
|
||||
BEGIN
|
||||
-- Determine which embedding column to use based on dimension
|
||||
CASE embedding_dimension
|
||||
WHEN 384 THEN embedding_column := 'embedding_384';
|
||||
WHEN 768 THEN embedding_column := 'embedding_768';
|
||||
WHEN 1024 THEN embedding_column := 'embedding_1024';
|
||||
WHEN 1536 THEN embedding_column := 'embedding_1536';
|
||||
WHEN 3072 THEN embedding_column := 'embedding_3072';
|
||||
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
|
||||
END CASE;
|
||||
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
-- Build dynamic query with proper embedding column
|
||||
sql_query := format('
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
1 - (cp.%I <=> $1) AS vector_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> $4
|
||||
AND ($5 IS NULL OR cp.source_id = $5)
|
||||
AND cp.%I IS NOT NULL
|
||||
ORDER BY cp.%I <=> $1
|
||||
LIMIT $2
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
ts_rank_cd(cp.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> $4
|
||||
AND ($5 IS NULL OR cp.source_id = $5)
|
||||
AND cp.content_search_vector @@ plainto_tsquery(''english'', $6)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT $3
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
SELECT
|
||||
COALESCE(v.id, t.id) AS id,
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
|
||||
WHEN v.id IS NOT NULL THEN ''vector''
|
||||
ELSE ''keyword''
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT $2',
|
||||
embedding_column, embedding_column, embedding_column);
|
||||
|
||||
-- Execute dynamic query
|
||||
RETURN QUERY EXECUTE sql_query USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Legacy compatibility function (defaults to 1536D)
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
|
||||
query_embedding vector(1536),
|
||||
query_text TEXT,
|
||||
@ -55,48 +163,91 @@ RETURNS TABLE (
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
BEGIN
|
||||
RETURN QUERY SELECT * FROM hybrid_search_archon_crawled_pages_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Multi-dimensional hybrid search function for archon_code_examples
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples_multi(
|
||||
query_embedding VECTOR,
|
||||
embedding_dimension INTEGER,
|
||||
query_text TEXT,
|
||||
match_count INT DEFAULT 10,
|
||||
filter JSONB DEFAULT '{}'::jsonb,
|
||||
source_filter TEXT DEFAULT NULL
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id BIGINT,
|
||||
url VARCHAR,
|
||||
chunk_number INTEGER,
|
||||
content TEXT,
|
||||
summary TEXT,
|
||||
metadata JSONB,
|
||||
source_id TEXT,
|
||||
similarity FLOAT,
|
||||
match_type TEXT
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
#variable_conflict use_column
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
sql_query TEXT;
|
||||
embedding_column TEXT;
|
||||
BEGIN
|
||||
-- Determine which embedding column to use based on dimension
|
||||
CASE embedding_dimension
|
||||
WHEN 384 THEN embedding_column := 'embedding_384';
|
||||
WHEN 768 THEN embedding_column := 'embedding_768';
|
||||
WHEN 1024 THEN embedding_column := 'embedding_1024';
|
||||
WHEN 1536 THEN embedding_column := 'embedding_1536';
|
||||
WHEN 3072 THEN embedding_column := 'embedding_3072';
|
||||
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
|
||||
END CASE;
|
||||
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
RETURN QUERY
|
||||
-- Build dynamic query with proper embedding column
|
||||
sql_query := format('
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
1 - (cp.embedding <=> query_embedding) AS vector_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> filter
|
||||
AND (source_filter IS NULL OR cp.source_id = source_filter)
|
||||
AND cp.embedding IS NOT NULL
|
||||
ORDER BY cp.embedding <=> query_embedding
|
||||
LIMIT max_vector_results
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
1 - (ce.%I <=> $1) AS vector_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> $4
|
||||
AND ($5 IS NULL OR ce.source_id = $5)
|
||||
AND ce.%I IS NOT NULL
|
||||
ORDER BY ce.%I <=> $1
|
||||
LIMIT $2
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking
|
||||
-- Full-text search with ranking (searches both content and summary)
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
ts_rank_cd(cp.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> filter
|
||||
AND (source_filter IS NULL OR cp.source_id = source_filter)
|
||||
AND cp.content_search_vector @@ plainto_tsquery('english', query_text)
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
ts_rank_cd(ce.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> $4
|
||||
AND ($5 IS NULL OR ce.source_id = $5)
|
||||
AND ce.content_search_vector @@ plainto_tsquery(''english'', $6)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT max_text_results
|
||||
LIMIT $3
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
@ -105,26 +256,31 @@ BEGIN
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.summary, t.summary) AS summary,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
|
||||
WHEN v.id IS NOT NULL THEN 'vector'
|
||||
ELSE 'keyword'
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
|
||||
WHEN v.id IS NOT NULL THEN ''vector''
|
||||
ELSE ''keyword''
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT match_count;
|
||||
LIMIT $2',
|
||||
embedding_column, embedding_column, embedding_column);
|
||||
|
||||
-- Execute dynamic query
|
||||
RETURN QUERY EXECUTE sql_query USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Hybrid search function for archon_code_examples
|
||||
-- Legacy compatibility function (defaults to 1536D)
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
|
||||
query_embedding vector(1536),
|
||||
query_text TEXT,
|
||||
@ -145,75 +301,8 @@ RETURNS TABLE (
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
BEGIN
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
RETURN QUERY
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
1 - (ce.embedding <=> query_embedding) AS vector_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> filter
|
||||
AND (source_filter IS NULL OR ce.source_id = source_filter)
|
||||
AND ce.embedding IS NOT NULL
|
||||
ORDER BY ce.embedding <=> query_embedding
|
||||
LIMIT max_vector_results
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking (searches both content and summary)
|
||||
SELECT
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
ts_rank_cd(ce.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> filter
|
||||
AND (source_filter IS NULL OR ce.source_id = source_filter)
|
||||
AND ce.content_search_vector @@ plainto_tsquery('english', query_text)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT max_text_results
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
SELECT
|
||||
COALESCE(v.id, t.id) AS id,
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.summary, t.summary) AS summary,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
|
||||
WHEN v.id IS NOT NULL THEN 'vector'
|
||||
ELSE 'keyword'
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT match_count;
|
||||
RETURN QUERY SELECT * FROM hybrid_search_archon_code_examples_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
|
||||
END;
|
||||
$$;
|
||||
|
||||
@ -224,9 +313,11 @@ $$;
|
||||
-- Force regeneration of search vectors for existing data
|
||||
-- This is handled automatically by the GENERATED ALWAYS AS columns
|
||||
|
||||
-- Add comment to document the new functionality
|
||||
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Performs hybrid search combining vector similarity and full-text search with configurable weighting';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
|
||||
-- Add comments to document the new functionality
|
||||
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages_multi IS 'Multi-dimensional hybrid search combining vector similarity and full-text search with configurable embedding dimensions';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Legacy hybrid search function for backward compatibility (uses 1536D embeddings)';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_code_examples_multi IS 'Multi-dimensional hybrid search on code examples with configurable embedding dimensions';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Legacy hybrid search function for code examples (uses 1536D embeddings)';
|
||||
|
||||
-- =====================================================
|
||||
-- MIGRATION COMPLETE
|
||||
|
||||
@ -474,7 +474,115 @@ $$;
|
||||
-- SECTION 5B: HYBRID SEARCH FUNCTIONS WITH TS_VECTOR
|
||||
-- =====================================================
|
||||
|
||||
-- Hybrid search function for archon_crawled_pages
|
||||
-- Multi-dimensional hybrid search function for archon_crawled_pages
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages_multi(
|
||||
query_embedding VECTOR,
|
||||
embedding_dimension INTEGER,
|
||||
query_text TEXT,
|
||||
match_count INT DEFAULT 10,
|
||||
filter JSONB DEFAULT '{}'::jsonb,
|
||||
source_filter TEXT DEFAULT NULL
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id BIGINT,
|
||||
url VARCHAR,
|
||||
chunk_number INTEGER,
|
||||
content TEXT,
|
||||
metadata JSONB,
|
||||
source_id TEXT,
|
||||
similarity FLOAT,
|
||||
match_type TEXT
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
#variable_conflict use_column
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
sql_query TEXT;
|
||||
embedding_column TEXT;
|
||||
BEGIN
|
||||
-- Determine which embedding column to use based on dimension
|
||||
CASE embedding_dimension
|
||||
WHEN 384 THEN embedding_column := 'embedding_384';
|
||||
WHEN 768 THEN embedding_column := 'embedding_768';
|
||||
WHEN 1024 THEN embedding_column := 'embedding_1024';
|
||||
WHEN 1536 THEN embedding_column := 'embedding_1536';
|
||||
WHEN 3072 THEN embedding_column := 'embedding_3072';
|
||||
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
|
||||
END CASE;
|
||||
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
-- Build dynamic query with proper embedding column
|
||||
sql_query := format('
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
1 - (cp.%I <=> $1) AS vector_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> $4
|
||||
AND ($5 IS NULL OR cp.source_id = $5)
|
||||
AND cp.%I IS NOT NULL
|
||||
ORDER BY cp.%I <=> $1
|
||||
LIMIT $2
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
ts_rank_cd(cp.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> $4
|
||||
AND ($5 IS NULL OR cp.source_id = $5)
|
||||
AND cp.content_search_vector @@ plainto_tsquery(''english'', $6)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT $3
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
SELECT
|
||||
COALESCE(v.id, t.id) AS id,
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
|
||||
WHEN v.id IS NOT NULL THEN ''vector''
|
||||
ELSE ''keyword''
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT $2',
|
||||
embedding_column, embedding_column, embedding_column);
|
||||
|
||||
-- Execute dynamic query
|
||||
RETURN QUERY EXECUTE sql_query USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Legacy compatibility function (defaults to 1536D)
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
|
||||
query_embedding vector(1536),
|
||||
query_text TEXT,
|
||||
@ -494,48 +602,91 @@ RETURNS TABLE (
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
BEGIN
|
||||
RETURN QUERY SELECT * FROM hybrid_search_archon_crawled_pages_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Multi-dimensional hybrid search function for archon_code_examples
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples_multi(
|
||||
query_embedding VECTOR,
|
||||
embedding_dimension INTEGER,
|
||||
query_text TEXT,
|
||||
match_count INT DEFAULT 10,
|
||||
filter JSONB DEFAULT '{}'::jsonb,
|
||||
source_filter TEXT DEFAULT NULL
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id BIGINT,
|
||||
url VARCHAR,
|
||||
chunk_number INTEGER,
|
||||
content TEXT,
|
||||
summary TEXT,
|
||||
metadata JSONB,
|
||||
source_id TEXT,
|
||||
similarity FLOAT,
|
||||
match_type TEXT
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
#variable_conflict use_column
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
sql_query TEXT;
|
||||
embedding_column TEXT;
|
||||
BEGIN
|
||||
-- Determine which embedding column to use based on dimension
|
||||
CASE embedding_dimension
|
||||
WHEN 384 THEN embedding_column := 'embedding_384';
|
||||
WHEN 768 THEN embedding_column := 'embedding_768';
|
||||
WHEN 1024 THEN embedding_column := 'embedding_1024';
|
||||
WHEN 1536 THEN embedding_column := 'embedding_1536';
|
||||
WHEN 3072 THEN embedding_column := 'embedding_3072';
|
||||
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
|
||||
END CASE;
|
||||
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
RETURN QUERY
|
||||
-- Build dynamic query with proper embedding column
|
||||
sql_query := format('
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
1 - (cp.embedding <=> query_embedding) AS vector_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> filter
|
||||
AND (source_filter IS NULL OR cp.source_id = source_filter)
|
||||
AND cp.embedding IS NOT NULL
|
||||
ORDER BY cp.embedding <=> query_embedding
|
||||
LIMIT max_vector_results
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
1 - (ce.%I <=> $1) AS vector_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> $4
|
||||
AND ($5 IS NULL OR ce.source_id = $5)
|
||||
AND ce.%I IS NOT NULL
|
||||
ORDER BY ce.%I <=> $1
|
||||
LIMIT $2
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking
|
||||
-- Full-text search with ranking (searches both content and summary)
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
ts_rank_cd(cp.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> filter
|
||||
AND (source_filter IS NULL OR cp.source_id = source_filter)
|
||||
AND cp.content_search_vector @@ plainto_tsquery('english', query_text)
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
ts_rank_cd(ce.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> $4
|
||||
AND ($5 IS NULL OR ce.source_id = $5)
|
||||
AND ce.content_search_vector @@ plainto_tsquery(''english'', $6)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT max_text_results
|
||||
LIMIT $3
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
@ -544,26 +695,31 @@ BEGIN
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.summary, t.summary) AS summary,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
|
||||
WHEN v.id IS NOT NULL THEN 'vector'
|
||||
ELSE 'keyword'
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
|
||||
WHEN v.id IS NOT NULL THEN ''vector''
|
||||
ELSE ''keyword''
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT match_count;
|
||||
LIMIT $2',
|
||||
embedding_column, embedding_column, embedding_column);
|
||||
|
||||
-- Execute dynamic query
|
||||
RETURN QUERY EXECUTE sql_query USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Hybrid search function for archon_code_examples
|
||||
-- Legacy compatibility function (defaults to 1536D)
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
|
||||
query_embedding vector(1536),
|
||||
query_text TEXT,
|
||||
@ -584,81 +740,16 @@ RETURNS TABLE (
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
BEGIN
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
RETURN QUERY
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
1 - (ce.embedding <=> query_embedding) AS vector_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> filter
|
||||
AND (source_filter IS NULL OR ce.source_id = source_filter)
|
||||
AND ce.embedding IS NOT NULL
|
||||
ORDER BY ce.embedding <=> query_embedding
|
||||
LIMIT max_vector_results
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking (searches both content and summary)
|
||||
SELECT
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
ts_rank_cd(ce.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> filter
|
||||
AND (source_filter IS NULL OR ce.source_id = source_filter)
|
||||
AND ce.content_search_vector @@ plainto_tsquery('english', query_text)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT max_text_results
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
SELECT
|
||||
COALESCE(v.id, t.id) AS id,
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.summary, t.summary) AS summary,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
|
||||
WHEN v.id IS NOT NULL THEN 'vector'
|
||||
ELSE 'keyword'
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT match_count;
|
||||
RETURN QUERY SELECT * FROM hybrid_search_archon_code_examples_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Add comments to document the new functionality
|
||||
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Performs hybrid search combining vector similarity and full-text search with configurable weighting';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages_multi IS 'Multi-dimensional hybrid search combining vector similarity and full-text search with configurable embedding dimensions';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Legacy hybrid search function for backward compatibility (uses 1536D embeddings)';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_code_examples_multi IS 'Multi-dimensional hybrid search on code examples with configurable embedding dimensions';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Legacy hybrid search function for code examples (uses 1536D embeddings)';
|
||||
|
||||
-- =====================================================
|
||||
-- SECTION 6: RLS POLICIES FOR KNOWLEDGE BASE
|
||||
|
||||
Loading…
Reference in New Issue
Block a user