Fix multi-dimensional vector hybrid search functions (#681)

Fixes a critical bug where the hybrid search functions referenced the non-existent
cp.embedding and ce.embedding columns instead of the dimension-specific columns
(embedding_384, embedding_768, embedding_1024, embedding_1536, embedding_3072).

Changes:
- Add new multi-dimensional hybrid search functions with dynamic column selection
- Maintain backward compatibility with existing legacy functions
- Support all embedding dimensions: 384, 768, 1024, 1536, 3072
- Raise an explicit exception for unsupported embedding dimensions

Resolves: #675 - RAG queries now work with multi-dimensional embeddings

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
John C Fitzpatrick 2025-09-18 10:06:10 -07:00 committed by GitHub
parent 9ffca825ff
commit 85bd6bc012
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 389 additions and 207 deletions

View File

@ -35,7 +35,115 @@ CREATE INDEX IF NOT EXISTS idx_archon_code_examples_summary_trgm ON archon_code_
-- SECTION 2: HYBRID SEARCH FUNCTIONS
-- =====================================================
-- Hybrid search function for archon_crawled_pages
-- Multi-dimensional hybrid search function for archon_crawled_pages.
--
-- Runs a vector-similarity search and a full-text search against
-- archon_crawled_pages, merges the two result sets on id, and returns at most
-- match_count rows ordered by similarity (highest first).
--
-- Parameters:
--   query_embedding     - query vector, compared with the <=> operator against
--                         the embedding_<dimension> column
--   embedding_dimension - selects the column: 384, 768, 1024, 1536 or 3072
--   query_text          - text passed to plainto_tsquery('english', ...)
--   match_count         - row limit applied to each sub-search and to the
--                         merged result (default 10)
--   filter              - rows must satisfy metadata @> filter (default '{}')
--   source_filter       - when not NULL, rows must have this source_id
--
-- Returns one row per matched page. similarity is the vector score
-- (1 - distance) when the row was found by the vector search, otherwise the
-- ts_rank_cd text score. match_type is 'hybrid', 'vector' or 'keyword'.
--
-- Raises an exception for an unsupported embedding_dimension, or when the
-- query vector's actual dimension differs from embedding_dimension.
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages_multi(
    query_embedding VECTOR,
    embedding_dimension INTEGER,
    query_text TEXT,
    match_count INT DEFAULT 10,
    filter JSONB DEFAULT '{}'::jsonb,
    source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
    id BIGINT,
    url VARCHAR,
    chunk_number INTEGER,
    content TEXT,
    metadata JSONB,
    source_id TEXT,
    similarity FLOAT,
    match_type TEXT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
    max_vector_results INT;
    max_text_results INT;
    sql_query TEXT;
    embedding_column TEXT;
BEGIN
    -- Determine which embedding column to use based on dimension.
    -- The column name is taken only from this fixed list, never from caller text.
    CASE embedding_dimension
        WHEN 384 THEN embedding_column := 'embedding_384';
        WHEN 768 THEN embedding_column := 'embedding_768';
        WHEN 1024 THEN embedding_column := 'embedding_1024';
        WHEN 1536 THEN embedding_column := 'embedding_1536';
        WHEN 3072 THEN embedding_column := 'embedding_3072';
        ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
    END CASE;

    -- Fail early, with both numbers in the message, when the supplied vector does
    -- not have the declared dimension. Without this check the mismatch is only
    -- reported by the distance operator inside the dynamic query, and only if at
    -- least one row is actually compared. A NULL query_embedding yields NULL here
    -- and is passed through unchanged.
    IF vector_dims(query_embedding) <> embedding_dimension THEN
        RAISE EXCEPTION 'Query embedding has % dimensions but embedding_dimension is %',
            vector_dims(query_embedding), embedding_dimension;
    END IF;

    -- Both sub-searches currently fetch match_count candidates each
    max_vector_results := match_count;
    max_text_results := match_count;

    -- Build dynamic query with proper embedding column.
    -- Placeholders: $1 query_embedding, $2 vector limit (also the final limit),
    -- $3 text limit, $4 filter, $5 source_filter, $6 query_text.
    sql_query := format('
        WITH vector_results AS (
            -- Vector similarity search.
            -- ORDER BY is kept to the bare distance expression on purpose.
            SELECT
                cp.id,
                cp.url,
                cp.chunk_number,
                cp.content,
                cp.metadata,
                cp.source_id,
                1 - (cp.%I <=> $1) AS vector_sim
            FROM archon_crawled_pages cp
            WHERE cp.metadata @> $4
                AND ($5 IS NULL OR cp.source_id = $5)
                AND cp.%I IS NOT NULL
            ORDER BY cp.%I <=> $1
            LIMIT $2
        ),
        text_results AS (
            -- Full-text search with ranking; id breaks ties deterministically
            SELECT
                cp.id,
                cp.url,
                cp.chunk_number,
                cp.content,
                cp.metadata,
                cp.source_id,
                ts_rank_cd(cp.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
            FROM archon_crawled_pages cp
            WHERE cp.metadata @> $4
                AND ($5 IS NULL OR cp.source_id = $5)
                AND cp.content_search_vector @@ plainto_tsquery(''english'', $6)
            ORDER BY text_sim DESC, cp.id
            LIMIT $3
        ),
        combined_results AS (
            -- Combine results from both searches
            SELECT
                COALESCE(v.id, t.id) AS id,
                COALESCE(v.url, t.url) AS url,
                COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
                COALESCE(v.content, t.content) AS content,
                COALESCE(v.metadata, t.metadata) AS metadata,
                COALESCE(v.source_id, t.source_id) AS source_id,
                -- Use vector similarity if available, otherwise text similarity
                COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
                -- Determine match type
                CASE
                    WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
                    WHEN v.id IS NOT NULL THEN ''vector''
                    ELSE ''keyword''
                END AS match_type
            FROM vector_results v
            FULL OUTER JOIN text_results t ON v.id = t.id
        )
        SELECT
            id,
            url,
            chunk_number,
            content,
            metadata,
            source_id,
            similarity,
            match_type
        FROM combined_results
        ORDER BY similarity DESC, id
        LIMIT $2',
        embedding_column, embedding_column, embedding_column);

    -- Execute dynamic query; all caller-supplied values are bound, not interpolated
    RETURN QUERY EXECUTE sql_query
        USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
END;
$$;
-- Legacy compatibility function (defaults to 1536D)
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
query_embedding vector(1536),
query_text TEXT,
@ -55,48 +163,91 @@ RETURNS TABLE (
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY SELECT * FROM hybrid_search_archon_crawled_pages_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
END;
$$;
-- Multi-dimensional hybrid search function for archon_code_examples
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples_multi(
query_embedding VECTOR,
embedding_dimension INTEGER,
query_text TEXT,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT,
match_type TEXT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
max_vector_results INT;
max_text_results INT;
sql_query TEXT;
embedding_column TEXT;
BEGIN
-- Determine which embedding column to use based on dimension
CASE embedding_dimension
WHEN 384 THEN embedding_column := 'embedding_384';
WHEN 768 THEN embedding_column := 'embedding_768';
WHEN 1024 THEN embedding_column := 'embedding_1024';
WHEN 1536 THEN embedding_column := 'embedding_1536';
WHEN 3072 THEN embedding_column := 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
END CASE;
-- Calculate how many results to fetch from each search type
max_vector_results := match_count;
max_text_results := match_count;
RETURN QUERY
-- Build dynamic query with proper embedding column
sql_query := format('
WITH vector_results AS (
-- Vector similarity search
SELECT
cp.id,
cp.url,
cp.chunk_number,
cp.content,
cp.metadata,
cp.source_id,
1 - (cp.embedding <=> query_embedding) AS vector_sim
FROM archon_crawled_pages cp
WHERE cp.metadata @> filter
AND (source_filter IS NULL OR cp.source_id = source_filter)
AND cp.embedding IS NOT NULL
ORDER BY cp.embedding <=> query_embedding
LIMIT max_vector_results
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
1 - (ce.%I <=> $1) AS vector_sim
FROM archon_code_examples ce
WHERE ce.metadata @> $4
AND ($5 IS NULL OR ce.source_id = $5)
AND ce.%I IS NOT NULL
ORDER BY ce.%I <=> $1
LIMIT $2
),
text_results AS (
-- Full-text search with ranking
-- Full-text search with ranking (searches both content and summary)
SELECT
cp.id,
cp.url,
cp.chunk_number,
cp.content,
cp.metadata,
cp.source_id,
ts_rank_cd(cp.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
FROM archon_crawled_pages cp
WHERE cp.metadata @> filter
AND (source_filter IS NULL OR cp.source_id = source_filter)
AND cp.content_search_vector @@ plainto_tsquery('english', query_text)
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
ts_rank_cd(ce.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
FROM archon_code_examples ce
WHERE ce.metadata @> $4
AND ($5 IS NULL OR ce.source_id = $5)
AND ce.content_search_vector @@ plainto_tsquery(''english'', $6)
ORDER BY text_sim DESC
LIMIT max_text_results
LIMIT $3
),
combined_results AS (
-- Combine results from both searches
@ -105,26 +256,31 @@ BEGIN
COALESCE(v.url, t.url) AS url,
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
COALESCE(v.content, t.content) AS content,
COALESCE(v.summary, t.summary) AS summary,
COALESCE(v.metadata, t.metadata) AS metadata,
COALESCE(v.source_id, t.source_id) AS source_id,
-- Use vector similarity if available, otherwise text similarity
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
-- Determine match type
CASE
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
WHEN v.id IS NOT NULL THEN 'vector'
ELSE 'keyword'
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
WHEN v.id IS NOT NULL THEN ''vector''
ELSE ''keyword''
END AS match_type
FROM vector_results v
FULL OUTER JOIN text_results t ON v.id = t.id
)
SELECT * FROM combined_results
ORDER BY similarity DESC
LIMIT match_count;
LIMIT $2',
embedding_column, embedding_column, embedding_column);
-- Execute dynamic query
RETURN QUERY EXECUTE sql_query USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
END;
$$;
-- Hybrid search function for archon_code_examples
-- Legacy compatibility function (defaults to 1536D)
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
query_embedding vector(1536),
query_text TEXT,
@ -145,75 +301,8 @@ RETURNS TABLE (
)
LANGUAGE plpgsql
AS $$
DECLARE
max_vector_results INT;
max_text_results INT;
BEGIN
-- Calculate how many results to fetch from each search type
max_vector_results := match_count;
max_text_results := match_count;
RETURN QUERY
WITH vector_results AS (
-- Vector similarity search
SELECT
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
1 - (ce.embedding <=> query_embedding) AS vector_sim
FROM archon_code_examples ce
WHERE ce.metadata @> filter
AND (source_filter IS NULL OR ce.source_id = source_filter)
AND ce.embedding IS NOT NULL
ORDER BY ce.embedding <=> query_embedding
LIMIT max_vector_results
),
text_results AS (
-- Full-text search with ranking (searches both content and summary)
SELECT
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
ts_rank_cd(ce.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
FROM archon_code_examples ce
WHERE ce.metadata @> filter
AND (source_filter IS NULL OR ce.source_id = source_filter)
AND ce.content_search_vector @@ plainto_tsquery('english', query_text)
ORDER BY text_sim DESC
LIMIT max_text_results
),
combined_results AS (
-- Combine results from both searches
SELECT
COALESCE(v.id, t.id) AS id,
COALESCE(v.url, t.url) AS url,
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
COALESCE(v.content, t.content) AS content,
COALESCE(v.summary, t.summary) AS summary,
COALESCE(v.metadata, t.metadata) AS metadata,
COALESCE(v.source_id, t.source_id) AS source_id,
-- Use vector similarity if available, otherwise text similarity
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
-- Determine match type
CASE
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
WHEN v.id IS NOT NULL THEN 'vector'
ELSE 'keyword'
END AS match_type
FROM vector_results v
FULL OUTER JOIN text_results t ON v.id = t.id
)
SELECT * FROM combined_results
ORDER BY similarity DESC
LIMIT match_count;
RETURN QUERY SELECT * FROM hybrid_search_archon_code_examples_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
END;
$$;
@ -224,9 +313,11 @@ $$;
-- Force regeneration of search vectors for existing data
-- This is handled automatically by the GENERATED ALWAYS AS columns
-- Add comment to document the new functionality
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Performs hybrid search combining vector similarity and full-text search with configurable weighting';
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
-- Add comments to document the new functionality
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages_multi IS 'Multi-dimensional hybrid search combining vector similarity and full-text search with configurable embedding dimensions';
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Legacy hybrid search function for backward compatibility (uses 1536D embeddings)';
COMMENT ON FUNCTION hybrid_search_archon_code_examples_multi IS 'Multi-dimensional hybrid search on code examples with configurable embedding dimensions';
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Legacy hybrid search function for code examples (uses 1536D embeddings)';
-- =====================================================
-- MIGRATION COMPLETE

View File

@ -474,7 +474,115 @@ $$;
-- SECTION 5B: HYBRID SEARCH FUNCTIONS WITH TS_VECTOR
-- =====================================================
-- Hybrid search function for archon_crawled_pages
-- Multi-dimensional hybrid search function for archon_crawled_pages.
--
-- Runs a vector-similarity search and a full-text search against
-- archon_crawled_pages, merges the two result sets on id, and returns at most
-- match_count rows ordered by similarity (highest first).
--
-- Parameters:
--   query_embedding     - query vector, compared with the <=> operator against
--                         the embedding_<dimension> column
--   embedding_dimension - selects the column: 384, 768, 1024, 1536 or 3072
--   query_text          - text passed to plainto_tsquery('english', ...)
--   match_count         - row limit applied to each sub-search and to the
--                         merged result (default 10)
--   filter              - rows must satisfy metadata @> filter (default '{}')
--   source_filter       - when not NULL, rows must have this source_id
--
-- Returns one row per matched page. similarity is the vector score
-- (1 - distance) when the row was found by the vector search, otherwise the
-- ts_rank_cd text score. match_type is 'hybrid', 'vector' or 'keyword'.
--
-- Raises an exception for an unsupported embedding_dimension, or when the
-- query vector's actual dimension differs from embedding_dimension.
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages_multi(
    query_embedding VECTOR,
    embedding_dimension INTEGER,
    query_text TEXT,
    match_count INT DEFAULT 10,
    filter JSONB DEFAULT '{}'::jsonb,
    source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
    id BIGINT,
    url VARCHAR,
    chunk_number INTEGER,
    content TEXT,
    metadata JSONB,
    source_id TEXT,
    similarity FLOAT,
    match_type TEXT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
    max_vector_results INT;
    max_text_results INT;
    sql_query TEXT;
    embedding_column TEXT;
BEGIN
    -- Determine which embedding column to use based on dimension.
    -- The column name is taken only from this fixed list, never from caller text.
    CASE embedding_dimension
        WHEN 384 THEN embedding_column := 'embedding_384';
        WHEN 768 THEN embedding_column := 'embedding_768';
        WHEN 1024 THEN embedding_column := 'embedding_1024';
        WHEN 1536 THEN embedding_column := 'embedding_1536';
        WHEN 3072 THEN embedding_column := 'embedding_3072';
        ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
    END CASE;

    -- Fail early, with both numbers in the message, when the supplied vector does
    -- not have the declared dimension. Without this check the mismatch is only
    -- reported by the distance operator inside the dynamic query, and only if at
    -- least one row is actually compared. A NULL query_embedding yields NULL here
    -- and is passed through unchanged.
    IF vector_dims(query_embedding) <> embedding_dimension THEN
        RAISE EXCEPTION 'Query embedding has % dimensions but embedding_dimension is %',
            vector_dims(query_embedding), embedding_dimension;
    END IF;

    -- Both sub-searches currently fetch match_count candidates each
    max_vector_results := match_count;
    max_text_results := match_count;

    -- Build dynamic query with proper embedding column.
    -- Placeholders: $1 query_embedding, $2 vector limit (also the final limit),
    -- $3 text limit, $4 filter, $5 source_filter, $6 query_text.
    sql_query := format('
        WITH vector_results AS (
            -- Vector similarity search.
            -- ORDER BY is kept to the bare distance expression on purpose.
            SELECT
                cp.id,
                cp.url,
                cp.chunk_number,
                cp.content,
                cp.metadata,
                cp.source_id,
                1 - (cp.%I <=> $1) AS vector_sim
            FROM archon_crawled_pages cp
            WHERE cp.metadata @> $4
                AND ($5 IS NULL OR cp.source_id = $5)
                AND cp.%I IS NOT NULL
            ORDER BY cp.%I <=> $1
            LIMIT $2
        ),
        text_results AS (
            -- Full-text search with ranking; id breaks ties deterministically
            SELECT
                cp.id,
                cp.url,
                cp.chunk_number,
                cp.content,
                cp.metadata,
                cp.source_id,
                ts_rank_cd(cp.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
            FROM archon_crawled_pages cp
            WHERE cp.metadata @> $4
                AND ($5 IS NULL OR cp.source_id = $5)
                AND cp.content_search_vector @@ plainto_tsquery(''english'', $6)
            ORDER BY text_sim DESC, cp.id
            LIMIT $3
        ),
        combined_results AS (
            -- Combine results from both searches
            SELECT
                COALESCE(v.id, t.id) AS id,
                COALESCE(v.url, t.url) AS url,
                COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
                COALESCE(v.content, t.content) AS content,
                COALESCE(v.metadata, t.metadata) AS metadata,
                COALESCE(v.source_id, t.source_id) AS source_id,
                -- Use vector similarity if available, otherwise text similarity
                COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
                -- Determine match type
                CASE
                    WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
                    WHEN v.id IS NOT NULL THEN ''vector''
                    ELSE ''keyword''
                END AS match_type
            FROM vector_results v
            FULL OUTER JOIN text_results t ON v.id = t.id
        )
        SELECT
            id,
            url,
            chunk_number,
            content,
            metadata,
            source_id,
            similarity,
            match_type
        FROM combined_results
        ORDER BY similarity DESC, id
        LIMIT $2',
        embedding_column, embedding_column, embedding_column);

    -- Execute dynamic query; all caller-supplied values are bound, not interpolated
    RETURN QUERY EXECUTE sql_query
        USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
END;
$$;
-- Legacy compatibility function (defaults to 1536D)
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
query_embedding vector(1536),
query_text TEXT,
@ -494,48 +602,91 @@ RETURNS TABLE (
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY SELECT * FROM hybrid_search_archon_crawled_pages_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
END;
$$;
-- Multi-dimensional hybrid search function for archon_code_examples
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples_multi(
query_embedding VECTOR,
embedding_dimension INTEGER,
query_text TEXT,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT,
match_type TEXT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
max_vector_results INT;
max_text_results INT;
sql_query TEXT;
embedding_column TEXT;
BEGIN
-- Determine which embedding column to use based on dimension
CASE embedding_dimension
WHEN 384 THEN embedding_column := 'embedding_384';
WHEN 768 THEN embedding_column := 'embedding_768';
WHEN 1024 THEN embedding_column := 'embedding_1024';
WHEN 1536 THEN embedding_column := 'embedding_1536';
WHEN 3072 THEN embedding_column := 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
END CASE;
-- Calculate how many results to fetch from each search type
max_vector_results := match_count;
max_text_results := match_count;
RETURN QUERY
-- Build dynamic query with proper embedding column
sql_query := format('
WITH vector_results AS (
-- Vector similarity search
SELECT
cp.id,
cp.url,
cp.chunk_number,
cp.content,
cp.metadata,
cp.source_id,
1 - (cp.embedding <=> query_embedding) AS vector_sim
FROM archon_crawled_pages cp
WHERE cp.metadata @> filter
AND (source_filter IS NULL OR cp.source_id = source_filter)
AND cp.embedding IS NOT NULL
ORDER BY cp.embedding <=> query_embedding
LIMIT max_vector_results
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
1 - (ce.%I <=> $1) AS vector_sim
FROM archon_code_examples ce
WHERE ce.metadata @> $4
AND ($5 IS NULL OR ce.source_id = $5)
AND ce.%I IS NOT NULL
ORDER BY ce.%I <=> $1
LIMIT $2
),
text_results AS (
-- Full-text search with ranking
-- Full-text search with ranking (searches both content and summary)
SELECT
cp.id,
cp.url,
cp.chunk_number,
cp.content,
cp.metadata,
cp.source_id,
ts_rank_cd(cp.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
FROM archon_crawled_pages cp
WHERE cp.metadata @> filter
AND (source_filter IS NULL OR cp.source_id = source_filter)
AND cp.content_search_vector @@ plainto_tsquery('english', query_text)
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
ts_rank_cd(ce.content_search_vector, plainto_tsquery(''english'', $6)) AS text_sim
FROM archon_code_examples ce
WHERE ce.metadata @> $4
AND ($5 IS NULL OR ce.source_id = $5)
AND ce.content_search_vector @@ plainto_tsquery(''english'', $6)
ORDER BY text_sim DESC
LIMIT max_text_results
LIMIT $3
),
combined_results AS (
-- Combine results from both searches
@ -544,26 +695,31 @@ BEGIN
COALESCE(v.url, t.url) AS url,
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
COALESCE(v.content, t.content) AS content,
COALESCE(v.summary, t.summary) AS summary,
COALESCE(v.metadata, t.metadata) AS metadata,
COALESCE(v.source_id, t.source_id) AS source_id,
-- Use vector similarity if available, otherwise text similarity
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
-- Determine match type
CASE
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
WHEN v.id IS NOT NULL THEN 'vector'
ELSE 'keyword'
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN ''hybrid''
WHEN v.id IS NOT NULL THEN ''vector''
ELSE ''keyword''
END AS match_type
FROM vector_results v
FULL OUTER JOIN text_results t ON v.id = t.id
)
SELECT * FROM combined_results
ORDER BY similarity DESC
LIMIT match_count;
LIMIT $2',
embedding_column, embedding_column, embedding_column);
-- Execute dynamic query
RETURN QUERY EXECUTE sql_query USING query_embedding, max_vector_results, max_text_results, filter, source_filter, query_text;
END;
$$;
-- Hybrid search function for archon_code_examples
-- Legacy compatibility function (defaults to 1536D)
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
query_embedding vector(1536),
query_text TEXT,
@ -584,81 +740,16 @@ RETURNS TABLE (
)
LANGUAGE plpgsql
AS $$
DECLARE
max_vector_results INT;
max_text_results INT;
BEGIN
-- Calculate how many results to fetch from each search type
max_vector_results := match_count;
max_text_results := match_count;
RETURN QUERY
WITH vector_results AS (
-- Vector similarity search
SELECT
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
1 - (ce.embedding <=> query_embedding) AS vector_sim
FROM archon_code_examples ce
WHERE ce.metadata @> filter
AND (source_filter IS NULL OR ce.source_id = source_filter)
AND ce.embedding IS NOT NULL
ORDER BY ce.embedding <=> query_embedding
LIMIT max_vector_results
),
text_results AS (
-- Full-text search with ranking (searches both content and summary)
SELECT
ce.id,
ce.url,
ce.chunk_number,
ce.content,
ce.summary,
ce.metadata,
ce.source_id,
ts_rank_cd(ce.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
FROM archon_code_examples ce
WHERE ce.metadata @> filter
AND (source_filter IS NULL OR ce.source_id = source_filter)
AND ce.content_search_vector @@ plainto_tsquery('english', query_text)
ORDER BY text_sim DESC
LIMIT max_text_results
),
combined_results AS (
-- Combine results from both searches
SELECT
COALESCE(v.id, t.id) AS id,
COALESCE(v.url, t.url) AS url,
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
COALESCE(v.content, t.content) AS content,
COALESCE(v.summary, t.summary) AS summary,
COALESCE(v.metadata, t.metadata) AS metadata,
COALESCE(v.source_id, t.source_id) AS source_id,
-- Use vector similarity if available, otherwise text similarity
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
-- Determine match type
CASE
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
WHEN v.id IS NOT NULL THEN 'vector'
ELSE 'keyword'
END AS match_type
FROM vector_results v
FULL OUTER JOIN text_results t ON v.id = t.id
)
SELECT * FROM combined_results
ORDER BY similarity DESC
LIMIT match_count;
RETURN QUERY SELECT * FROM hybrid_search_archon_code_examples_multi(query_embedding, 1536, query_text, match_count, filter, source_filter);
END;
$$;
-- Add comments to document the new functionality
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Performs hybrid search combining vector similarity and full-text search with configurable weighting';
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages_multi IS 'Multi-dimensional hybrid search combining vector similarity and full-text search with configurable embedding dimensions';
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Legacy hybrid search function for backward compatibility (uses 1536D embeddings)';
COMMENT ON FUNCTION hybrid_search_archon_code_examples_multi IS 'Multi-dimensional hybrid search on code examples with configurable embedding dimensions';
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Legacy hybrid search function for code examples (uses 1536D embeddings)';
-- =====================================================
-- SECTION 6: RLS POLICIES FOR KNOWLEDGE BASE