Archon/iterations/v2-agentic-workflow/site_pages.sql
2025-02-07 15:04:02 -06:00

72 lines
1.9 KiB
PL/PgSQL

-- Enable the pgvector extension
create extension if not exists vector;
-- Create the documentation chunks table
create table site_pages (
id bigserial primary key,
url varchar not null,
chunk_number integer not null,
title varchar not null,
summary varchar not null,
content text not null, -- Added content column
metadata jsonb not null default '{}'::jsonb, -- Added metadata column
embedding vector(1536), -- OpenAI embeddings are 1536 dimensions
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
-- Add a unique constraint to prevent duplicate chunks for the same URL
unique(url, chunk_number)
);
-- Create an index for better vector similarity search performance
create index on site_pages using ivfflat (embedding vector_cosine_ops);
-- Create an index on metadata for faster filtering
create index idx_site_pages_metadata on site_pages using gin (metadata);
-- Create a function to search for documentation chunks
create function match_site_pages (
query_embedding vector(1536),
match_count int default 10,
filter jsonb DEFAULT '{}'::jsonb
) returns table (
id bigint,
url varchar,
chunk_number integer,
title varchar,
summary varchar,
content text,
metadata jsonb,
similarity float
)
language plpgsql
as $$
#variable_conflict use_column
begin
return query
select
id,
url,
chunk_number,
title,
summary,
content,
metadata,
1 - (site_pages.embedding <=> query_embedding) as similarity
from site_pages
where metadata @> filter
order by site_pages.embedding <=> query_embedding
limit match_count;
end;
$$;
-- Everything above will work for any PostgreSQL database. The below commands are for Supabase security
-- Enable RLS on the table
alter table site_pages enable row level security;
-- Create a policy that allows anyone to read
create policy "Allow public read access"
on site_pages
for select
to public
using (true);