-
Notifications
You must be signed in to change notification settings - Fork 569
Expand file tree
/
Copy pathcrawled_pages.sql
More file actions
175 lines (153 loc) · 5.04 KB
/
crawled_pages.sql
File metadata and controls
175 lines (153 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
-- Enable the pgvector extension (provides the vector type and the
-- ANN distance operators used below)
create extension if not exists vector;

-- Drop any existing tables so the script can be rerun from scratch.
-- The child tables (crawled_pages, code_examples) are listed before
-- their parent (sources) so foreign keys never block the drop.
drop table if exists crawled_pages, code_examples, sources;
-- Create the sources table: one row per crawled source, referenced by
-- both crawled_pages and code_examples via source_id.
create table sources (
    source_id text primary key,
    summary text,
    total_word_count integer default 0,
    -- now() already yields the correct instant for a timestamptz column.
    -- The previous default, timezone('utc'::text, now()), produced a plain
    -- timestamp that was then re-interpreted in the session time zone,
    -- silently shifting the stored instant whenever the session TimeZone
    -- was not UTC.
    created_at timestamp with time zone default now() not null,
    updated_at timestamp with time zone default now() not null
);
-- Create the documentation chunks table: one row per chunk of a crawled page.
create table crawled_pages (
    id bigserial primary key,
    url varchar not null,
    chunk_number integer not null,
    content text not null,
    metadata jsonb not null default '{}'::jsonb,
    source_id text not null,
    embedding vector(1536),  -- OpenAI embeddings are 1536 dimensions
    -- now() already yields the correct instant for a timestamptz column;
    -- the previous timezone('utc'::text, now()) default shifted the stored
    -- instant in non-UTC sessions.
    created_at timestamp with time zone default now() not null,
    -- Named constraints are greppable in error messages and migrations.
    -- Prevent duplicate chunks for the same URL:
    constraint crawled_pages_url_chunk_number_key unique (url, chunk_number),
    -- NOTE(review): no ON DELETE action is specified, so deleting a source
    -- that still has pages fails (default NO ACTION) — confirm that is
    -- intended before adding CASCADE.
    constraint crawled_pages_source_id_fk
        foreign key (source_id) references sources(source_id)
);
-- Vector similarity index (cosine distance) for ANN search.
-- Named explicitly for consistency with the other indexes in this file.
-- NOTE(review): ivfflat builds its cluster lists from existing rows, so this
-- index should ideally be (re)built after the table is populated; `lists`
-- is left at the pgvector default.
create index idx_crawled_pages_embedding
    on crawled_pages using ivfflat (embedding vector_cosine_ops);
-- GIN index on metadata for fast jsonb containment (@>) filtering
create index idx_crawled_pages_metadata on crawled_pages using gin (metadata);
-- Index on source_id for fast per-source filtering
create index idx_crawled_pages_source_id on crawled_pages (source_id);
-- Search crawled_pages by cosine similarity to query_embedding, optionally
-- restricted to rows whose metadata contains `filter` (jsonb containment)
-- and/or to a single source.
--
-- Parameters:
--   query_embedding  1536-dim vector to match against
--   match_count      maximum number of rows to return (default 10)
--   filter           jsonb object the row's metadata must contain (default {})
--   source_filter    exact source_id to restrict to, or NULL for all sources
--
-- Returns the matched rows plus `similarity` = 1 - cosine distance
-- (higher means more similar), best matches first.
create or replace function match_crawled_pages (
  query_embedding vector(1536),
  match_count int default 10,
  filter jsonb default '{}'::jsonb,
  source_filter text default null
) returns table (
  id bigint,
  url varchar,
  chunk_number integer,
  content text,
  metadata jsonb,
  source_id text,
  similarity float
)
language plpgsql
stable  -- read-only: lets the planner optimize repeated calls in a statement
as $$
-- Output parameter names shadow the table's column names; the pragma keeps
-- unqualified references resolving to columns, and all column references
-- below are additionally table-qualified so nothing depends on it.
#variable_conflict use_column
begin
  return query
  select
    crawled_pages.id,
    crawled_pages.url,
    crawled_pages.chunk_number,
    crawled_pages.content,
    crawled_pages.metadata,
    crawled_pages.source_id,
    1 - (crawled_pages.embedding <=> query_embedding) as similarity
  from crawled_pages
  where crawled_pages.metadata @> filter
    and (source_filter is null or crawled_pages.source_id = source_filter)
  order by crawled_pages.embedding <=> query_embedding
  limit match_count;
end;
$$;
-- Row level security on crawled_pages: lock the table down, then grant
-- read-only (SELECT) access back to every role.
alter table crawled_pages enable row level security;

create policy "Allow public read access to crawled_pages"
  on crawled_pages for select to public using (true);
-- Row level security on sources: lock the table down, then grant
-- read-only (SELECT) access back to every role.
alter table sources enable row level security;

create policy "Allow public read access to sources"
  on sources for select to public using (true);
-- Create the code_examples table: one row per extracted code example chunk.
create table code_examples (
    id bigserial primary key,
    url varchar not null,
    chunk_number integer not null,
    content text not null, -- The code example content
    summary text not null, -- Summary of the code example
    metadata jsonb not null default '{}'::jsonb,
    source_id text not null,
    embedding vector(1536),  -- OpenAI embeddings are 1536 dimensions
    -- now() already yields the correct instant for a timestamptz column;
    -- the previous timezone('utc'::text, now()) default shifted the stored
    -- instant in non-UTC sessions.
    created_at timestamp with time zone default now() not null,
    -- Named constraints are greppable in error messages and migrations.
    -- Prevent duplicate chunks for the same URL:
    constraint code_examples_url_chunk_number_key unique (url, chunk_number),
    -- NOTE(review): no ON DELETE action is specified, so deleting a source
    -- that still has examples fails (default NO ACTION) — confirm that is
    -- intended before adding CASCADE.
    constraint code_examples_source_id_fk
        foreign key (source_id) references sources(source_id)
);
-- Vector similarity index (cosine distance) for ANN search.
-- Named explicitly for consistency with the other indexes in this file.
-- NOTE(review): ivfflat builds its cluster lists from existing rows, so this
-- index should ideally be (re)built after the table is populated; `lists`
-- is left at the pgvector default.
create index idx_code_examples_embedding
    on code_examples using ivfflat (embedding vector_cosine_ops);
-- GIN index on metadata for fast jsonb containment (@>) filtering
create index idx_code_examples_metadata on code_examples using gin (metadata);
-- Index on source_id for fast per-source filtering
create index idx_code_examples_source_id on code_examples (source_id);
-- Search code_examples by cosine similarity to query_embedding, optionally
-- restricted to rows whose metadata contains `filter` (jsonb containment)
-- and/or to a single source.
--
-- Parameters:
--   query_embedding  1536-dim vector to match against
--   match_count      maximum number of rows to return (default 10)
--   filter           jsonb object the row's metadata must contain (default {})
--   source_filter    exact source_id to restrict to, or NULL for all sources
--
-- Returns the matched rows plus `similarity` = 1 - cosine distance
-- (higher means more similar), best matches first.
create or replace function match_code_examples (
  query_embedding vector(1536),
  match_count int default 10,
  filter jsonb default '{}'::jsonb,
  source_filter text default null
) returns table (
  id bigint,
  url varchar,
  chunk_number integer,
  content text,
  summary text,
  metadata jsonb,
  source_id text,
  similarity float
)
language plpgsql
stable  -- read-only: lets the planner optimize repeated calls in a statement
as $$
-- Output parameter names shadow the table's column names; the pragma keeps
-- unqualified references resolving to columns, and all column references
-- below are additionally table-qualified so nothing depends on it.
#variable_conflict use_column
begin
  return query
  select
    code_examples.id,
    code_examples.url,
    code_examples.chunk_number,
    code_examples.content,
    code_examples.summary,
    code_examples.metadata,
    code_examples.source_id,
    1 - (code_examples.embedding <=> query_embedding) as similarity
  from code_examples
  where code_examples.metadata @> filter
    and (source_filter is null or code_examples.source_id = source_filter)
  order by code_examples.embedding <=> query_embedding
  limit match_count;
end;
$$;
-- Row level security on code_examples: lock the table down, then grant
-- read-only (SELECT) access back to every role.
alter table code_examples enable row level security;

create policy "Allow public read access to code_examples"
  on code_examples for select to public using (true);