-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample.cr
More file actions
68 lines (61 loc) · 2.18 KB
/
example.cr
File metadata and controls
68 lines (61 loc) · 2.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
require "db"
require "http/client"
require "json"
require "pg"
# Connect to the example database and rebuild the schema from scratch on every
# run: enable the vector extension, recreate the documents table, and add a GIN
# full-text index over the English tsvector of the content column.
db = DB.open("postgres://localhost/pgvector_example")

schema_statements = [
  "CREATE EXTENSION IF NOT EXISTS vector",
  "DROP TABLE IF EXISTS documents",
  "CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(768))",
  "CREATE INDEX ON documents USING GIN (to_tsvector('english', content))",
]

# Order matters: the extension must exist before the vector column is created,
# and the table must exist before it is indexed.
schema_statements.each do |statement|
  db.exec statement
end
# Requests embeddings for a batch of texts from the local embedding endpoint
# (`/api/embed` on port 11434, the Ollama default) using the
# "nomic-embed-text" model.
#
# input     - collection of strings to embed.
# task_type - task prefix prepended to every string as "<task_type>: <text>"
#             (this file uses "search_document" and "search_query").
#
# Returns the "embeddings" array of the JSON response, one entry per input,
# each converted to an array of JSON values.
#
# Raises if the server replies with a non-2xx status, so a failed request is
# reported with its status and body instead of surfacing later as an unrelated
# JSON/key error.
def embed(input, task_type)
  # nomic-embed-text uses a task prefix
  # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5
  prefixed = input.map { |text| "#{task_type}: #{text}" }

  url = "http://localhost:11434/api/embed"
  payload = {
    "input" => prefixed,
    "model" => "nomic-embed-text",
  }
  headers = HTTP::Headers.new
  headers["Content-Type"] = "application/json"

  response = HTTP::Client.post(url, headers: headers, body: payload.to_json)
  unless response.success?
    raise "Embedding request failed (HTTP #{response.status_code}): #{response.body}"
  end

  JSON.parse(response.body)["embeddings"].as_a.map { |vector| vector.as_a }
end
# Sample corpus: embed every document in a single request, then store each
# text next to its embedding (sent to PostgreSQL as a JSON array literal).
documents = [
  "The dog is barking",
  "The cat is purring",
  "The bear is growling",
]
embeddings = embed(documents, "search_document")

insert_sql = "INSERT INTO documents (content, embedding) VALUES ($1, $2)"
documents.each_with_index do |text, position|
  # Embeddings are paired with documents by position.
  db.exec insert_sql, text, embeddings[position].to_json
end
# Hybrid search query combining two rankings with Reciprocal Rank Fusion (RRF).
#
# Parameters:
#   $1 - the plain-text query, parsed with plainto_tsquery('english', ...)
#   $2 - the query embedding, compared with the pgvector `<=>` distance operator
#   $3 - the RRF constant added to each rank in the denominator
#
# semantic_search ranks the 20 nearest documents by embedding distance;
# keyword_search ranks up to 20 full-text matches by ts_rank_cd. The outer query
# FULL OUTER JOINs both on id, so a document found by only one search still
# scores (the missing side contributes 0.0), and returns the top 5 by score.
#
# Both CTEs alias their window column explicitly as `rank`, because the outer
# query reads `semantic_search.rank` and `keyword_search.rank`; this avoids
# depending on PostgreSQL's implicit output-column naming.
sql = <<-SQL
WITH semantic_search AS (
SELECT id, RANK () OVER (ORDER BY embedding <=> $2) AS rank
FROM documents
ORDER BY embedding <=> $2
LIMIT 20
),
keyword_search AS (
SELECT id, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC) AS rank
FROM documents, plainto_tsquery('english', $1) query
WHERE to_tsvector('english', content) @@ query
ORDER BY ts_rank_cd(to_tsvector('english', content), query) DESC
LIMIT 20
)
SELECT
COALESCE(semantic_search.id, keyword_search.id) AS id,
COALESCE(1.0 / ($3::double precision + semantic_search.rank), 0.0) +
COALESCE(1.0 / ($3::double precision + keyword_search.rank), 0.0) AS score
FROM semantic_search
FULL OUTER JOIN keyword_search ON semantic_search.id = keyword_search.id
ORDER BY score DESC
LIMIT 5
SQL
# Run the hybrid search for a sample query and print each hit with its score.
query = "growling bear"
embedding = embed([query], "search_query")[0]
# Bound to $3 in the SQL, where it is added to each rank in the score denominator.
k = 60

db.query(sql, query, embedding.to_json, k) do |result_set|
  result_set.each do
    # Columns are read in SELECT order: id first, then score.
    id = result_set.read(Int64)
    score = result_set.read(Float64)
    puts "document: #{id}, RRF score: #{score}"
  end
end

# Release the database connection pool once the results have been printed.
db.close