Skip to content

Commit 9353b54

Browse files
Add metadata support for documents (#859)
* Add metadata support for documents

  Add metadata column to pgvector storage to allow associating additional information with stored documents. This enables tracking metadata like source documents, page numbers, or any other document-specific information.

  - Add JSONB metadata column to schema
  - Update add_texts to support metadata parameter
  - Update update_texts to support metadata parameter
  - Add metadata support to similarity search results
  - Add tests for metadata functionality

  BREAKING CHANGE: Schema update requires existing tables to be recreated or manually altered to add metadata column

* Update CHANGELOG

---------

Co-authored-by: Andrei Bondarev <[email protected]>
1 parent 51df3ec commit 9353b54

File tree

3 files changed

+98
-10
lines changed

3 files changed

+98
-10
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
- [SECURITY]: A change which fixes a security vulnerability.
1111

1212
## [Unreleased]
13+
- [BREAKING] [https://github.com/patterns-ai-core/langchainrb/pull/859] Add metadata support to PgVector storage
1314
- [BUGFIX] [https://github.com/patterns-ai-core/langchainrb/pull/939] Fix Langchain::Vectorsearch::Milvus initializer by passing :api_key
1415

1516
## [0.19.4] - 2025-02-17

lib/langchain/vectorsearch/pgvector.rb

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,43 +51,65 @@ def documents_model
5151
# Upsert a list of texts to the index
5252
# @param texts [Array<String>] The texts to add to the index
5353
# @param ids [Array<Integer>] The ids of the objects to add to the index, in the same order as the texts
54+
# @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
5455
# @return [PG::Result] The response from the database including the ids of
5556
# the added or updated texts.
56-
def upsert_texts(texts:, ids:)
57-
data = texts.zip(ids).flat_map do |(text, id)|
58-
{id: id, content: text, vectors: llm.embed(text: text).embedding.to_s, namespace: namespace}
57+
def upsert_texts(texts:, ids:, metadata: nil)
58+
metadata = Array.new(texts.size, {}) if metadata.nil?
59+
60+
data = texts.zip(ids, metadata).flat_map do |text, id, meta|
61+
{
62+
id: id,
63+
content: text,
64+
vectors: llm.embed(text: text).embedding.to_s,
65+
namespace: namespace,
66+
metadata: meta.to_json
67+
}
5968
end
6069
# @db[table_name.to_sym].multi_insert(data, return: :primary_key)
6170
@db[table_name.to_sym]
6271
.insert_conflict(
6372
target: :id,
64-
update: {content: Sequel[:excluded][:content], vectors: Sequel[:excluded][:vectors]}
73+
update: {
74+
content: Sequel[:excluded][:content],
75+
vectors: Sequel[:excluded][:vectors],
76+
metadata: Sequel[:excluded][:metadata]
77+
}
6578
)
6679
.multi_insert(data, return: :primary_key)
6780
end
6881

6982
# Add a list of texts to the index
7083
# @param texts [Array<String>] The texts to add to the index
7184
# @param ids [Array<String>] The ids to add to the index, in the same order as the texts
85+
# @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
7286
# @return [Array<Integer>] The ids of the added texts.
73-
def add_texts(texts:, ids: nil)
87+
def add_texts(texts:, ids: nil, metadata: nil)
88+
metadata = Array.new(texts.size, {}) if metadata.nil?
89+
7490
if ids.nil? || ids.empty?
75-
data = texts.map do |text|
76-
{content: text, vectors: llm.embed(text: text).embedding.to_s, namespace: namespace}
91+
data = texts.zip(metadata).map do |text, meta|
92+
{
93+
content: text,
94+
vectors: llm.embed(text: text).embedding.to_s,
95+
namespace: namespace,
96+
metadata: meta.to_json
97+
}
7798
end
7899

79100
@db[table_name.to_sym].multi_insert(data, return: :primary_key)
80101
else
81-
upsert_texts(texts: texts, ids: ids)
102+
upsert_texts(texts: texts, ids: ids, metadata: metadata)
82103
end
83104
end
84105

85106
# Update a list of ids and corresponding texts to the index
86107
# @param texts [Array<String>] The texts to add to the index
87108
# @param ids [Array<String>] The ids to add to the index, in the same order as the texts
109+
# @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
88110
# @return [Array<Integer>] The ids of the updated texts.
89-
def update_texts(texts:, ids:)
90-
upsert_texts(texts: texts, ids: ids)
111+
def update_texts(texts:, ids:, metadata: nil)
112+
upsert_texts(texts: texts, ids: ids, metadata: metadata)
91113
end
92114

93115
# Remove a list of texts from the index
@@ -107,6 +129,7 @@ def create_default_schema
107129
text :content
108130
column :vectors, "vector(#{vector_dimensions})"
109131
text namespace_column.to_sym, default: nil
132+
jsonb :metadata, default: "{}"
110133
end
111134
end
112135

@@ -136,6 +159,7 @@ def similarity_search(query:, k: 4)
136159
def similarity_search_by_vector(embedding:, k: 4)
137160
db.transaction do # BEGIN
138161
documents_model
162+
.select(:content, :metadata)
139163
.nearest_neighbors(:vectors, embedding, distance: operator).limit(k)
140164
.where(namespace_column.to_sym => namespace)
141165
end

spec/langchain/vectorsearch/pgvector_spec.rb

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,23 @@
4545
result = subject.add_texts(texts: ["Hello World", "Hello World"])
4646
expect(result.size).to eq(2)
4747
end
48+
49+
it "adds texts with metadata" do
50+
metadata = [
51+
{"source" => "doc1", "page" => 1},
52+
{"source" => "doc2", "page" => 2}
53+
]
54+
result = subject.add_texts(
55+
texts: ["Hello World", "Hello World"],
56+
metadata: metadata
57+
)
58+
59+
expect(result.size).to eq(2)
60+
61+
stored_records = client.exec_params("SELECT metadata FROM products WHERE id IN ($1, $2)", [result[0], result[1]])
62+
expect(JSON.parse(stored_records[0]["metadata"])).to match(metadata[0])
63+
expect(JSON.parse(stored_records[1]["metadata"])).to match(metadata[1])
64+
end
4865
end
4966

5067
describe "#update_texts" do
@@ -96,6 +113,36 @@
96113
count = client.exec_params(count_query)
97114
expect(count[0]["count"].to_i).to eq(2)
98115
end
116+
117+
it "updates texts and metadata" do
118+
initial_metadata = [
119+
{"source" => "doc1", "page" => 1},
120+
{"source" => "doc2", "page" => 2}
121+
]
122+
123+
values = subject.add_texts(
124+
texts: ["Hello World", "Hello World"],
125+
metadata: initial_metadata
126+
)
127+
128+
updated_metadata = [
129+
{"source" => "doc1", "page" => 1, "updated" => true},
130+
{"source" => "doc2", "page" => 3}
131+
]
132+
133+
ids = values.flatten
134+
result = subject.update_texts(
135+
texts: ["Hello World", "Hello World".reverse],
136+
ids: ids,
137+
metadata: updated_metadata
138+
)
139+
140+
expect(result.size).to eq(2)
141+
142+
stored_records = client.exec_params("SELECT metadata FROM products WHERE id IN ($1, $2)", [ids[0], ids[1]])
143+
expect(JSON.parse(stored_records[0]["metadata"])).to match(updated_metadata[0])
144+
expect(JSON.parse(stored_records[1]["metadata"])).to match(updated_metadata[1])
145+
end
99146
end
100147

101148
describe "#remove_texts" do
@@ -170,6 +217,22 @@
170217
result = subject.similarity_search(query: "earth")
171218
expect(result.first.content).to eq("a namespaced chunk of text")
172219
end
220+
221+
it "searches for similar texts with metadata and namespace" do
222+
namespace = "foo_namespace"
223+
224+
subject.documents_model.new(
225+
content: "a namespaced chunk of text",
226+
vectors: 1536.times.map { 0 },
227+
namespace: namespace,
228+
metadata: {source: "earth_doc", page: 1}.to_json
229+
).save
230+
231+
allow(subject).to receive(:namespace).and_return(namespace)
232+
result = subject.similarity_search(query: "earth")
233+
expect(result.first.content).to eq("a namespaced chunk of text")
234+
expect(JSON.parse(result.first.metadata)).to match({"source" => "earth_doc", "page" => 1})
235+
end
173236
end
174237

175238
describe "#similarity_search_by_vector" do

0 commit comments

Comments
 (0)