Add metadata support for documents (#859)

aellispierce · andreibondarev · web-flow · commit 9353b54ecc0d · 2025-04-10T13:50:50.000-04:00
* Add metadata support for documents

Add metadata column to pgvector storage to allow associating additional
information with stored documents. This enables tracking metadata like
source documents, page numbers, or any other document-specific information.

- Add JSONB metadata column to schema
- Update add_texts to support metadata parameter
- Update update_texts to support metadata parameter
- Add metadata support to similarity search results
- Add tests for metadata functionality

BREAKING CHANGE: Schema update requires existing tables to be recreated
or manually altered to add the metadata column, e.g.
`ALTER TABLE <table> ADD COLUMN metadata jsonb DEFAULT '{}';`

* Update CHANGELOG

---------

Co-authored-by: Andrei Bondarev <andrei@sourcelabs.io>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - [SECURITY]: A change which fixes a security vulnerability.
 
 ## [Unreleased]
+- [BREAKING] [https://github.com/patterns-ai-core/langchainrb/pull/859] Add metadata support to PgVector storage
 - [BUGFIX] [https://github.com/patterns-ai-core/langchainrb/pull/939] Fix Langchain::Vectorsearch::Milvus initializer by passing :api_key
 
 ## [0.19.4] - 2025-02-17
diff --git a/lib/langchain/vectorsearch/pgvector.rb b/lib/langchain/vectorsearch/pgvector.rb
@@ -51,43 +51,65 @@ def documents_model
     # Upsert a list of texts to the index
     # @param texts [Array<String>] The texts to add to the index
     # @param ids [Array<Integer>] The ids of the objects to add to the index, in the same order as the texts
+    # @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
     # @return [PG::Result] The response from the database including the ids of
     # the added or updated texts.
-    def upsert_texts(texts:, ids:)
-      data = texts.zip(ids).flat_map do |(text, id)|
-        {id: id, content: text, vectors: llm.embed(text: text).embedding.to_s, namespace: namespace}
+    def upsert_texts(texts:, ids:, metadata: nil)
+      metadata = Array.new(texts.size, {}) if metadata.nil?
+
+      data = texts.zip(ids, metadata).flat_map do |text, id, meta|
+        {
+          id: id,
+          content: text,
+          vectors: llm.embed(text: text).embedding.to_s,
+          namespace: namespace,
+          metadata: meta.to_json
+        }
       end
       # @db[table_name.to_sym].multi_insert(data, return: :primary_key)
       @db[table_name.to_sym]
         .insert_conflict(
           target: :id,
-          update: {content: Sequel[:excluded][:content], vectors: Sequel[:excluded][:vectors]}
+          update: {
+            content: Sequel[:excluded][:content],
+            vectors: Sequel[:excluded][:vectors],
+            metadata: Sequel[:excluded][:metadata]
+          }
         )
         .multi_insert(data, return: :primary_key)
     end
 
     # Add a list of texts to the index
     # @param texts [Array<String>] The texts to add to the index
     # @param ids [Array<String>] The ids to add to the index, in the same order as the texts
+    # @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
     # @return [Array<Integer>] The the ids of the added texts.
-    def add_texts(texts:, ids: nil)
+    def add_texts(texts:, ids: nil, metadata: nil)
+      metadata = Array.new(texts.size, {}) if metadata.nil?
+
       if ids.nil? || ids.empty?
-        data = texts.map do |text|
-          {content: text, vectors: llm.embed(text: text).embedding.to_s, namespace: namespace}
+        data = texts.zip(metadata).map do |text, meta|
+          {
+            content: text,
+            vectors: llm.embed(text: text).embedding.to_s,
+            namespace: namespace,
+            metadata: meta.to_json
+          }
         end
 
         @db[table_name.to_sym].multi_insert(data, return: :primary_key)
       else
-        upsert_texts(texts: texts, ids: ids)
+        upsert_texts(texts: texts, ids: ids, metadata: metadata)
       end
     end
 
     # Update a list of ids and corresponding texts to the index
     # @param texts [Array<String>] The texts to add to the index
     # @param ids [Array<String>] The ids to add to the index, in the same order as the texts
+    # @param metadata [Array<Hash>] The metadata to associate with each text, in the same order as the texts
     # @return [Array<Integer>] The ids of the updated texts.
-    def update_texts(texts:, ids:)
-      upsert_texts(texts: texts, ids: ids)
+    def update_texts(texts:, ids:, metadata: nil)
+      upsert_texts(texts: texts, ids: ids, metadata: metadata)
     end
 
     # Remove a list of texts from the index
@@ -107,6 +129,7 @@ def create_default_schema
         text :content
         column :vectors, "vector(#{vector_dimensions})"
         text namespace_column.to_sym, default: nil
+        jsonb :metadata, default: "{}"
       end
     end
 
@@ -136,6 +159,7 @@ def similarity_search(query:, k: 4)
     def similarity_search_by_vector(embedding:, k: 4)
       db.transaction do # BEGIN
         documents_model
+          .select(:content, :metadata)
           .nearest_neighbors(:vectors, embedding, distance: operator).limit(k)
           .where(namespace_column.to_sym => namespace)
       end
diff --git a/spec/langchain/vectorsearch/pgvector_spec.rb b/spec/langchain/vectorsearch/pgvector_spec.rb
@@ -45,6 +45,23 @@
         result = subject.add_texts(texts: ["Hello World", "Hello World"])
         expect(result.size).to eq(2)
       end
+
+      it "adds texts with metadata" do
+        metadata = [
+          {"source" => "doc1", "page" => 1},
+          {"source" => "doc2", "page" => 2}
+        ]
+        result = subject.add_texts(
+          texts: ["Hello World", "Hello World"],
+          metadata: metadata
+        )
+
+        expect(result.size).to eq(2)
+
+        stored_records = client.exec_params("SELECT metadata FROM products WHERE id IN ($1, $2)", [result[0], result[1]])
+        expect(JSON.parse(stored_records[0]["metadata"])).to match(metadata[0])
+        expect(JSON.parse(stored_records[1]["metadata"])).to match(metadata[1])
+      end
     end
 
     describe "#update_texts" do
@@ -96,6 +113,36 @@
         count = client.exec_params(count_query)
         expect(count[0]["count"].to_i).to eq(2)
       end
+
+      it "updates texts and metadata" do
+        initial_metadata = [
+          {"source" => "doc1", "page" => 1},
+          {"source" => "doc2", "page" => 2}
+        ]
+
+        values = subject.add_texts(
+          texts: ["Hello World", "Hello World"],
+          metadata: initial_metadata
+        )
+
+        updated_metadata = [
+          {"source" => "doc1", "page" => 1, "updated" => true},
+          {"source" => "doc2", "page" => 3}
+        ]
+
+        ids = values.flatten
+        result = subject.update_texts(
+          texts: ["Hello World", "Hello World".reverse],
+          ids: ids,
+          metadata: updated_metadata
+        )
+
+        expect(result.size).to eq(2)
+
+        stored_records = client.exec_params("SELECT metadata FROM products WHERE id IN ($1, $2)", [ids[0], ids[1]])
+        expect(JSON.parse(stored_records[0]["metadata"])).to match(updated_metadata[0])
+        expect(JSON.parse(stored_records[1]["metadata"])).to match(updated_metadata[1])
+      end
     end
 
     describe "#remove_texts" do
@@ -170,6 +217,22 @@
         result = subject.similarity_search(query: "earth")
         expect(result.first.content).to eq("a namespaced chunk of text")
       end
+
+      it "searches for similar texts with metadata and namespace" do
+        namespace = "foo_namespace"
+
+        subject.documents_model.new(
+          content: "a namespaced chunk of text",
+          vectors: 1536.times.map { 0 },
+          namespace: namespace,
+          metadata: {source: "earth_doc", page: 1}.to_json
+        ).save
+
+        allow(subject).to receive(:namespace).and_return(namespace)
+        result = subject.similarity_search(query: "earth")
+        expect(result.first.content).to eq("a namespaced chunk of text")
+        expect(JSON.parse(result.first.metadata)).to match({"source" => "earth_doc", "page" => 1})
+      end
     end
 
     describe "#similarity_search_by_vector" do