diff --git a/examples/store_and_query_with_elasticsearch_using_metadata.rb b/examples/store_and_query_with_elasticsearch_using_metadata.rb new file mode 100644 index 000000000..799982c3f --- /dev/null +++ b/examples/store_and_query_with_elasticsearch_using_metadata.rb @@ -0,0 +1,72 @@ +# frozen_string_literal: true + +require "langchain" +require "dotenv/load" +require "ruby/openai" + +# Example: index texts with metadata in Elasticsearch and run metadata-filtered similarity searches. It assumes you are running Elasticsearch in Docker: +# +# docker run --name es8 -d \ +# -p 9200:9200 -p 9300:9300 \ +# -e "discovery.type=single-node" \ +# -e "xpack.security.enabled=false" \ +# docker.elastic.co/elasticsearch/elasticsearch:8.12.2 +# +# The container exposes the REST API on http://localhost:9200 which +# the script connects to below. If you use a different host/port, set +# the ELASTICSEARCH_URL environment variable accordingly before running +# the script: +# ELASTICSEARCH_URL=http://localhost:9201 ruby examples/store_and_query_with_elasticsearch_using_metadata.rb + +# Instantiate the Elasticsearch vector store with an OpenAI LLM (API key read from OPENAI_API_KEY) +es = Langchain::Vectorsearch::Elasticsearch.new( + url: ENV.fetch("ELASTICSEARCH_URL", "http://localhost:9200"), + index_name: "documents", + llm: Langchain::LLM::OpenAI.new(api_key: ENV["OPENAI_API_KEY"]) +) + +# Create the index & mapping; errors (e.g. the index already exists) are rescued and printed as a warning +# You may need to delete an old index first if it was created without the metadata field. 
+begin + es.create_default_schema +rescue => e + warn "Index might already exist: #{e.message}" +end + +# Prepare documents: each entry pairs a text with the metadata hash stored alongside it +corpus = [ + { + text: "Vector search lets you retrieve semantically similar documents.", + metadata: {lang: "en", author: "alice", topic: "vector-search"} + }, + { + text: "Las bases de datos vectoriales permiten búsquedas semánticas.", + metadata: {lang: "es", author: "bob", topic: "vector-search"} + }, + { + text: "Ruby makes metaprogramming accessible and fun.", + metadata: {lang: "en", author: "carol", topic: "ruby"} + } +] + +puts "\nAdding documents with metadata …" + +es.add_texts( + texts: corpus.map { |d| d[:text] }, + metadatas: corpus.map { |d| d[:metadata] } +) + +sleep 1 # give ES a moment to index the new documents before searching + +puts "\nSimilarity search for 'vector' restricted to English docs:" +filter = {term: {"metadata.lang" => "en"}} +results = es.similarity_search(text: "vector", k: 2, filter: filter) +pp results + +puts "\nSimilarity search by embedding, Spanish docs only:" +embedding = es.llm.embed(text: "vector query").embedding +filter = {term: {"metadata.lang" => "es"}} +pp es.similarity_search_by_vector(embedding: embedding, k: 1, filter: filter) + +# Cleanup (optional): uncomment the next line to drop the schema created above +# es.delete_default_schema diff --git a/lib/langchain/vectorsearch/elasticsearch.rb b/lib/langchain/vectorsearch/elasticsearch.rb index a7111ba05..e6267b457 100644 --- a/lib/langchain/vectorsearch/elasticsearch.rb +++ b/lib/langchain/vectorsearch/elasticsearch.rb @@ -48,12 +48,25 @@ def initialize(url:, index_name:, llm:, api_key: nil, es_options: {}) # Add a list of texts to the index # @param texts [Array] The list of texts to add + # @param metadatas [Array] Optional list of metadata hashes (one per text) to store alongside each text. Must be the same length as texts when provided, otherwise ArgumentError is raised. 
# @return [Elasticsearch::Response] from the Elasticsearch server - def add_texts(texts: []) - body = texts.map do |text| + def add_texts(texts: [], metadatas: []) + metadatas = metadatas.is_a?(Hash) ? [metadatas] : Array(metadatas) # Array(hash) would split a lone Hash into [key, value] pairs + + if !metadatas.empty? && (metadatas.length != texts.length) + raise ArgumentError, "`metadatas` must be the same length as `texts` when provided" + end + + body = texts.map.with_index do |text, i| + document_body = { + input: text, + input_vector: llm.embed(text: text).embedding + } + document_body[:metadata] = metadatas[i] if metadatas[i] + [ {index: {_index: index_name}}, - {input: text, input_vector: llm.embed(text: text).embedding} + document_body ] end.flatten @@ -63,12 +76,25 @@ def add_texts(texts: []) - # Add a list of texts to the index + # Update a list of texts in the index # @param texts [Array] The list of texts to update - # @param texts [Array] The list of texts to update + # @param ids [Array] The list of document ids to update, in the same order as texts + # @param metadatas [Array] Optional list of metadata hashes to update alongside each text. Must be the same length as texts when provided (ArgumentError otherwise); a single Hash is treated as a one-element list. # @return [Elasticsearch::Response] from the Elasticsearch server - def update_texts(texts: [], ids: []) + def update_texts(texts: [], ids: [], metadatas: []) + metadatas = metadatas.is_a?(Hash) ? [metadatas] : Array(metadatas) # Array(hash) would split a lone Hash into [key, value] pairs + 
+ if !metadatas.empty? && (metadatas.length != texts.length) + raise ArgumentError, "`metadatas` must be the same length as `texts` when provided" + end + body = texts.map.with_index do |text, i| + document_body = { + input: text, + input_vector: llm.embed(text: text).embedding + } + document_body[:metadata] = metadatas[i] if metadatas[i] + [ {index: {_index: index_name, _id: ids[i]}}, - {input: text, input_vector: llm.embed(text: text).embedding} + document_body ] end.flatten @@ -118,7 +144,11 @@ def default_schema input: { type: "text" }, - input_vector: vector_settings + input_vector: vector_settings, + metadata: { + type: "object", + dynamic: true + } } } } @@ -163,34 +193,45 @@ def ask(question:, k: 4, &block) # @param text [String] The text to search for # @param k [Integer] The number of results to return # @param query [Hash] Elasticsearch query that needs to be used while searching (Optional) + # @param filter [Hash] Optional Elasticsearch filter clause; when non-empty, the query is wrapped as {bool: {must: query, filter: filter}} # @return [Elasticsearch::Response] The response from the server - def similarity_search(text: "", k: 10, query: {}) + def similarity_search(text: "", k: 10, query: {}, filter: {}) if text.empty? && query.empty? raise "Either text or query should pass as an argument" end + # Build the base similarity query from the text's embedding unless a custom query was given if query.empty? query_vector = llm.embed(text: text).embedding - query = default_query(query_vector) end - es_client.search(body: {query: query, size: k}).body + # Wrap the query in a bool must/filter clause when a filter is provided + final_query = if filter.empty? 
+ query + else + {bool: {must: query, filter: filter}} + end + + es_client.search(body: {query: final_query, size: k}).body end # Search for similar texts by embedding # @param embedding [Array] The embedding to search for # @param k [Integer] The number of results to return # @param query [Hash] Elasticsearch query that needs to be used while searching (Optional) + # @param filter [Hash] Optional Elasticsearch filter clause; when non-empty, the query is wrapped as {bool: {must: query, filter: filter}} # @return [Elasticsearch::Response] The response from the server - def similarity_search_by_vector(embedding: [], k: 10, query: {}) + def similarity_search_by_vector(embedding: [], k: 10, query: {}, filter: {}) if embedding.empty? && query.empty? raise "Either embedding or query should pass as an argument" end query = default_query(embedding) if query.empty? - es_client.search(body: {query: query, size: k}).body + final_query = filter.empty? ? query : {bool: {must: query, filter: filter}} + + es_client.search(body: {query: final_query, size: k}).body end end end diff --git a/spec/lib/langchain/vectorsearch/elasticsearch_spec.rb b/spec/lib/langchain/vectorsearch/elasticsearch_spec.rb index e1d5b1d99..6ba3ba42d 100644 --- a/spec/lib/langchain/vectorsearch/elasticsearch_spec.rb +++ b/spec/lib/langchain/vectorsearch/elasticsearch_spec.rb @@ -17,16 +17,23 @@ end describe "#add_texts" do - it "indexes data into elasticsearch" do + it "indexes data into elasticsearch with metadata" do + metadata = {lang: "en"} es_body = [ {index: {_index: "langchain"}}, - {input: "simple text", input_vector: [0.1, 0.2, 0.3]} + {input: "simple text", input_vector: [0.1, 0.2, 0.3], metadata: metadata} ] allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body) expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once - subject.add_texts(texts: ["simple text"]) + subject.add_texts(texts: ["simple text"], metadatas: [metadata]) + end + 
+ it "raises error when metadatas length mismatch" do + expect { + subject.add_texts(texts: ["t1", "t2"], metadatas: [{foo: 1}]) + }.to raise_error(ArgumentError) end end @@ -38,16 +45,17 @@ .and_return([0.1, 0.2, 0.3, 0.4]) end - it "updates respective document" do + it "updates respective document with metadata" do + metadata = {version: 2} es_body = [ {index: {_index: "langchain", _id: 1}}, - {input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4]} + {input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4], metadata: metadata} ] allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body) expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once - subject.update_texts(texts: ["updated text"], ids: [1]) + subject.update_texts(texts: ["updated text"], ids: [1], metadatas: [metadata]) end end @@ -100,7 +108,8 @@ input: { type: "text" }, - input_vector: {type: "dense_vector", dims: 384} + input_vector: {type: "dense_vector", dims: 384}, + metadata: {type: "object", dynamic: true} } } } @@ -117,7 +126,8 @@ input: { type: "text" }, - input_vector: {type: "dense_vector", dims: 500} + input_vector: {type: "dense_vector", dims: 500}, + metadata: {type: "object", dynamic: true} } } } @@ -145,7 +155,8 @@ end describe "#similarity_search" do - it "should return similar documents" do + it "should return similar documents with metadata filter" do + filter = {term: {"metadata.lang": "en"}} response = [ {_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]}, {_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]} @@ -154,13 +165,13 @@ allow(es_response).to receive(:body).and_return(response) allow_any_instance_of(::Elasticsearch::Client) - .to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5}).and_return(es_response) + .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5}).and_return(es_response) expect_any_instance_of(::Elasticsearch::Client) 
- .to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5}) + .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5}) expect(es_response).to receive(:body) - expect(subject.similarity_search(text: "simple", k: 5)).to eq(response) + expect(subject.similarity_search(text: "simple", k: 5, filter: filter)).to eq(response) end it "able to search with custom query" do @@ -197,7 +208,8 @@ end describe "#similarity_search_by_vector" do - it "should return similar documents" do + it "should return similar documents with metadata filter" do + filter = {term: {"metadata.lang": "en"}} response = [ {_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]}, {_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]} @@ -206,13 +218,13 @@ allow(es_response).to receive(:body).and_return(response) allow_any_instance_of(::Elasticsearch::Client) - .to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5}).and_return(es_response) + .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5}).and_return(es_response) expect_any_instance_of(::Elasticsearch::Client) - .to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5}) + .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5}) expect(es_response).to receive(:body) - expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5)).to eq(response) + expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5, filter: filter)).to eq(response) end it "able to search with custom query" do