import { Client, ClientOptions } from "@elastic/elasticsearch";
import { OpenAIEmbeddings } from "@langchain/openai";
import {
  ElasticClientArgs,
  ElasticVectorSearch,
  HybridRetrievalStrategy,
} from "@langchain/community/vectorstores/elasticsearch";
import { Document } from "@langchain/core/documents";

/** A search hit paired with its relevance score. */
type ScoredResult = readonly [
  Pick<Document, "pageContent" | "metadata">,
  number
];

/**
 * Prints each hit as a numbered line with its score, followed by its metadata.
 *
 * @param results Hits in the order they should be displayed.
 */
function printResults(results: readonly ScoredResult[]): void {
  results.forEach(([doc, score], i) => {
    console.log(`${i + 1}. [Score: ${score.toFixed(4)}] ${doc.pageContent}`);
    console.log(` Metadata: ${JSON.stringify(doc.metadata)}\n`);
  });
}

/**
 * Demonstrates hybrid search with Elasticsearch, combining:
 * - Vector (semantic) search using embeddings
 * - BM25 (lexical) full-text search
 * - Reciprocal Rank Fusion (RRF) for result merging
 *
 * Requirements:
 * - Elasticsearch 8.9+ (for RRF support)
 *   NOTE(review): the store issues the query through the `retriever` search
 *   API, which may need a newer release than 8.9 - confirm the minimum version.
 * - Run: docker-compose up -d --build (in elasticsearch directory)
 * - Set ELASTIC_URL, ELASTIC_API_KEY (or ELASTIC_USERNAME/ELASTIC_PASSWORD)
 *
 * The demo index is removed before the run and again in a `finally` block, so
 * a failed embedding or search call does not leave the index behind.
 */
export async function run(): Promise<void> {
  const config: ClientOptions = {
    node: process.env.ELASTIC_URL ?? "http://127.0.0.1:9200",
  };
  // An API key takes precedence over basic auth; with neither, connect anonymously.
  if (process.env.ELASTIC_API_KEY) {
    config.auth = {
      apiKey: process.env.ELASTIC_API_KEY,
    };
  } else if (process.env.ELASTIC_USERNAME && process.env.ELASTIC_PASSWORD) {
    config.auth = {
      username: process.env.ELASTIC_USERNAME,
      password: process.env.ELASTIC_PASSWORD,
    };
  }

  const embeddings = new OpenAIEmbeddings();

  const clientArgs: ElasticClientArgs = {
    client: new Client(config),
    indexName: process.env.ELASTIC_INDEX ?? "test_hybrid_search",
    strategy: new HybridRetrievalStrategy({
      rankWindowSize: 100,
      rankConstant: 60,
      textField: "text",
    }),
  };

  const vectorStore = new ElasticVectorSearch(embeddings, clientArgs);

  // Start from a clean index so earlier runs cannot influence the rankings.
  await vectorStore.deleteIfExists();

  // Add sample documents
  const docs = [
    new Document({
      pageContent:
        "Running helps build cardiovascular endurance and strengthens leg muscles.",
      metadata: { category: "fitness", topic: "running" },
    }),
    new Document({
      pageContent:
        "Marathon training requires consistent mileage and proper recovery.",
      metadata: { category: "fitness", topic: "running" },
    }),
    new Document({
      pageContent:
        "Muscle soreness after exercise is caused by microscopic damage to muscle fibers.",
      metadata: { category: "health", topic: "recovery" },
    }),
    new Document({
      pageContent:
        "Stretching and foam rolling can help prevent post-workout muscle pain.",
      metadata: { category: "health", topic: "recovery" },
    }),
    new Document({
      pageContent:
        "Python is a popular programming language for data science and machine learning.",
      metadata: { category: "technology", topic: "programming" },
    }),
  ];

  try {
    console.log("Adding documents to Elasticsearch...");
    await vectorStore.addDocuments(docs);
    console.log("Documents added successfully!\n");

    // Example 1: Hybrid search combines semantic + keyword matching
    console.log("=== Example 1: Hybrid Search ===");
    const query1 = "How to avoid muscle soreness while running?";
    console.log(`Query: "${query1}"\n`);
    printResults(await vectorStore.similaritySearchWithScore(query1, 3));

    // Example 2: Semantic search works well for conceptual queries
    console.log("\n=== Example 2: Semantic Query ===");
    const query2 = "tips for preventing pain after workouts";
    console.log(`Query: "${query2}"\n`);
    printResults(await vectorStore.similaritySearchWithScore(query2, 2));

    // Example 3: With metadata filters
    console.log("\n=== Example 3: Hybrid Search with Filters ===");
    const query3 = "fitness advice";
    console.log(`Query: "${query3}"`);
    console.log(`Filter: category = "fitness"\n`);
    printResults(
      await vectorStore.similaritySearchWithScore(query3, 3, {
        category: "fitness",
      })
    );
  } finally {
    // Clean up
    console.log("\n=== Cleanup ===");
    await vectorStore.deleteIfExists();
    console.log("Index deleted.");
  }
}
/**
 * Configuration options for hybrid retrieval strategy.
 *
 * Every field is optional; omitted fields fall back to the defaults applied by
 * the `HybridRetrievalStrategy` constructor.
 */
export interface HybridRetrievalStrategyConfig {
  /** Sent as the RRF `rank_window_size`. Positive integer; defaults to 100. */
  rankWindowSize?: number;
  /** Sent as the RRF `rank_constant`. Positive integer; defaults to 60. */
  rankConstant?: number;
  /** Document field targeted by the full-text `match` query. Defaults to "text". */
  textField?: string;
}

/**
 * Hybrid search strategy combining vector and BM25 search using RRF.
 *
 * Instances are immutable value objects. Invalid settings are rejected at
 * construction time rather than surfacing later as a search request error.
 */
export class HybridRetrievalStrategy {
  public readonly rankWindowSize: number;

  public readonly rankConstant: number;

  public readonly textField: string;

  /**
   * @param config Optional overrides for the RRF parameters and text field.
   * @throws RangeError if `rankWindowSize` or `rankConstant` is not an integer >= 1.
   * @throws Error if `textField` is an empty string.
   */
  constructor(config: HybridRetrievalStrategyConfig = {}) {
    const rankWindowSize = config.rankWindowSize ?? 100;
    const rankConstant = config.rankConstant ?? 60;
    const textField = config.textField ?? "text";

    // Number.isInteger also rejects NaN and +/-Infinity.
    if (!Number.isInteger(rankWindowSize) || rankWindowSize < 1) {
      throw new RangeError(
        `rankWindowSize must be an integer >= 1, received ${rankWindowSize}`
      );
    }
    if (!Number.isInteger(rankConstant) || rankConstant < 1) {
      throw new RangeError(
        `rankConstant must be an integer >= 1, received ${rankConstant}`
      );
    }
    if (textField.length === 0) {
      throw new Error("textField must be a non-empty field name");
    }

    this.rankWindowSize = rankWindowSize;
    this.rankConstant = rankConstant;
    this.textField = textField;
  }
}
/**
 * Text-query similarity search that returns documents only.
 *
 * Without a hybrid strategy (or with an empty query, which has no terms for
 * the lexical half) this defers to the base-class implementation. With a
 * strategy it delegates to `similaritySearchWithScore` and drops the scores.
 *
 * The query text is passed down as an argument instead of being parked on the
 * instance, so concurrent searches on one store cannot see each other's text
 * and a later raw-vector search cannot pick up a stale query.
 *
 * @param query Natural-language query text.
 * @param k Maximum number of documents to return. Defaults to 4.
 * @param filter Optional metadata filter.
 * @param _callbacks Forwarded to the base class on the non-hybrid path.
 * @returns Up to `k` matching documents.
 */
async similaritySearch(
  query: string,
  k = 4,
  filter?: ElasticFilter,
  _callbacks?: Callbacks
): Promise<Document[]> {
  if (!this.strategy || query.length === 0) {
    return super.similaritySearch(query, k, filter, _callbacks);
  }

  const scored = await this.similaritySearchWithScore(
    query,
    k,
    filter,
    _callbacks
  );
  return scored.map(([doc]) => doc);
}

/**
 * Text-query similarity search that returns documents with their scores.
 *
 * With a hybrid strategy configured, the query is embedded once and both the
 * text and the vector are handed to the hybrid search. Otherwise the call is
 * passed unchanged to the base class.
 * NOTE(review): relies on the base `VectorStore` exposing `this.embeddings` -
 * confirm against the core package.
 *
 * @param query Natural-language query text.
 * @param k Maximum number of results to return. Defaults to 4.
 * @param filter Optional metadata filter.
 * @param _callbacks Forwarded to the base class on the non-hybrid path.
 * @returns Up to `k` `[document, score]` pairs.
 */
async similaritySearchWithScore(
  query: string,
  k = 4,
  filter?: ElasticFilter,
  _callbacks?: Callbacks
): Promise<[Document, number][]> {
  if (!this.strategy || query.length === 0) {
    return super.similaritySearchWithScore(query, k, filter, _callbacks);
  }

  const queryVector = await this.embeddings.embedQuery(query);
  return this.hybridSearchVectorWithScore(query, queryVector, k, filter);
}
/**
 * Runs a hybrid search: a BM25 `match` query on the strategy's text field and
 * a kNN query on the `embedding` field, merged with Reciprocal Rank Fusion.
 *
 * The metadata filter is attached to each sub-retriever. A search request
 * that uses `retriever` may not also carry a top-level `query`, so that is
 * not a valid place for the filter.
 *
 * @param queryText Text for the lexical (BM25) retriever.
 * @param queryVector Embedding of the query for the kNN retriever.
 * @param k Number of results to return.
 * @param filter Optional metadata filter applied to both retrievers.
 * @returns Up to `k` `[document, score]` pairs.
 * @throws Error if the store was created without a hybrid strategy.
 */
private async hybridSearchVectorWithScore(
  queryText: string,
  queryVector: number[],
  k: number,
  filter?: ElasticFilter
): Promise<[Document, number][]> {
  const strategy = this.strategy;
  if (!strategy) {
    throw new Error(
      "hybridSearchVectorWithScore requires a HybridRetrievalStrategy"
    );
  }

  const metadataTerms = this.buildMetadataTerms(filter);
  // NOTE(review): only `must` / `must_not` are inspected here, as before.
  // Confirm whether buildMetadataTerms can return other clauses.
  const hasFilter =
    metadataTerms.must.length > 0 || metadataTerms.must_not.length > 0;
  const filterClause = hasFilter ? { filter: { bool: metadataTerms } } : {};

  const result = await this.client.search({
    index: this.indexName,
    size: k,
    retriever: {
      rrf: {
        retrievers: [
          {
            standard: {
              query: {
                match: {
                  [strategy.textField]: queryText,
                },
              },
              ...filterClause,
            },
          },
          {
            knn: {
              field: "embedding",
              query_vector: queryVector,
              k,
              num_candidates: this.candidates,
              ...filterClause,
            },
          },
        ],
        // The rank window must cover at least the requested page size.
        rank_window_size: Math.max(strategy.rankWindowSize, k),
        rank_constant: strategy.rankConstant,
      },
    },
  });

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  return result.hits.hits.map((hit: any) => [
    new Document({
      pageContent: hit._source.text,
      metadata: hit._source.metadata,
    }),
    hit._score,
  ]);
}
/**
 * Builds the Elasticsearch client shared by the integration suites from the
 * ELASTIC_* environment variables. An API key takes precedence over
 * username/password; with neither set the client connects without auth.
 *
 * @throws Error if ELASTIC_URL is not set.
 */
function createTestClient(): Client {
  const node = process.env.ELASTIC_URL;
  if (!node) {
    throw new Error("ELASTIC_URL not set");
  }

  const config: ClientOptions = { node };
  if (process.env.ELASTIC_API_KEY) {
    config.auth = {
      apiKey: process.env.ELASTIC_API_KEY,
    };
  } else if (process.env.ELASTIC_USERNAME && process.env.ELASTIC_PASSWORD) {
    config.auth = {
      username: process.env.ELASTIC_USERNAME,
      password: process.env.ELASTIC_PASSWORD,
    };
  }
  return new Client(config);
}

describe("ElasticVectorSearch - Backward Compatibility", () => {
  let client: Client;
  let embeddings: OpenAIEmbeddings;

  beforeEach(() => {
    client = createTestClient();
    embeddings = new OpenAIEmbeddings();
  });

  test.skip("Pure vector search without strategy works unchanged", async () => {
    const indexName = "test_backward_compat_pure";
    const store = new ElasticVectorSearch(embeddings, { client, indexName });
    await store.deleteIfExists();

    await store.addDocuments([
      new Document({ pageContent: "hello world" }),
      new Document({ pageContent: "goodbye world" }),
      new Document({ pageContent: "hello universe" }),
    ]);

    const results = await store.similaritySearch("hello", 2);

    expect(results).toHaveLength(2);
    expect(results[0]).toBeInstanceOf(Document);
    expect(results[0].pageContent).toContain("hello");
  });

  test.skip("similaritySearchVectorWithScore works without strategy", async () => {
    const indexName = "test_backward_compat_scores";
    const store = new ElasticVectorSearch(embeddings, { client, indexName });
    await store.deleteIfExists();

    const createdAt = new Date().getTime();
    await store.addDocuments([
      new Document({ pageContent: "vector search", metadata: { a: createdAt } }),
      new Document({ pageContent: "semantic search", metadata: { a: createdAt } }),
      new Document({ pageContent: "keyword search", metadata: { a: createdAt + 1 } }),
    ]);

    const queryVector = await embeddings.embedQuery("vector");
    const results = await store.similaritySearchVectorWithScore(queryVector, 2, {
      a: createdAt,
    });

    expect(results).toHaveLength(2);
    results.forEach(([doc, score]) => {
      expect(doc).toBeInstanceOf(Document);
      expect(typeof score).toBe("number");
      expect(score).toBeGreaterThan(0);
      expect(doc.metadata.a).toBe(createdAt);
    });
  });

  test.skip("fromTexts static method works without strategy", async () => {
    const indexName = "test_backward_compat_fromtexts";

    // Clear any leftover index through a plain store so the corpus is only
    // embedded and indexed once.
    await new ElasticVectorSearch(embeddings, {
      client,
      indexName,
    }).deleteIfExists();

    const store = await ElasticVectorSearch.fromTexts(
      ["first document", "second document", "third document"],
      [{ id: 1 }, { id: 2 }, { id: 3 }],
      embeddings,
      { client, indexName }
    );

    const results = await store.similaritySearch("first", 1);

    expect(results).toHaveLength(1);
    expect(results[0]).toBeInstanceOf(Document);
    expect(results[0].pageContent).toBe("first document");
    expect(results[0].metadata.id).toBe(1);
  });
});

describe("ElasticVectorSearch - Hybrid Search", () => {
  let client: Client;
  let embeddings: OpenAIEmbeddings;

  beforeEach(() => {
    client = createTestClient();
    embeddings = new OpenAIEmbeddings();
  });

  test.skip("Hybrid search with default strategy", async () => {
    const indexName = "test_hybrid_default";
    const store = new ElasticVectorSearch(embeddings, {
      client,
      indexName,
      strategy: new HybridRetrievalStrategy(),
    });
    await store.deleteIfExists();

    await store.addDocuments([
      new Document({ pageContent: "The quick brown fox jumps over the lazy dog" }),
      new Document({ pageContent: "Machine learning and artificial intelligence" }),
      new Document({ pageContent: "Elasticsearch vector search capabilities" }),
      new Document({ pageContent: "A fox in the forest during autumn" }),
    ]);

    const results = await store.similaritySearch("fox in the woods", 2);

    expect(results).toHaveLength(2);
    expect(results[0]).toBeInstanceOf(Document);
    expect(results.some((doc) => doc.pageContent.includes("fox"))).toBe(true);
  });

  test.skip("Hybrid search with custom RRF parameters", async () => {
    const indexName = "test_hybrid_custom_rrf";
    const store = new ElasticVectorSearch(embeddings, {
      client,
      indexName,
      strategy: new HybridRetrievalStrategy({
        rankWindowSize: 200,
        rankConstant: 80,
        textField: "text",
      }),
    });
    await store.deleteIfExists();

    await store.addDocuments([
      new Document({ pageContent: "search engines and databases" }),
      new Document({ pageContent: "vector embeddings for search" }),
      new Document({ pageContent: "neural networks and deep learning" }),
    ]);

    const results = await store.similaritySearch("search technology", 2);

    expect(results).toHaveLength(2);
    expect(results[0]).toBeInstanceOf(Document);
  });

  test.skip("Hybrid search returns scores correctly", async () => {
    const indexName = "test_hybrid_scores";
    const store = new ElasticVectorSearch(embeddings, {
      client,
      indexName,
      strategy: new HybridRetrievalStrategy(),
    });
    await store.deleteIfExists();

    await store.addDocuments([
      new Document({ pageContent: "Elasticsearch hybrid search" }),
      new Document({ pageContent: "Vector similarity search" }),
      new Document({ pageContent: "Full text search with BM25" }),
    ]);

    // Use the text-query API: a raw query vector carries no text for the
    // lexical half of a hybrid query.
    const results = await store.similaritySearchWithScore("hybrid search", 3);

    expect(results).toHaveLength(3);
    results.forEach(([doc, score]) => {
      expect(doc).toBeInstanceOf(Document);
      expect(typeof score).toBe("number");
      expect(score).toBeGreaterThan(0);
    });
  });

  test.skip("Hybrid search with metadata filters", async () => {
    const indexName = "test_hybrid_filters";
    const store = new ElasticVectorSearch(embeddings, {
      client,
      indexName,
      strategy: new HybridRetrievalStrategy(),
    });
    await store.deleteIfExists();

    const createdAt = new Date().getTime();
    await store.addDocuments([
      new Document({
        pageContent: "Technology article about AI",
        metadata: { category: "tech", date: createdAt },
      }),
      new Document({
        pageContent: "Sports article about football",
        metadata: { category: "sports", date: createdAt },
      }),
      new Document({
        pageContent: "Technology article about ML",
        metadata: { category: "tech", date: createdAt },
      }),
      new Document({
        pageContent: "Sports article about basketball",
        metadata: { category: "sports", date: createdAt + 1 },
      }),
    ]);

    const results = await store.similaritySearch("article about technology", 5, {
      category: "tech",
    });

    // Exactly the two "tech" documents must come back. An upper bound alone
    // would also pass when the filter wrongly excludes everything.
    expect(results).toHaveLength(2);
    results.forEach((doc) => {
      expect(doc.metadata.category).toBe("tech");
    });
  });
});