|
| 1 | +import { Client, ClientOptions } from "@elastic/elasticsearch"; |
| 2 | +import { OpenAIEmbeddings } from "@langchain/openai"; |
| 3 | +import { |
| 4 | + ElasticClientArgs, |
| 5 | + ElasticVectorSearch, |
| 6 | + HybridRetrievalStrategy, |
| 7 | +} from "@langchain/community/vectorstores/elasticsearch"; |
| 8 | +import { Document } from "@langchain/core/documents"; |
| 9 | + |
/**
 * Prints ranked search hits, one numbered entry per hit, showing the score
 * (four decimal places), the page content and the JSON-encoded metadata.
 *
 * @param results - `[document, score]` pairs in the order they should be shown.
 */
function printResults(results: [Document, number][]): void {
  results.forEach(([doc, score], i) => {
    console.log(`${i + 1}. [Score: ${score.toFixed(4)}] ${doc.pageContent}`);
    console.log(` Metadata: ${JSON.stringify(doc.metadata)}\n`);
  });
}

/**
 * Demonstrates hybrid search with Elasticsearch, combining:
 * - Vector (semantic) search using embeddings
 * - BM25 (lexical) full-text search
 * - Reciprocal Rank Fusion (RRF) for result merging
 *
 * Requirements:
 * - Elasticsearch 8.9+ (for RRF support)
 * - Run: docker-compose up -d --build (in elasticsearch directory)
 * - Set ELASTIC_URL, ELASTIC_API_KEY (or ELASTIC_USERNAME/ELASTIC_PASSWORD)
 * - Credentials for OpenAI embeddings (presumably OPENAI_API_KEY, since
 *   `OpenAIEmbeddings` is constructed without arguments - confirm)
 *
 * The index named by ELASTIC_INDEX (default "test_hybrid_search") is deleted
 * before the demo starts and again when it finishes, even if a step in
 * between throws. The Elasticsearch client created here is always closed.
 *
 * @returns A promise that resolves once the demo has run and cleaned up.
 */
export async function run(): Promise<void> {
  // Configure Elasticsearch client
  const config: ClientOptions = {
    node: process.env.ELASTIC_URL ?? "http://127.0.0.1:9200",
  };
  if (process.env.ELASTIC_API_KEY) {
    config.auth = {
      apiKey: process.env.ELASTIC_API_KEY,
    };
  } else if (process.env.ELASTIC_USERNAME && process.env.ELASTIC_PASSWORD) {
    config.auth = {
      username: process.env.ELASTIC_USERNAME,
      password: process.env.ELASTIC_PASSWORD,
    };
  }

  const embeddings = new OpenAIEmbeddings();

  // Keep a handle on the client so its connections can be released at the end.
  const client = new Client(config);

  try {
    // Create vector store with hybrid search strategy
    const clientArgs: ElasticClientArgs = {
      client,
      indexName: process.env.ELASTIC_INDEX ?? "test_hybrid_search",
      strategy: new HybridRetrievalStrategy({
        rankWindowSize: 100, // Number of documents to consider for RRF
        rankConstant: 60, // RRF rank constant (k in 1 / (k + rank))
        textField: "text", // Field to use for BM25 search
      }),
    };

    const vectorStore = new ElasticVectorSearch(embeddings, clientArgs);

    // Clean up any existing data
    await vectorStore.deleteIfExists();

    try {
      // Add sample documents
      const docs = [
        new Document({
          pageContent:
            "Running helps build cardiovascular endurance and strengthens leg muscles.",
          metadata: { category: "fitness", topic: "running" },
        }),
        new Document({
          pageContent:
            "Marathon training requires consistent mileage and proper recovery.",
          metadata: { category: "fitness", topic: "running" },
        }),
        new Document({
          pageContent:
            "Muscle soreness after exercise is caused by microscopic damage to muscle fibers.",
          metadata: { category: "health", topic: "recovery" },
        }),
        new Document({
          pageContent:
            "Stretching and foam rolling can help prevent post-workout muscle pain.",
          metadata: { category: "health", topic: "recovery" },
        }),
        new Document({
          pageContent:
            "Python is a popular programming language for data science and machine learning.",
          metadata: { category: "technology", topic: "programming" },
        }),
      ];

      console.log("Adding documents to Elasticsearch...");
      await vectorStore.addDocuments(docs);
      console.log("Documents added successfully!\n");

      // Example 1: Hybrid search combines semantic + keyword matching
      console.log("=== Example 1: Hybrid Search ===");
      const query1 = "How to avoid muscle soreness while running?";
      console.log(`Query: "${query1}"\n`);
      printResults(await vectorStore.similaritySearchWithScore(query1, 3));

      // Example 2: Semantic search works well for conceptual queries
      console.log("\n=== Example 2: Semantic Query ===");
      const query2 = "tips for preventing pain after workouts";
      console.log(`Query: "${query2}"\n`);
      printResults(await vectorStore.similaritySearchWithScore(query2, 2));

      // Example 3: With metadata filters
      console.log("\n=== Example 3: Hybrid Search with Filters ===");
      const query3 = "fitness advice";
      console.log(`Query: "${query3}"`);
      console.log(`Filter: category = "fitness"\n`);
      printResults(
        await vectorStore.similaritySearchWithScore(query3, 3, {
          category: "fitness",
        })
      );
    } finally {
      // Clean up: runs on failure too, so a failed demo does not leave the
      // sample index behind.
      console.log("\n=== Cleanup ===");
      await vectorStore.deleteIfExists();
      console.log("Index deleted.");
    }
  } finally {
    // Release the client's connections regardless of how the demo ended.
    await client.close();
  }
}
| 126 | + |
/**
 * For Elasticsearch 9.2+:
 * If you need vectors included in search responses, set includeSourceVectors:
 *
 *   strategy: new HybridRetrievalStrategy({
 *     includeSourceVectors: true,
 *     rankWindowSize: 100,
 *     rankConstant: 60,
 *   })
 *
 * Note: This is only needed if you are on ES 9.2+ and want vector data
 * in search responses. ES 9.2+ excludes vectors by default for performance.
 */
| 140 | + |
0 commit comments