|
| 1 | +# ES9 Indexer |
| 2 | + |
| 3 | +English | [中文](README_zh.md) |
| 4 | + |
| 5 | +An Elasticsearch 9.x indexer implementation for [Eino](https://github.com/cloudwego/eino) that implements the `Indexer` interface. This enables seamless integration with Eino's vector storage and retrieval system for enhanced semantic search capabilities. |
| 6 | + |
| 7 | +## Features |
| 8 | + |
| 9 | +- Implements `github.com/cloudwego/eino/components/indexer.Indexer` |
| 10 | +- Easy integration with Eino's indexer system |
| 11 | +- Configurable Elasticsearch parameters |
| 12 | +- Support for vector similarity search |
| 13 | +- Bulk indexing operations |
| 14 | +- Custom field mapping support |
| 15 | +- Flexible document vectorization |
| 16 | + |
| 17 | +## Installation |
| 18 | + |
| 19 | +```bash |
| 20 | +go get github.com/cloudwego/eino-ext/components/indexer/es9@latest |
| 21 | +``` |
| 22 | + |
| 23 | +## Quick Start |
| 24 | + |
| 25 | +Here's a quick example of how to use the indexer (see `components/indexer/es9/examples/indexer/add_documents.go` for more details): |
| 26 | + |
| 27 | +```go |
| 28 | +import ( |
| 29 | + "context" |
| 30 | + "fmt" |
| 31 | + "log" |
| 32 | + "os" |
| 33 | + |
| 34 | + "github.com/cloudwego/eino/components/embedding" |
| 35 | + "github.com/cloudwego/eino/schema" |
| 36 | + "github.com/elastic/go-elasticsearch/v9" |
| 37 | + |
| 38 | + "github.com/cloudwego/eino-ext/components/embedding/ark" |
| 39 | + "github.com/cloudwego/eino-ext/components/indexer/es9" |
| 40 | +) |
| 41 | + |
| 42 | +const ( |
| 43 | + indexName = "eino_example" |
| 44 | + fieldContent = "content" |
| 45 | + fieldContentVector = "content_vector" |
| 46 | + fieldExtraLocation = "location" |
| 47 | + docExtraLocation = "location" |
| 48 | +) |
| 49 | + |
| 50 | +func main() { |
| 51 | + ctx := context.Background() |
| 52 | + // es supports multiple ways to connect |
| 53 | + username := os.Getenv("ES_USERNAME") |
| 54 | + password := os.Getenv("ES_PASSWORD") |
| 55 | + |
| 56 | + // 1. Create ES client |
| 57 | + httpCACertPath := os.Getenv("ES_HTTP_CA_CERT_PATH") |
| 58 | + var cert []byte |
| 59 | + var err error |
| 60 | + if httpCACertPath != "" { |
| 61 | + cert, err = os.ReadFile(httpCACertPath) |
| 62 | + if err != nil { |
| 63 | + log.Fatalf("read file failed, err=%v", err) |
| 64 | + } |
| 65 | + } |
| 66 | + |
| 67 | + client, _ := elasticsearch.NewClient(elasticsearch.Config{ |
| 68 | + Addresses: []string{"https://localhost:9200"}, |
| 69 | + Username: username, |
| 70 | + Password: password, |
| 71 | + CACert: cert, |
| 72 | + }) |
| 73 | + |
| 74 | + // 2. Create embedding component using ARK |
| 75 | + // Replace "ARK_API_KEY", "ARK_REGION", "ARK_MODEL" with your actual config |
| 76 | + emb, _ := ark.NewEmbedder(ctx, &ark.EmbeddingConfig{ |
| 77 | + APIKey: os.Getenv("ARK_API_KEY"), |
| 78 | + Region: os.Getenv("ARK_REGION"), |
| 79 | + Model: os.Getenv("ARK_MODEL"), |
| 80 | + }) |
| 81 | + |
| 82 | + // 3. Prepare documents |
| 83 | + // Documents usually contain at least an ID and Content. |
| 84 | + // You can also add extra metadata for filtering or other purposes. |
| 85 | + docs := []*schema.Document{ |
| 86 | + { |
| 87 | + ID: "1", |
| 88 | + Content: "Eiffel Tower: Located in Paris, France.", |
| 89 | + MetaData: map[string]any{ |
| 90 | + docExtraLocation: "France", |
| 91 | + }, |
| 92 | + }, |
| 93 | + { |
| 94 | + ID: "2", |
| 95 | + Content: "The Great Wall: Located in China.", |
| 96 | + MetaData: map[string]any{ |
| 97 | + docExtraLocation: "China", |
| 98 | + }, |
| 99 | + }, |
| 100 | + } |
| 101 | + |
| 102 | + // 4. Create ES indexer component |
| 103 | + indexer, _ := es9.NewIndexer(ctx, &es9.IndexerConfig{ |
| 104 | + Client: client, |
| 105 | + Index: indexName, |
| 106 | + BatchSize: 10, |
| 107 | + // DocumentToFields specifies how to map document fields to ES fields |
| 108 | + DocumentToFields: func(ctx context.Context, doc *schema.Document) (field2Value map[string]es9.FieldValue, err error) { |
| 109 | + return map[string]es9.FieldValue{ |
| 110 | + fieldContent: { |
| 111 | + Value: doc.Content, |
| 112 | + EmbedKey: fieldContentVector, // vectorize content and save to "content_vector" |
| 113 | + }, |
| 114 | + fieldExtraLocation: { |
| 115 | + // Extra metadata field |
| 116 | + Value: doc.MetaData[docExtraLocation], |
| 117 | + }, |
| 118 | + }, nil |
| 119 | + }, |
| 120 | + // Provide the embedding component to use for vectorization |
| 121 | + Embedding: emb, |
| 122 | + }) |
| 123 | + |
| 124 | + // 5. Index documents |
| 125 | + ids, err := indexer.Store(ctx, docs) |
| 126 | + if err != nil { |
| 127 | + fmt.Printf("index error: %v\n", err) |
| 128 | + return |
| 129 | + } |
| 130 | + fmt.Println("indexed ids:", ids) |
| 131 | +} |
| 132 | +``` |
| 133 | + |
| 134 | +## Configuration |
| 135 | + |
| 136 | +The indexer can be configured using the `IndexerConfig` struct: |
| 137 | + |
| 138 | +```go |
| 139 | +type IndexerConfig struct { |
| 140 | + Client *elasticsearch.Client // Required: Elasticsearch client instance |
| 141 | + Index string // Required: Index name to store documents |
| 142 | + BatchSize int // Optional: Maximum number of texts embedded per batch (default: 5) |
| 143 | + |
| 144 | + // Required: Function to map Document fields to Elasticsearch fields |
| 145 | + DocumentToFields func(ctx context.Context, doc *schema.Document) (map[string]FieldValue, error) |
| 146 | + |
| 147 | + // Optional: Required only if vectorization is needed |
| 148 | + Embedding embedding.Embedder |
| 149 | +} |
| 150 | + |
| 151 | +// FieldValue defines how a field should be stored and vectorized |
| 152 | +type FieldValue struct { |
| 153 | + Value any // Original value to store |
| 154 | + EmbedKey string // If set, Value will be vectorized and the vector saved under this field name |
| 155 | + Stringify func(val any) (string, error) // Optional: custom string conversion |
| 156 | +} |
| 157 | +``` |
| 158 | + |
| 159 | +## Full Examples |
| 160 | + |
| 161 | +- [Indexer Example](./examples/indexer) |
| 162 | +- [Indexer with Sparse Vector Example](./examples/indexer_with_sparse_vector) |
| 163 | + |
| 164 | +## For More Details |
| 165 | + |
| 166 | +- [Eino Documentation](https://www.cloudwego.io/docs/eino/) |
| 167 | +- [Elasticsearch Go Client Documentation](https://github.com/elastic/go-elasticsearch) |
0 commit comments