Skip to content

Commit f8de707

Browse files
authored
Merge pull request #489 from jamosaur/opensearch-vectorstore
Add OpenSearch Vector Store
2 parents 3d3866a + 8ed7623 commit f8de707

File tree

4 files changed

+349
-0
lines changed

4 files changed

+349
-0
lines changed

.github/workflows/tests.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,16 @@ jobs:
3131
ports:
3232
- 3306:3306
3333
options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3
34+
opensearch:
35+
image: opensearchproject/opensearch:latest
36+
ports:
37+
- 9201:9200 # Avoid port conflict with Elasticsearch
38+
- 9600:9600
39+
env:
40+
discovery.type: single-node
41+
plugins.security.disabled: "true"
42+
DISABLE_INSTALL_DEMO_CONFIG: true
43+
options: --health-cmd="curl -s http://localhost:9200/_cluster/health | grep '\"status\":\"green\"'" --health-interval=10s --health-timeout=5s --health-retries=5
3444
strategy:
3545
matrix:
3646
os: [ubuntu-latest]

composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"friendsofphp/php-cs-fixer": "^3.75",
2424
"html2text/html2text": "^4.3",
2525
"illuminate/database": "^10.0|^11.0|^12.0",
26+
"opensearch-project/opensearch-php": "^2.5",
2627
"phpstan/phpstan": "^2.1",
2728
"phpunit/phpunit": "^9.0",
2829
"rector/rector": "^2.0",
Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace NeuronAI\RAG\VectorStore;
6+
7+
use NeuronAI\RAG\Document;
8+
use OpenSearch\Client;
9+
use Exception;
10+
11+
use function array_key_exists;
12+
use function array_keys;
13+
use function array_map;
14+
use function count;
15+
use function in_array;
16+
use function max;
17+
18+
/**
 * Vector store backed by an OpenSearch index using the k-NN plugin.
 *
 * Documents are stored with their embedding in a `knn_vector` field called
 * `embedding`, next to `content`, `sourceType`, `sourceName` and any metadata
 * entries, which are indexed as top-level fields.
 */
class OpenSearchVectorStore implements VectorStoreInterface
{
    /**
     * Field names owned by the store; they are never exposed as document metadata.
     */
    private const RESERVED_FIELDS = ['content', 'sourceType', 'sourceName', 'score', 'embedding', 'id'];

    /**
     * True once the `embedding` mapping of the index has been created or verified
     * by this instance, so the check is not repeated on every write.
     */
    protected bool $vectorDimSet = false;

    /**
     * Filter clause applied to the k-NN query (empty means no filtering).
     */
    protected array $filters = [];

    /**
     * @param Client $client Configured OpenSearch client.
     * @param string $index  Name of the index used to store the documents.
     * @param int    $topK   Maximum number of documents returned by a similarity search.
     */
    public function __construct(
        protected Client $client,
        protected string $index,
        protected int $topK = 4,
    ) {
    }

    /**
     * Make sure the index exists and can hold vectors of the document's dimension.
     *
     * When the index is missing it is created with k-NN enabled, using the given
     * document as a template: its embedding size defines the vector dimension and
     * its metadata keys are mapped as `keyword` fields.
     */
    protected function checkIndexStatus(Document $document): void
    {
        $dimension = count($document->getEmbedding());

        if ($this->client->indices()->exists(['index' => $this->index])) {
            $this->mapVectorDimension($dimension);

            return;
        }

        $properties = [
            'content' => ['type' => 'text'],
            'sourceType' => ['type' => 'keyword'],
            'sourceName' => ['type' => 'keyword'],
            'embedding' => $this->embeddingMapping($dimension),
        ];

        // Map metadata. A metadata key must not replace the mapping of a field
        // the store relies on (e.g. turning `embedding` into a keyword).
        foreach (array_keys($document->metadata) as $name) {
            $properties[$name] ??= ['type' => 'keyword'];
        }

        $this->client->indices()->create([
            'index' => $this->index,
            'body' => [
                'settings' => [
                    'index' => [
                        'knn' => true,
                        'number_of_replicas' => 0,
                    ],
                ],
                'mappings' => [
                    'properties' => $properties,
                ],
            ],
        ]);

        // The mapping was just created with this dimension: nothing left to verify.
        $this->vectorDimSet = true;
    }

    /**
     * Index a single document and refresh the index so it is searchable right away.
     *
     * @throws Exception If the document has no embedding.
     */
    public function addDocument(Document $document): VectorStoreInterface
    {
        if ($document->getEmbedding() === []) {
            throw new Exception('Document embedding must be set before adding a document');
        }

        $this->checkIndexStatus($document);

        $this->client->index([
            'index' => $this->index,
            'body' => $this->documentBody($document),
        ]);

        $this->client->indices()->refresh(['index' => $this->index]);

        return $this;
    }

    /**
     * Index several documents with a single bulk request.
     *
     * Every document is validated before anything is sent, so a missing embedding
     * cannot leave the index partially updated.
     *
     * @param Document[] $documents
     *
     * @throws Exception If a document has no embedding or OpenSearch rejects an item.
     */
    public function addDocuments(array $documents): VectorStoreInterface
    {
        if ($documents === []) {
            return $this;
        }

        $first = null;
        foreach ($documents as $document) {
            if ($document->getEmbedding() === []) {
                throw new Exception('Document embedding must be set before adding a document');
            }

            $first ??= $document;
        }

        $this->checkIndexStatus($first);

        // Bulk payload: one action line followed by one source line per document.
        $body = [];
        foreach ($documents as $document) {
            $body[] = [
                'index' => [
                    '_index' => $this->index,
                ],
            ];
            $body[] = $this->documentBody($document);
        }

        $response = $this->client->bulk(['body' => $body]);

        // A bulk request answers successfully even when individual items fail;
        // failures are only reported through the `errors` flag of the response.
        if (($response['errors'] ?? false) === true) {
            $reason = 'unknown error';
            foreach ($response['items'] ?? [] as $item) {
                if (isset($item['index']['error'])) {
                    $reason = $item['index']['error']['reason'] ?? $reason;
                    break;
                }
            }

            throw new Exception("Bulk indexing into \"{$this->index}\" failed: {$reason}");
        }

        $this->client->indices()->refresh(['index' => $this->index]);

        return $this;
    }

    /**
     * Delete every document that matches both the source type and the source name.
     *
     * Exact `term` filters are used instead of a Lucene query string so that values
     * containing spaces, colons or query operators cannot alter the query. This
     * relies on both fields being mapped as `keyword`, which is how this class
     * creates the index.
     */
    public function deleteBySource(string $sourceType, string $sourceName): VectorStoreInterface
    {
        $this->client->deleteByQuery([
            'index' => $this->index,
            'body' => [
                'query' => [
                    'bool' => [
                        'filter' => [
                            ['term' => ['sourceType' => $sourceType]],
                            ['term' => ['sourceName' => $sourceName]],
                        ],
                    ],
                ],
            ],
        ]);

        $this->client->indices()->refresh(['index' => $this->index]);

        return $this;
    }

    /**
     * Return the documents closest to the given embedding, best match first.
     *
     * @param array $embedding Query vector.
     *
     * @return Document[] At most `topK` documents.
     */
    public function similaritySearch(array $embedding): iterable
    {
        $knn = [
            'vector' => $embedding,
            // Look at more neighbours than requested to improve recall.
            'k' => max(50, $this->topK * 4),
        ];

        // Hybrid search: the filter belongs inside the vector field clause.
        if ($this->filters !== []) {
            $knn['filter'] = $this->filters;
        }

        $response = $this->client->search([
            'index' => $this->index,
            'body' => [
                // Without an explicit size OpenSearch returns its default of 10 hits,
                // whatever the value of topK.
                'size' => $this->topK,
                'query' => [
                    'knn' => [
                        'embedding' => $knn,
                    ],
                ],
                'sort' => [
                    '_score' => [
                        'order' => 'desc',
                    ],
                ],
            ],
        ]);

        return array_map(function (array $item): Document {
            $source = $item['_source'];

            // The embedding is deliberately not copied back, to avoid carrying large data.
            $document = new Document($source['content']);
            $document->sourceType = $source['sourceType'];
            $document->sourceName = $source['sourceName'];
            $document->score = $item['_score'];

            foreach ($source as $name => $value) {
                if (!in_array($name, self::RESERVED_FIELDS, true)) {
                    $document->addMetadata($name, $value);
                }
            }

            return $document;
        }, $response['hits']['hits']);
    }

    /**
     * Map vector embeddings dimension on the fly.
     *
     * Reads the current mapping of the `embedding` field and writes the k-NN
     * mapping only when the field is missing or has another dimension. The result
     * is remembered so later writes skip the lookup.
     */
    private function mapVectorDimension(int $dimension): void
    {
        if ($this->vectorDimSet) {
            return;
        }

        $response = $this->client->indices()->getFieldMapping([
            'index' => $this->index,
            'fields' => 'embedding',
        ]);

        $mappings = $response[$this->index]['mappings'];

        $alreadyMapped = array_key_exists('embedding', $mappings)
            && $mappings['embedding']['mapping']['embedding']['dimension'] === $dimension;

        if (!$alreadyMapped) {
            $this->client->indices()->putMapping([
                'index' => $this->index,
                'body' => [
                    'properties' => [
                        'embedding' => $this->embeddingMapping($dimension),
                    ],
                ],
            ]);
        }

        $this->vectorDimSet = true;
    }

    /**
     * Set the filter clause applied to the next similarity searches.
     *
     * @param array $filters OpenSearch query DSL clause, e.g. `['term' => ['lang' => 'en']]`.
     */
    public function withFilters(array $filters): self
    {
        $this->filters = $filters;

        return $this;
    }

    /**
     * Build the mapping of the `embedding` field for the given vector dimension.
     */
    private function embeddingMapping(int $dimension): array
    {
        return [
            'type' => 'knn_vector',
            'dimension' => $dimension,
            'index' => true,
            'method' => [
                'name' => 'hnsw',
                'engine' => 'lucene',
                'space_type' => 'cosinesimil',
                'parameters' => [
                    'encoder' => [
                        'name' => 'sq',
                    ],
                ],
            ],
        ];
    }

    /**
     * Build the source body stored in OpenSearch for a document.
     *
     * Metadata is spread first so that a metadata key can never overwrite the
     * embedding, content or source fields.
     */
    private function documentBody(Document $document): array
    {
        return [
            ...$document->metadata,
            'embedding' => $document->getEmbedding(),
            'content' => $document->getContent(),
            'sourceType' => $document->getSourceType(),
            'sourceName' => $document->getSourceName(),
        ];
    }
}
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace NeuronAI\Tests\VectorStore;
6+
7+
use NeuronAI\RAG\VectorStore\OpenSearchVectorStore;
8+
use OpenSearch\Client;
9+
use OpenSearch\GuzzleClientFactory;
10+
use NeuronAI\RAG\Document;
11+
use NeuronAI\RAG\VectorStore\VectorStoreInterface;
12+
use NeuronAI\Tests\Traits\CheckOpenPort;
13+
use PHPUnit\Framework\TestCase;
14+
15+
use function file_get_contents;
16+
use function json_decode;
17+
18+
/**
 * Integration tests for OpenSearchVectorStore.
 *
 * They talk to an OpenSearch node on port 9201 and are skipped when that
 * port is not open.
 */
class OpenSearchTest extends TestCase
{
    use CheckOpenPort;

    protected Client $client;

    protected array $embedding;

    /**
     * Connect to the local node and load the stub embedding used by the tests.
     */
    protected function setUp(): void
    {
        if (!$this->isPortOpen('127.0.0.1', 9201)) {
            $this->markTestSkipped('Port 9201 is not open. Skipping test.');
        }

        $factory = new GuzzleClientFactory();
        $this->client = $factory->create(['base_uri' => 'http://localhost:9201']);

        // Embedding of the text "Hello World!".
        $stubFile = __DIR__ . '/../Stubs/hello-world.embeddings';
        $this->embedding = json_decode(file_get_contents($stubFile), true);
    }

    public function test_elasticsearch_instance(): void
    {
        $this->assertInstanceOf(VectorStoreInterface::class, $this->makeStore());
    }

    public function test_add_document_and_search(): void
    {
        $expected = new Document('Hello World!');
        $expected->embedding = $this->embedding;
        $expected->addMetadata('customProperty', 'customValue');

        $vectorStore = $this->makeStore();
        $vectorStore->addDocument($expected);

        $found = $vectorStore->similaritySearch($this->embedding);
        $bestMatch = $found[0];

        $this->assertEquals($expected->getContent(), $bestMatch->getContent());
        $this->assertEquals($expected->metadata['customProperty'], $bestMatch->metadata['customProperty']);
    }

    public function test_elasticsearch_delete_documents(): void
    {
        $vectorStore = $this->makeStore();
        $vectorStore->deleteBySource('manual', 'manual');

        $remaining = $vectorStore->similaritySearch($this->embedding);

        $this->assertCount(0, $remaining);
    }

    /**
     * Create a store bound to the "test" index of the local node.
     */
    private function makeStore(): OpenSearchVectorStore
    {
        return new OpenSearchVectorStore($this->client, 'test');
    }
}

0 commit comments

Comments
 (0)