Skip to content

Commit 231a807

Browse files
dorrogeray authored and chr-hertel committed
[Store] Add a way to store document content when using Chroma DB
1 parent 3a160d5 commit 231a807

File tree

8 files changed: 393 additions (+393) and 46 deletions (-46).

src/store/src/Bridge/ChromaDb/Store.php

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,18 @@ public function add(VectorDocument ...$documents): void
3838
$ids = [];
3939
$vectors = [];
4040
$metadata = [];
41+
$originalDocuments = [];
4142
foreach ($documents as $document) {
4243
$ids[] = (string) $document->id;
4344
$vectors[] = $document->vector->getData();
44-
$metadata[] = $document->metadata->getArrayCopy();
45+
$metadataCopy = $document->metadata->getArrayCopy();
46+
$originalDocuments[] = $document->metadata->getText() ?? '';
47+
unset($metadataCopy[Metadata::KEY_TEXT]);
48+
$metadata[] = $metadataCopy;
4549
}
4650

4751
$collection = $this->client->getOrCreateCollection($this->collectionName);
48-
$collection->add($ids, $vectors, $metadata);
52+
$collection->add($ids, $vectors, $metadata, $originalDocuments);
4953
}
5054

5155
public function query(Vector $vector, array $options = []): array

src/store/src/Document/Loader/TextFileLoader.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ public function __invoke(string $source, array $options = []): iterable
3535
}
3636

3737
yield new TextDocument(Uuid::v4(), trim($content), new Metadata([
38-
'source' => $source,
38+
Metadata::KEY_SOURCE => $source,
3939
]));
4040
}
4141
}

src/store/src/Document/Metadata.php

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,58 @@
1818
*/
1919
final class Metadata extends \ArrayObject
2020
{
21+
public const KEY_PARENT_ID = '_parent_id';
22+
public const KEY_TEXT = '_text';
23+
public const KEY_SOURCE = '_source';
24+
25+
public function hasParentId(): bool
26+
{
27+
return $this->offsetExists(self::KEY_PARENT_ID);
28+
}
29+
30+
public function getParentId(): int|string|null
31+
{
32+
return $this->offsetExists(self::KEY_PARENT_ID)
33+
? $this->offsetGet(self::KEY_PARENT_ID)
34+
: null;
35+
}
36+
37+
public function setParentId(int|string $parentId): void
38+
{
39+
$this->offsetSet(self::KEY_PARENT_ID, $parentId);
40+
}
41+
42+
public function hasText(): bool
43+
{
44+
return $this->offsetExists(self::KEY_TEXT);
45+
}
46+
47+
public function setText(string $text): void
48+
{
49+
$this->offsetSet(self::KEY_TEXT, $text);
50+
}
51+
52+
public function getText(): ?string
53+
{
54+
return $this->offsetExists(self::KEY_TEXT)
55+
? $this->offsetGet(self::KEY_TEXT)
56+
: null;
57+
}
58+
59+
public function hasSource(): bool
60+
{
61+
return $this->offsetExists(self::KEY_SOURCE);
62+
}
63+
64+
public function getSource(): ?string
65+
{
66+
return $this->offsetExists(self::KEY_SOURCE)
67+
? $this->offsetGet(self::KEY_SOURCE)
68+
: null;
69+
}
70+
71+
public function setSource(string $source): void
72+
{
73+
$this->offsetSet(self::KEY_SOURCE, $source);
74+
}
2175
}

src/store/src/Document/Transformer/TextSplitTransformer.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ public function __invoke(iterable $documents, array $options = []): iterable
5757
$chunkText = mb_substr($text, $start, $end - $start);
5858

5959
yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
60-
'parent_id' => $document->id,
61-
'text' => $chunkText,
60+
Metadata::KEY_PARENT_ID => $document->id,
61+
Metadata::KEY_TEXT => $chunkText,
6262
...$document->metadata,
6363
]));
6464

src/store/tests/Bridge/ChromaDb/StoreTest.php

Lines changed: 91 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
use Codewithkyrian\ChromaDB\Client;
1515
use Codewithkyrian\ChromaDB\Resources\CollectionResource;
1616
use PHPUnit\Framework\Attributes\CoversClass;
17+
use PHPUnit\Framework\Attributes\DataProvider;
1718
use PHPUnit\Framework\TestCase;
1819
use Symfony\AI\Platform\Vector\Vector;
1920
use Symfony\AI\Store\Bridge\ChromaDb\Store;
@@ -24,8 +25,21 @@
2425
#[CoversClass(Store::class)]
2526
final class StoreTest extends TestCase
2627
{
27-
public function testAddDocumentsSuccessfully()
28-
{
28+
/**
29+
* @param array<VectorDocument> $documents
30+
* @param array<string> $expectedIds
31+
* @param array<array<float>> $expectedVectors
32+
* @param array<array<string, mixed>> $expectedMetadata
33+
* @param array<string> $expectedOriginalDocuments
34+
*/
35+
#[DataProvider('addDocumentsProvider')]
36+
public function testAddDocumentsSuccessfully(
37+
array $documents,
38+
array $expectedIds,
39+
array $expectedVectors,
40+
array $expectedMetadata,
41+
array $expectedOriginalDocuments,
42+
): void {
2943
$collection = $this->createMock(CollectionResource::class);
3044
$client = $this->createMock(Client::class);
3145

@@ -34,49 +48,88 @@ public function testAddDocumentsSuccessfully()
3448
->with('test-collection')
3549
->willReturn($collection);
3650

37-
$uuid1 = Uuid::v4();
38-
$uuid2 = Uuid::v4();
39-
4051
$collection->expects($this->once())
4152
->method('add')
42-
->with(
43-
[(string) $uuid1, (string) $uuid2],
44-
[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
45-
[[], ['title' => 'Test Document']],
46-
);
53+
->with($expectedIds, $expectedVectors, $expectedMetadata, $expectedOriginalDocuments);
4754

4855
$store = new Store($client, 'test-collection');
4956

50-
$document1 = new VectorDocument($uuid1, new Vector([0.1, 0.2, 0.3]));
51-
$document2 = new VectorDocument($uuid2, new Vector([0.4, 0.5, 0.6]), new Metadata(['title' => 'Test Document']));
52-
53-
$store->add($document1, $document2);
57+
$store->add(...$documents);
5458
}
5559

56-
public function testAddSingleDocument()
60+
/**
61+
* @return \Iterator<string, array{
62+
* documents: array<VectorDocument>,
63+
* expectedIds: array<string>,
64+
* expectedVectors: array<array<float>>,
65+
* expectedMetadata: array<array<string, mixed>>,
66+
* expectedOriginalDocuments: array<string>
67+
* }>
68+
*/
69+
public static function addDocumentsProvider(): \Iterator
5770
{
58-
$collection = $this->createMock(CollectionResource::class);
59-
$client = $this->createMock(Client::class);
60-
61-
$client->expects($this->once())
62-
->method('getOrCreateCollection')
63-
->with('test-collection')
64-
->willReturn($collection);
65-
66-
$uuid = Uuid::v4();
67-
68-
$collection->expects($this->once())
69-
->method('add')
70-
->with(
71-
[(string) $uuid],
72-
[[0.1, 0.2, 0.3]],
73-
[['title' => 'Test Document', 'category' => 'test']],
74-
);
75-
76-
$store = new Store($client, 'test-collection');
77-
78-
$document = new VectorDocument($uuid, new Vector([0.1, 0.2, 0.3]), new Metadata(['title' => 'Test Document', 'category' => 'test']));
79-
80-
$store->add($document);
71+
yield 'multiple documents with and without metadata' => [
72+
'documents' => [
73+
new VectorDocument(
74+
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
75+
new Vector([0.1, 0.2, 0.3]),
76+
),
77+
new VectorDocument(
78+
Uuid::fromString('fedcba98-7654-3210-fedc-ba9876543210'),
79+
new Vector([0.4, 0.5, 0.6]),
80+
new Metadata(['title' => 'Test Document']),
81+
),
82+
],
83+
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef', 'fedcba98-7654-3210-fedc-ba9876543210'],
84+
'expectedVectors' => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
85+
'expectedMetadata' => [[], ['title' => 'Test Document']],
86+
'expectedOriginalDocuments' => ['', ''],
87+
];
88+
89+
yield 'single document with metadata' => [
90+
'documents' => [
91+
new VectorDocument(
92+
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
93+
new Vector([0.1, 0.2, 0.3]),
94+
new Metadata(['title' => 'Test Document', 'category' => 'test']),
95+
),
96+
],
97+
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef'],
98+
'expectedVectors' => [[0.1, 0.2, 0.3]],
99+
'expectedMetadata' => [['title' => 'Test Document', 'category' => 'test']],
100+
'expectedOriginalDocuments' => [''],
101+
];
102+
103+
yield 'documents with text content' => [
104+
'documents' => [
105+
new VectorDocument(
106+
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
107+
new Vector([0.1, 0.2, 0.3]),
108+
new Metadata(['_text' => 'This is the content of document 1', 'title' => 'Document 1'])),
109+
new VectorDocument(
110+
Uuid::fromString('fedcba98-7654-3210-fedc-ba9876543210'),
111+
new Vector([0.4, 0.5, 0.6]),
112+
new Metadata(['_text' => 'This is the content of document 2', 'title' => 'Document 2', 'category' => 'test']),
113+
),
114+
],
115+
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef', 'fedcba98-7654-3210-fedc-ba9876543210'],
116+
'expectedVectors' => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
117+
'expectedMetadata' => [['title' => 'Document 1'], ['title' => 'Document 2', 'category' => 'test']],
118+
'expectedOriginalDocuments' => ['This is the content of document 1', 'This is the content of document 2'],
119+
];
120+
121+
yield 'document with null text' => [
122+
'documents' => [
123+
new VectorDocument(
124+
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
125+
new Vector([0.1, 0.2, 0.3]),
126+
new Metadata(['_text' => null, 'title' => 'Test Document']),
127+
),
128+
],
129+
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef'],
130+
'expectedVectors' => [[0.1, 0.2, 0.3]],
131+
'expectedMetadata' => [['title' => 'Test Document']],
132+
'expectedOriginalDocuments' => [''],
133+
];
81134
}
82135
}

src/store/tests/Document/Loader/TextFileLoaderTest.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public function testSourceIsPresentInMetadata()
5252

5353
$this->assertCount(1, $documents);
5454
$this->assertInstanceOf(TextDocument::class, $document = $documents[0]);
55-
$this->assertSame($source, $document->metadata['source']);
55+
$this->assertSame($source, $document->metadata['_source']);
56+
$this->assertSame($source, $document->metadata->getSource());
5657
}
5758
}

0 commit comments

Comments
 (0)