Skip to content

Commit c25af5e

Browse files
committed
feature #288 [Store] Add a way to store document content when using Chroma DB (dorrogeray)
This PR was squashed before being merged into the main branch.

Discussion
----------

[Store] Add a way to store document content when using Chroma DB

| Q             | A
| ------------- | ---
| Bug fix?      | no
| New feature?  | yes <!-- please update src/**/CHANGELOG.md files -->
| Docs?         | yes <!-- required for new features -->
| Issues        | N/A
| License       | MIT

This is just a draft for now to explain what I am trying to achieve - I would like to be able to store the original document content in the Chroma DB records, but the `VectorDocument` currently has no way to pass that in, other than the metadata. I feel like this requirement will not be specific just to Chroma DB, but various databases used for embeddings will support storing the original document content along with the vectors.

Any suggestions on how to approach this holistically? Should the `VectorDocument` be expanded with an optional field like `?string $content = null`?

<!--
Additionally (see https://symfony.com/releases):
 - Always add tests and ensure they pass.
 - For new features, provide some code snippets to help understand usage.
 - Features and deprecations must be submitted against branch main.
 - Update/add documentation as required (we can help!)
 - Changelog entry should follow https://symfony.com/doc/current/contributing/code/conventions.html#writing-a-changelog-entry
 - Never break backward compatibility (see https://symfony.com/bc).
-->

Commits
-------

231a807 [Store] Add a way to store document content when using Chroma DB
2 parents 49013be + 231a807 commit c25af5e

File tree

8 files changed

+393
-46
lines changed

8 files changed

+393
-46
lines changed

src/store/src/Bridge/ChromaDb/Store.php

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,18 @@ public function add(VectorDocument ...$documents): void
3838
$ids = [];
3939
$vectors = [];
4040
$metadata = [];
41+
$originalDocuments = [];
4142
foreach ($documents as $document) {
4243
$ids[] = (string) $document->id;
4344
$vectors[] = $document->vector->getData();
44-
$metadata[] = $document->metadata->getArrayCopy();
45+
$metadataCopy = $document->metadata->getArrayCopy();
46+
$originalDocuments[] = $document->metadata->getText() ?? '';
47+
unset($metadataCopy[Metadata::KEY_TEXT]);
48+
$metadata[] = $metadataCopy;
4549
}
4650

4751
$collection = $this->client->getOrCreateCollection($this->collectionName);
48-
$collection->add($ids, $vectors, $metadata);
52+
$collection->add($ids, $vectors, $metadata, $originalDocuments);
4953
}
5054

5155
public function query(Vector $vector, array $options = []): array

src/store/src/Document/Loader/TextFileLoader.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ public function __invoke(string $source, array $options = []): iterable
3535
}
3636

3737
yield new TextDocument(Uuid::v4(), trim($content), new Metadata([
38-
'source' => $source,
38+
Metadata::KEY_SOURCE => $source,
3939
]));
4040
}
4141
}

src/store/src/Document/Metadata.php

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,58 @@
1818
*/
1919
final class Metadata extends \ArrayObject
2020
{
21+
public const KEY_PARENT_ID = '_parent_id';
22+
public const KEY_TEXT = '_text';
23+
public const KEY_SOURCE = '_source';
24+
25+
public function hasParentId(): bool
26+
{
27+
return $this->offsetExists(self::KEY_PARENT_ID);
28+
}
29+
30+
public function getParentId(): int|string|null
31+
{
32+
return $this->offsetExists(self::KEY_PARENT_ID)
33+
? $this->offsetGet(self::KEY_PARENT_ID)
34+
: null;
35+
}
36+
37+
public function setParentId(int|string $parentId): void
38+
{
39+
$this->offsetSet(self::KEY_PARENT_ID, $parentId);
40+
}
41+
42+
public function hasText(): bool
43+
{
44+
return $this->offsetExists(self::KEY_TEXT);
45+
}
46+
47+
public function setText(string $text): void
48+
{
49+
$this->offsetSet(self::KEY_TEXT, $text);
50+
}
51+
52+
public function getText(): ?string
53+
{
54+
return $this->offsetExists(self::KEY_TEXT)
55+
? $this->offsetGet(self::KEY_TEXT)
56+
: null;
57+
}
58+
59+
public function hasSource(): bool
60+
{
61+
return $this->offsetExists(self::KEY_SOURCE);
62+
}
63+
64+
public function getSource(): ?string
65+
{
66+
return $this->offsetExists(self::KEY_SOURCE)
67+
? $this->offsetGet(self::KEY_SOURCE)
68+
: null;
69+
}
70+
71+
public function setSource(string $source): void
72+
{
73+
$this->offsetSet(self::KEY_SOURCE, $source);
74+
}
2175
}

src/store/src/Document/Transformer/TextSplitTransformer.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ public function __invoke(iterable $documents, array $options = []): iterable
5757
$chunkText = mb_substr($text, $start, $end - $start);
5858

5959
yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
60-
'parent_id' => $document->id,
61-
'text' => $chunkText,
60+
Metadata::KEY_PARENT_ID => $document->id,
61+
Metadata::KEY_TEXT => $chunkText,
6262
...$document->metadata,
6363
]));
6464

src/store/tests/Bridge/ChromaDb/StoreTest.php

Lines changed: 91 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
use Codewithkyrian\ChromaDB\Client;
1515
use Codewithkyrian\ChromaDB\Resources\CollectionResource;
1616
use PHPUnit\Framework\Attributes\CoversClass;
17+
use PHPUnit\Framework\Attributes\DataProvider;
1718
use PHPUnit\Framework\TestCase;
1819
use Symfony\AI\Platform\Vector\Vector;
1920
use Symfony\AI\Store\Bridge\ChromaDb\Store;
@@ -24,8 +25,21 @@
2425
#[CoversClass(Store::class)]
2526
final class StoreTest extends TestCase
2627
{
27-
public function testAddDocumentsSuccessfully()
28-
{
28+
/**
29+
* @param array<VectorDocument> $documents
30+
* @param array<string> $expectedIds
31+
* @param array<array<float>> $expectedVectors
32+
* @param array<array<string, mixed>> $expectedMetadata
33+
* @param array<string> $expectedOriginalDocuments
34+
*/
35+
#[DataProvider('addDocumentsProvider')]
36+
public function testAddDocumentsSuccessfully(
37+
array $documents,
38+
array $expectedIds,
39+
array $expectedVectors,
40+
array $expectedMetadata,
41+
array $expectedOriginalDocuments,
42+
): void {
2943
$collection = $this->createMock(CollectionResource::class);
3044
$client = $this->createMock(Client::class);
3145

@@ -34,49 +48,88 @@ public function testAddDocumentsSuccessfully()
3448
->with('test-collection')
3549
->willReturn($collection);
3650

37-
$uuid1 = Uuid::v4();
38-
$uuid2 = Uuid::v4();
39-
4051
$collection->expects($this->once())
4152
->method('add')
42-
->with(
43-
[(string) $uuid1, (string) $uuid2],
44-
[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
45-
[[], ['title' => 'Test Document']],
46-
);
53+
->with($expectedIds, $expectedVectors, $expectedMetadata, $expectedOriginalDocuments);
4754

4855
$store = new Store($client, 'test-collection');
4956

50-
$document1 = new VectorDocument($uuid1, new Vector([0.1, 0.2, 0.3]));
51-
$document2 = new VectorDocument($uuid2, new Vector([0.4, 0.5, 0.6]), new Metadata(['title' => 'Test Document']));
52-
53-
$store->add($document1, $document2);
57+
$store->add(...$documents);
5458
}
5559

56-
public function testAddSingleDocument()
60+
/**
61+
* @return \Iterator<string, array{
62+
* documents: array<VectorDocument>,
63+
* expectedIds: array<string>,
64+
* expectedVectors: array<array<float>>,
65+
* expectedMetadata: array<array<string, mixed>>,
66+
* expectedOriginalDocuments: array<string>
67+
* }>
68+
*/
69+
public static function addDocumentsProvider(): \Iterator
5770
{
58-
$collection = $this->createMock(CollectionResource::class);
59-
$client = $this->createMock(Client::class);
60-
61-
$client->expects($this->once())
62-
->method('getOrCreateCollection')
63-
->with('test-collection')
64-
->willReturn($collection);
65-
66-
$uuid = Uuid::v4();
67-
68-
$collection->expects($this->once())
69-
->method('add')
70-
->with(
71-
[(string) $uuid],
72-
[[0.1, 0.2, 0.3]],
73-
[['title' => 'Test Document', 'category' => 'test']],
74-
);
75-
76-
$store = new Store($client, 'test-collection');
77-
78-
$document = new VectorDocument($uuid, new Vector([0.1, 0.2, 0.3]), new Metadata(['title' => 'Test Document', 'category' => 'test']));
79-
80-
$store->add($document);
71+
yield 'multiple documents with and without metadata' => [
72+
'documents' => [
73+
new VectorDocument(
74+
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
75+
new Vector([0.1, 0.2, 0.3]),
76+
),
77+
new VectorDocument(
78+
Uuid::fromString('fedcba98-7654-3210-fedc-ba9876543210'),
79+
new Vector([0.4, 0.5, 0.6]),
80+
new Metadata(['title' => 'Test Document']),
81+
),
82+
],
83+
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef', 'fedcba98-7654-3210-fedc-ba9876543210'],
84+
'expectedVectors' => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
85+
'expectedMetadata' => [[], ['title' => 'Test Document']],
86+
'expectedOriginalDocuments' => ['', ''],
87+
];
88+
89+
yield 'single document with metadata' => [
90+
'documents' => [
91+
new VectorDocument(
92+
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
93+
new Vector([0.1, 0.2, 0.3]),
94+
new Metadata(['title' => 'Test Document', 'category' => 'test']),
95+
),
96+
],
97+
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef'],
98+
'expectedVectors' => [[0.1, 0.2, 0.3]],
99+
'expectedMetadata' => [['title' => 'Test Document', 'category' => 'test']],
100+
'expectedOriginalDocuments' => [''],
101+
];
102+
103+
yield 'documents with text content' => [
104+
'documents' => [
105+
new VectorDocument(
106+
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
107+
new Vector([0.1, 0.2, 0.3]),
108+
new Metadata(['_text' => 'This is the content of document 1', 'title' => 'Document 1'])),
109+
new VectorDocument(
110+
Uuid::fromString('fedcba98-7654-3210-fedc-ba9876543210'),
111+
new Vector([0.4, 0.5, 0.6]),
112+
new Metadata(['_text' => 'This is the content of document 2', 'title' => 'Document 2', 'category' => 'test']),
113+
),
114+
],
115+
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef', 'fedcba98-7654-3210-fedc-ba9876543210'],
116+
'expectedVectors' => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
117+
'expectedMetadata' => [['title' => 'Document 1'], ['title' => 'Document 2', 'category' => 'test']],
118+
'expectedOriginalDocuments' => ['This is the content of document 1', 'This is the content of document 2'],
119+
];
120+
121+
yield 'document with null text' => [
122+
'documents' => [
123+
new VectorDocument(
124+
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
125+
new Vector([0.1, 0.2, 0.3]),
126+
new Metadata(['_text' => null, 'title' => 'Test Document']),
127+
),
128+
],
129+
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef'],
130+
'expectedVectors' => [[0.1, 0.2, 0.3]],
131+
'expectedMetadata' => [['title' => 'Test Document']],
132+
'expectedOriginalDocuments' => [''],
133+
];
81134
}
82135
}

src/store/tests/Document/Loader/TextFileLoaderTest.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public function testSourceIsPresentInMetadata()
5252

5353
$this->assertCount(1, $documents);
5454
$this->assertInstanceOf(TextDocument::class, $document = $documents[0]);
55-
$this->assertSame($source, $document->metadata['source']);
55+
$this->assertSame($source, $document->metadata['_source']);
56+
$this->assertSame($source, $document->metadata->getSource());
5657
}
5758
}

0 commit comments

Comments
 (0)