Skip to content

Commit 4bb633d

Browse files
committed
minor #438 [Store] Add comprehensive unit tests for Vectorizer (OskarStark)
This PR was squashed before being merged into the main branch.

Discussion
----------

[Store] Add comprehensive unit tests for `Vectorizer`

| Q             | A
| ------------- | ---
| Bug fix?      | no
| New feature?  | no
| Docs?         | no
| Issues        | --
| License       | MIT

Commits
-------

6cacff7 [Store] Add comprehensive unit tests for `Vectorizer`
2 parents bb6b953 + 6cacff7 commit 4bb633d

File tree

1 file changed

+322
-0
lines changed

1 file changed

+322
-0
lines changed
Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
 * (c) Fabien Potencier <fabien@symfony.com>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\AI\Store\Tests\Document;
13+
14+
use PHPUnit\Framework\Attributes\CoversClass;
15+
use PHPUnit\Framework\Attributes\DataProvider;
16+
use PHPUnit\Framework\Attributes\TestDox;
17+
use PHPUnit\Framework\Attributes\UsesClass;
18+
use PHPUnit\Framework\TestCase;
19+
use Symfony\AI\Platform\Bridge\OpenAi\Embeddings;
20+
use Symfony\AI\Platform\Capability;
21+
use Symfony\AI\Platform\Model;
22+
use Symfony\AI\Platform\ModelClientInterface;
23+
use Symfony\AI\Platform\Platform;
24+
use Symfony\AI\Platform\Result\RawHttpResult;
25+
use Symfony\AI\Platform\Result\RawResultInterface;
26+
use Symfony\AI\Platform\Result\ResultInterface;
27+
use Symfony\AI\Platform\Result\ResultPromise;
28+
use Symfony\AI\Platform\Result\VectorResult;
29+
use Symfony\AI\Platform\ResultConverterInterface;
30+
use Symfony\AI\Platform\Vector\Vector;
31+
use Symfony\AI\Store\Document\Metadata;
32+
use Symfony\AI\Store\Document\TextDocument;
33+
use Symfony\AI\Store\Document\VectorDocument;
34+
use Symfony\AI\Store\Document\Vectorizer;
35+
use Symfony\AI\Store\Tests\Double\PlatformTestHandler;
36+
use Symfony\Component\HttpClient\Response\MockResponse;
37+
use Symfony\Component\Uid\Uuid;
38+
39+
#[CoversClass(Vectorizer::class)]
40+
#[UsesClass(TextDocument::class)]
41+
#[UsesClass(VectorDocument::class)]
42+
#[UsesClass(Metadata::class)]
43+
#[UsesClass(Vector::class)]
44+
#[UsesClass(VectorResult::class)]
45+
#[UsesClass(Platform::class)]
46+
#[UsesClass(ResultPromise::class)]
47+
#[UsesClass(Embeddings::class)]
48+
#[TestDox('Tests for the Vectorizer class')]
49+
final class VectorizerTest extends TestCase
50+
{
51+
/**
 * Vectorizes three text documents against a platform stubbed with a single
 * VectorResult holding three vectors, then checks that each returned
 * VectorDocument reuses the id and metadata of the input at the same
 * position and carries the vector at that position.
 *
 * NOTE(review): the test name implies the Embeddings model supports batch
 * input; that capability is not visible in this file.
 */
public function testVectorizeDocumentsWithBatchSupport()
{
    $expectedVectors = [
        new Vector([0.1, 0.2, 0.3]),
        new Vector([0.4, 0.5, 0.6]),
        new Vector([0.7, 0.8, 0.9]),
    ];

    $textDocuments = [
        new TextDocument(Uuid::v4(), 'First document content', new Metadata(['source' => 'test1'])),
        new TextDocument(Uuid::v4(), 'Second document content', new Metadata(['source' => 'test2'])),
        new TextDocument(Uuid::v4(), 'Third document content', new Metadata(['source' => 'test3'])),
    ];

    $vectorizer = new Vectorizer(
        PlatformTestHandler::createPlatform(new VectorResult(...$expectedVectors)),
        new Embeddings(),
    );
    $result = $vectorizer->vectorizeDocuments($textDocuments);

    $this->assertCount(3, $result);

    // Inputs and outputs are compared position by position.
    foreach ($result as $position => $vectorDocument) {
        $source = $textDocuments[$position];

        $this->assertInstanceOf(VectorDocument::class, $vectorDocument);
        $this->assertSame($source->id, $vectorDocument->id);
        $this->assertEquals($expectedVectors[$position], $vectorDocument->vector);
        $this->assertSame($source->metadata, $vectorDocument->metadata);
    }
}
81+
82+
/**
 * Vectorizes a one-element document list and checks that the single result
 * is a VectorDocument with the input's id and metadata and the stubbed vector.
 */
public function testVectorizeDocumentsWithSingleDocument()
{
    $textDocument = new TextDocument(Uuid::v4(), 'Single document content', new Metadata(['test' => 'value']));
    $expectedVector = new Vector([0.1, 0.2, 0.3]);

    $vectorizer = new Vectorizer(
        PlatformTestHandler::createPlatform(new VectorResult($expectedVector)),
        new Embeddings(),
    );
    $result = $vectorizer->vectorizeDocuments([$textDocument]);

    $this->assertCount(1, $result);

    $this->assertInstanceOf(VectorDocument::class, $result[0]);
    $this->assertSame($textDocument->id, $result[0]->id);
    $this->assertEquals($expectedVector, $result[0]->vector);
    $this->assertSame($textDocument->metadata, $result[0]->metadata);
}
99+
100+
/**
 * Checks that vectorizing an empty document list returns exactly an empty array.
 */
public function testVectorizeEmptyDocumentsArray()
{
    $vectorizer = new Vectorizer(
        PlatformTestHandler::createPlatform(new VectorResult()),
        new Embeddings(),
    );

    $result = $vectorizer->vectorizeDocuments([]);

    $this->assertSame([], $result);
}
110+
111+
/**
 * Checks that the Metadata instance attached to each input document is the
 * very same instance on the matching VectorDocument, and that its array
 * copy still equals the values it was built from.
 */
public function testVectorizeDocumentsPreservesMetadata()
{
    $firstFields = ['source' => 'file1.txt', 'author' => 'Alice', 'tags' => ['important']];
    $secondFields = ['source' => 'file2.txt', 'author' => 'Bob', 'version' => 2];

    $firstMetadata = new Metadata($firstFields);
    $secondMetadata = new Metadata($secondFields);

    $textDocuments = [
        new TextDocument(Uuid::v4(), 'Content 1', $firstMetadata),
        new TextDocument(Uuid::v4(), 'Content 2', $secondMetadata),
    ];

    $stubbedResult = new VectorResult(
        new Vector([0.1, 0.2]),
        new Vector([0.3, 0.4]),
    );

    $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform($stubbedResult), new Embeddings());
    $result = $vectorizer->vectorizeDocuments($textDocuments);

    $this->assertCount(2, $result);

    // Identity first, then content.
    $this->assertSame($firstMetadata, $result[0]->metadata);
    $this->assertSame($secondMetadata, $result[1]->metadata);
    $this->assertSame($firstFields, $result[0]->metadata->getArrayCopy());
    $this->assertSame($secondFields, $result[1]->metadata->getArrayCopy());
}
138+
139+
/**
 * Checks that each VectorDocument carries the same Uuid instance as the
 * TextDocument at the same position in the input list.
 */
public function testVectorizeDocumentsPreservesDocumentIds()
{
    $ids = [Uuid::v4(), Uuid::v4(), Uuid::v4()];

    $textDocuments = [
        new TextDocument($ids[0], 'Document 1'),
        new TextDocument($ids[1], 'Document 2'),
        new TextDocument($ids[2], 'Document 3'),
    ];

    $stubbedResult = new VectorResult(
        new Vector([0.1]),
        new Vector([0.2]),
        new Vector([0.3]),
    );

    $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform($stubbedResult), new Embeddings());
    $result = $vectorizer->vectorizeDocuments($textDocuments);

    $this->assertCount(3, $result);

    foreach ($ids as $position => $expectedId) {
        $this->assertSame($expectedId, $result[$position]->id);
    }
}
168+
169+
/**
 * Builds $count documents with one stub vector each, vectorizes them, and
 * checks that the result has $count entries whose id, vector and metadata
 * match the input at the same position.
 *
 * @param int $count number of documents to generate, supplied by provideDocumentCounts()
 */
#[DataProvider('provideDocumentCounts')]
public function testVectorizeVariousDocumentCounts(int $count)
{
    $textDocuments = [];
    $expectedVectors = [];

    for ($index = 0; $index < $count; ++$index) {
        $metadata = new Metadata(['index' => $index]);
        $content = \sprintf('Document %d content', $index);

        $textDocuments[] = new TextDocument(Uuid::v4(), $content, $metadata);
        $expectedVectors[] = new Vector([$index * 0.1, $index * 0.2, $index * 0.3]);
    }

    // Spreading an empty list passes no arguments, so the zero-document
    // case needs no special handling.
    $stubbedResult = new VectorResult(...$expectedVectors);

    $vectorizer = new Vectorizer(PlatformTestHandler::createPlatform($stubbedResult), new Embeddings());
    $result = $vectorizer->vectorizeDocuments($textDocuments);

    $this->assertCount($count, $result);

    foreach ($result as $position => $vectorDocument) {
        $source = $textDocuments[$position];

        $this->assertInstanceOf(VectorDocument::class, $vectorDocument);
        $this->assertSame($source->id, $vectorDocument->id);
        $this->assertEquals($expectedVectors[$position], $vectorDocument->vector);
        $this->assertSame($source->metadata, $vectorDocument->metadata);
        $this->assertSame(['index' => $position], $vectorDocument->metadata->getArrayCopy());
    }
}
202+
203+
/**
 * Named data sets for testVectorizeVariousDocumentCounts(): each entry is a
 * one-element argument list holding the number of documents to generate.
 *
 * @return \Generator<string, array{int}>
 */
public static function provideDocumentCounts(): \Generator
{
    // "yield from" keeps the string keys, so the data set names are unchanged.
    yield from [
        'no documents' => [0],
        'single document' => [1],
        'two documents' => [2],
        'three documents' => [3],
    ];
}
213+
214+
/**
 * Checks that a 1536-dimension vector returned by the stubbed platform is
 * attached to the resulting VectorDocument.
 */
public function testVectorizeDocumentsWithLargeVectors()
{
    // 1536 dimensions (typical for OpenAI embeddings): 0.0, 0.001, ..., 1.535
    $components = array_map(
        static fn (int $step): float => $step * 0.001,
        range(0, 1535),
    );
    $expectedVector = new Vector($components);

    $textDocument = new TextDocument(Uuid::v4(), 'Test content');

    $vectorizer = new Vectorizer(
        PlatformTestHandler::createPlatform(new VectorResult($expectedVector)),
        new Embeddings(),
    );
    $result = $vectorizer->vectorizeDocuments([$textDocument]);

    $this->assertCount(1, $result);
    $this->assertEquals($expectedVector, $result[0]->vector);
}
234+
235+
/**
 * Vectorizes documents whose content contains quotes and symbols, control
 * characters (newlines, tabs) and non-ASCII text, then checks that ids and
 * vectors still line up position by position.
 */
public function testVectorizeDocumentsWithSpecialCharacters()
{
    $expectedVectors = [
        new Vector([0.1, 0.2]),
        new Vector([0.3, 0.4]),
        new Vector([0.5, 0.6]),
    ];

    $textDocuments = [
        new TextDocument(Uuid::v4(), 'Document with "quotes" and special chars: @#$%'),
        new TextDocument(Uuid::v4(), "Document with\nnewlines\nand\ttabs"),
        new TextDocument(Uuid::v4(), 'Document with émojis 🚀 and ünïcödé'),
    ];

    $vectorizer = new Vectorizer(
        PlatformTestHandler::createPlatform(new VectorResult(...$expectedVectors)),
        new Embeddings(),
    );
    $result = $vectorizer->vectorizeDocuments($textDocuments);

    $this->assertCount(3, $result);

    foreach ($result as $position => $vectorDocument) {
        $this->assertSame($textDocuments[$position]->id, $vectorDocument->id);
        $this->assertEquals($expectedVectors[$position], $vectorDocument->vector);
    }
}
262+
263+
/**
 * Runs the Vectorizer with a model mock that answers false to
 * supports(Capability::INPUT_MULTIPLE) and expects that query exactly once.
 *
 * The platform is built around an anonymous handler that acts as both model
 * client and result converter; every convert() call hands back the next
 * vector from a fixed list. The test then checks that two vector documents
 * come back carrying those vectors in order.
 */
public function testVectorizeDocumentsWithoutBatchSupportUsesNonBatchMode()
{
    $documents = [
        new TextDocument(Uuid::v4(), 'Document 1'),
        new TextDocument(Uuid::v4(), 'Document 2'),
    ];

    // One vector per expected conversion: without batch support the platform
    // should be invoked once per document.
    $vectors = [
        new Vector([0.1, 0.2]),
        new Vector([0.3, 0.4]),
    ];

    // Model double that reports no support for multiple inputs.
    $model = $this->createMock(Model::class);
    $model
        ->expects($this->once())
        ->method('supports')
        ->with(Capability::INPUT_MULTIPLE)
        ->willReturn(false);

    // Hand-written handler serving one queued vector per conversion.
    $handler = new class($vectors) implements ModelClientInterface, ResultConverterInterface {
        private int $cursor = 0;

        /**
         * @param Vector[] $queue
         */
        public function __construct(
            private readonly array $queue,
        ) {
        }

        public function supports(Model $model): bool
        {
            return true;
        }

        public function request(Model $model, array|string|object $payload, array $options = []): RawHttpResult
        {
            return new RawHttpResult(new MockResponse());
        }

        public function convert(RawResultInterface $result, array $options = []): ResultInterface
        {
            // Advance the cursor so the following call serves the next vector.
            $position = $this->cursor++;

            return new VectorResult($this->queue[$position]);
        }
    };

    $vectorizer = new Vectorizer(new Platform([$handler], [$handler]), $model);
    $vectorDocuments = $vectorizer->vectorizeDocuments($documents);

    $this->assertCount(2, $vectorDocuments);
    $this->assertEquals($vectors[0], $vectorDocuments[0]->vector);
    $this->assertEquals($vectors[1], $vectorDocuments[1]->vector);
}
322+
}

0 commit comments

Comments
 (0)