Skip to content

Commit 3fd202f

Browse files
committed
feature #732 [Store] Unify Vectorizer methods into single vectorize() method (OskarStark)
This PR was squashed before being merged into the main branch.

Discussion
----------

[Store] Unify `Vectorizer` methods into single `vectorize()` method

| Q             | A
| ------------- | ---
| Bug fix?      | no
| New feature?  | yes
| Docs?         | no
| Issues        | --
| License       | MIT

Replace the separate `vectorizeEmbeddableDocuments()` and `vectorize(string)` methods with a single `vectorize()` method that handles:

- string|\Stringable -> Vector
- array<string|\Stringable> -> array<Vector>
- EmbeddableDocumentInterface -> VectorDocument
- array<EmbeddableDocumentInterface> -> array<VectorDocument>

Add PHPStan conditional return types for proper type inference.
Add validation to ensure an array contains only strings/Stringable objects or only EmbeddableDocumentInterface instances.
Update all usages across the codebase, including tests and examples.

cc `@paulinevos`

Commits
-------

3fa8960 [Store] Unify `Vectorizer` methods into single `vectorize()` method
2 parents 4674607 + 3fa8960 commit 3fd202f

File tree

5 files changed

+257
-42
lines changed

5 files changed

+257
-42
lines changed

examples/document/vectorizing-text-documents.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,6 @@
2626
];
2727

2828
$vectorizer = new Vectorizer($platform, 'text-embedding-3-large');
29-
$vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($textDocuments);
29+
$vectorDocuments = $vectorizer->vectorize($textDocuments);
3030

3131
dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments));

src/store/src/Document/Vectorizer.php

Lines changed: 123 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,129 @@ public function __construct(
2727
) {
2828
}
2929

30-
public function vectorizeEmbeddableDocuments(array $documents, array $options = []): array
30+
/**
 * Vectorizes a single value or a homogeneous array of values.
 *
 * Dispatch rules:
 *  - a string or \Stringable yields one Vector;
 *  - any other single value (an EmbeddableDocumentInterface) yields one VectorDocument;
 *  - an empty array yields an empty array without touching the platform;
 *  - a non-empty array is classified by its first element and then validated in full
 *    before being handed to the matching batch helper.
 *
 * Precedence differs between the two shapes: a single value is tested for
 * string/\Stringable first, whereas array elements are tested for
 * EmbeddableDocumentInterface first.
 *
 * @param string|\Stringable|EmbeddableDocumentInterface|array<string|\Stringable>|array<EmbeddableDocumentInterface> $values  The value(s) to vectorize
 * @param array<string, mixed>                                                                                        $options Passed through unchanged to the private vectorize* helpers
 *
 * @return Vector|VectorDocument|array<Vector>|array<VectorDocument>
 *
 * @throws RuntimeException When the first array element is neither a string, a \Stringable nor an
 *                          EmbeddableDocumentInterface, or when a later element does not match the first one's kind
 */
public function vectorize(string|\Stringable|EmbeddableDocumentInterface|array $values, array $options = []): Vector|VectorDocument|array
{
    if (!\is_array($values)) {
        // The parameter type leaves only EmbeddableDocumentInterface once strings and Stringables are ruled out.
        return (\is_string($values) || $values instanceof \Stringable)
            ? $this->vectorizeString($values, $options)
            : $this->vectorizeEmbeddableDocument($values, $options);
    }

    if ([] === $values) {
        return [];
    }

    // The first element decides which kind of array this is; every element is validated against it below.
    $head = $values[array_key_first($values)];

    if ($head instanceof EmbeddableDocumentInterface) {
        $expectedType = EmbeddableDocumentInterface::class;
    } elseif (\is_string($head) || $head instanceof \Stringable) {
        $expectedType = 'string|stringable';
    } else {
        throw new RuntimeException('Array must contain only strings, Stringable objects, or EmbeddableDocumentInterface instances.');
    }

    $this->validateArray($values, $expectedType);

    return EmbeddableDocumentInterface::class === $expectedType
        ? $this->vectorizeEmbeddableDocuments($values, $options)
        : $this->vectorizeStrings($values, $options);
}
59+
60+
/**
 * Ensures every element of $values matches the expected kind.
 *
 * $expectedType is either the sentinel 'string|stringable' (each element must be a
 * string or a \Stringable) or a class/interface name (each element must be an
 * instance of it). An empty array always passes.
 *
 * @param array<mixed> $values
 *
 * @throws RuntimeException On the first element that does not match
 */
private function validateArray(array $values, string $expectedType): void
{
    // The sentinel comparison does not depend on the element, so evaluate it once.
    $expectsText = 'string|stringable' === $expectedType;

    foreach ($values as $item) {
        if ($expectsText) {
            if (\is_string($item) || $item instanceof \Stringable) {
                continue;
            }

            throw new RuntimeException('Array must contain only strings or Stringable objects.');
        }

        if ($item instanceof $expectedType) {
            continue;
        }

        throw new RuntimeException(\sprintf('Array must contain only "%s" instances.', $expectedType));
    }
}
75+
76+
/**
 * Vectorizes one string (or \Stringable) with a single platform invocation.
 *
 * The value is cast to string, logged in full at debug level, sent to the platform
 * with the configured model, and the first vector of the result is returned.
 *
 * @param array<string, mixed> $options Passed unchanged to the platform invocation
 *
 * @throws RuntimeException When the result contains no vector at index 0
 */
private function vectorizeString(string|\Stringable $string, array $options = []): Vector
{
    $text = (string) $string;
    $this->logger->debug('Vectorizing string', ['string' => $text]);

    $vectors = $this->platform->invoke($this->model, $text, $options)->asVectors();

    return $vectors[0] ?? throw new RuntimeException('No vector returned for string vectorization.');
}
93+
94+
/**
 * Vectorizes a single embeddable document.
 *
 * The document's content is vectorized via vectorizeString(); the resulting vector is
 * wrapped in a VectorDocument that carries over the document's id and metadata.
 *
 * @param array<string, mixed> $options Passed unchanged to vectorizeString()
 */
private function vectorizeEmbeddableDocument(EmbeddableDocumentInterface $document, array $options = []): VectorDocument
{
    $this->logger->debug('Vectorizing embeddable document', ['document_id' => $document->getId()]);

    $content = $document->getContent();
    $embedding = $this->vectorizeString($content, $options);

    return new VectorDocument($document->getId(), $embedding, $document->getMetadata());
}
105+
106+
/**
 * Vectorizes several strings, batching when the model supports it.
 *
 * If the configured model reports Capability::INPUT_MULTIPLE, all strings are sent
 * in one platform invocation. Otherwise one invocation is issued per string and the
 * vectors of all results are concatenated in input order.
 *
 * Input keys are discarded: the values are re-indexed as a list before being sent,
 * so sparse or string-keyed arrays reach the platform as a plain list.
 *
 * @param array<string|\Stringable> $strings
 * @param array<string, mixed>      $options Passed unchanged to every platform invocation
 *
 * @return array<Vector>
 */
private function vectorizeStrings(array $strings, array $options = []): array
{
    $stringCount = \count($strings);
    $this->logger->info('Starting vectorization of strings', ['string_count' => $stringCount]);

    // Convert all values to strings and drop the caller's keys (array_map alone would preserve them).
    $stringValues = array_values(array_map(static fn (string|\Stringable $s) => (string) $s, $strings));

    if ($this->platform->getModelCatalog()->getModel($this->model)->supports(Capability::INPUT_MULTIPLE)) {
        $this->logger->debug('Using batch vectorization with model that supports multiple inputs');
        $result = $this->platform->invoke($this->model, $stringValues, $options);

        $vectors = $result->asVectors();
        $this->logger->debug('Batch vectorization completed', ['vector_count' => \count($vectors)]);
    } else {
        $this->logger->debug('Using sequential vectorization for model without multiple input support');

        // Phase 1: issue every invocation before reading any result.
        // NOTE(review): presumably results are resolved lazily, so this ordering matters - confirm before merging the two phases.
        $results = [];
        foreach ($stringValues as $i => $string) {
            $this->logger->debug('Vectorizing string', ['string_index' => $i]);
            $results[] = $this->platform->invoke($this->model, $string, $options);
        }

        // Phase 2: read the vectors and concatenate them with one merge instead of re-copying the accumulator per result.
        $vectors = array_merge(...array_map(static fn ($result) => $result->asVectors(), $results));
        $this->logger->debug('Sequential vectorization completed', ['vector_count' => \count($vectors)]);
    }

    $this->logger->info('Vectorization process completed', ['string_count' => $stringCount, 'vector_count' => \count($vectors)]);

    return $vectors;
}
145+
146+
/**
147+
* @param array<EmbeddableDocumentInterface> $documents
148+
* @param array<string, mixed> $options
149+
*
150+
* @return array<VectorDocument>
151+
*/
152+
private function vectorizeEmbeddableDocuments(array $documents, array $options = []): array
31153
{
32154
$documentCount = \count($documents);
33155
$this->logger->info('Starting vectorization process', ['document_count' => $documentCount]);
@@ -65,18 +187,4 @@ public function vectorizeEmbeddableDocuments(array $documents, array $options =
65187

66188
return $vectorDocuments;
67189
}
68-
69-
public function vectorize(string|\Stringable $string, array $options = []): Vector
70-
{
71-
$this->logger->debug('Vectorizing string', ['string' => (string) $string]);
72-
73-
$result = $this->platform->invoke($this->model, (string) $string, $options);
74-
$vectors = $result->asVectors();
75-
76-
if (!isset($vectors[0])) {
77-
throw new RuntimeException('No vector returned for string vectorization.');
78-
}
79-
80-
return $vectors[0];
81-
}
82190
}

src/store/src/Document/VectorizerInterface.php

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,25 +14,27 @@
1414
use Symfony\AI\Platform\Vector\Vector;
1515

1616
/**
17-
* Interface for converting a collection of Embeddable documents into VectorDocuments
18-
* and for vectorizing individual strings.
17+
* Interface for vectorizing strings and EmbeddableDocuments into Vectors and VectorDocuments.
1918
*
2019
* @author Oskar Stark <[email protected]>
2120
*/
2221
interface VectorizerInterface
2322
{
2423
/**
25-
* @param EmbeddableDocumentInterface[] $documents
26-
* @param array<string, mixed> $options Options to pass to the underlying platform
24+
* Vectorizes strings or EmbeddableDocuments into Vectors or VectorDocuments.
2725
*
28-
* @return VectorDocument[]
29-
*/
30-
public function vectorizeEmbeddableDocuments(array $documents, array $options = []): array;
31-
32-
/**
33-
* Vectorizes a single string or Stringable object into a Vector.
26+
* @param string|\Stringable|EmbeddableDocumentInterface|array<string|\Stringable>|array<EmbeddableDocumentInterface> $values The values to vectorize
27+
* @param array<string, mixed> $options Options to pass to the underlying platform
28+
*
29+
* @return Vector|VectorDocument|array<Vector>|array<VectorDocument>
3430
*
35-
* @param array<string, mixed> $options Options to pass to the underlying platform
31+
* @phpstan-return (
32+
* $values is string|\Stringable ? Vector : (
33+
* $values is EmbeddableDocumentInterface ? VectorDocument : (
34+
* $values is array<string|\Stringable> ? array<Vector> : array<VectorDocument>
35+
* )
36+
* )
37+
* )
3638
*/
37-
public function vectorize(string|\Stringable $string, array $options = []): Vector;
39+
public function vectorize(string|\Stringable|EmbeddableDocumentInterface|array $values, array $options = []): Vector|VectorDocument|array;
3840
}

src/store/src/Indexer.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,13 @@ public function index(array $options = []): void
8787
++$counter;
8888

8989
if ($chunkSize === \count($chunk)) {
90-
$this->store->add(...$this->vectorizer->vectorizeEmbeddableDocuments($chunk));
90+
$this->store->add(...$this->vectorizer->vectorize($chunk));
9191
$chunk = [];
9292
}
9393
}
9494

9595
if ([] !== $chunk) {
96-
$this->store->add(...$this->vectorizer->vectorizeEmbeddableDocuments($chunk));
96+
$this->store->add(...$this->vectorizer->vectorize($chunk));
9797
}
9898

9999
$this->logger->debug('Document processing completed', ['total_documents' => $counter]);

0 commit comments

Comments
 (0)