Skip to content

Commit 0d880f6

Browse files
committed
feature #707 [Platform][Store] Introduce EmbeddableDocumentInterface (paulinevos)
This PR was squashed before being merged into the main branch.

Discussion
----------

[Platform][Store] Introduce `EmbeddableDocumentInterface`

| Q             | A
| ------------- | ---
| Bug fix?      | no
| New feature?  | no <!-- please update src/**/CHANGELOG.md files -->
| Docs?         | no <!-- required for new features -->
| Issues        | Fix #18
| License       | MIT

In an effort to support userland objects for the store, this PR introduces an `Embeddable` interface instead of relying on the `TextDocument` class.

Please note this interface would require PHP ^8.4, as it uses interface properties. I chose this approach so as to introduce minimal change to the existing `TextDocument` class and usages. Instead of `content`, an `Embeddable` document has a `data` property. Any type of embeddable document class can implement a hook for the `data` property if needed.

This PR also introduces union types for the document IDs so as not to make assumptions about the ID type.

[Note: the description above reflects the original proposal. In the diff shown below, `EmbeddableDocumentInterface` declares the methods `getId(): mixed`, `getContent(): string` and `getMetadata(): Metadata` rather than interface properties, and `TextDocument` exposes its now-private fields through those getters.]

Commits
-------

3e15fd5 [Platform][Store] Introduce `EmbeddableDocumentInterface`
2 parents aa1ad5d + 3e15fd5 commit 0d880f6

26 files changed

+242
-200
lines changed

examples/document/vectorizing-text-documents.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,6 @@
2626
];
2727

2828
$vectorizer = new Vectorizer($platform, 'text-embedding-3-large');
29-
$vectorDocuments = $vectorizer->vectorizeTextDocuments($textDocuments);
29+
$vectorDocuments = $vectorizer->vectorizeEmbeddableDocuments($textDocuments);
3030

3131
dump(array_map(fn (VectorDocument $document) => $document->vector->getDimensions(), $vectorDocuments));
src/store/src/Document/EmbeddableDocumentInterface.php

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <[email protected]>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\AI\Store\Document;
13+
14+
interface EmbeddableDocumentInterface
15+
{
16+
public function getId(): mixed;
17+
18+
public function getContent(): string;
19+
20+
public function getMetadata(): Metadata;
21+
}

src/store/src/Document/Filter/TextContainsFilter.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public function filter(iterable $documents, array $options = []): iterable
4949
$caseSensitive = $options[self::OPTION_CASE_SENSITIVE] ?? $this->caseSensitive;
5050

5151
foreach ($documents as $document) {
52-
$content = $document->content;
52+
$content = $document->getContent();
5353

5454
if ($caseSensitive) {
5555
$contains = str_contains($content, $needle);

src/store/src/Document/FilterInterface.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@
2020
interface FilterInterface
2121
{
2222
/**
23-
* @param iterable<TextDocument> $documents
24-
* @param array<string, mixed> $options
23+
* @param iterable<EmbeddableDocumentInterface> $documents
24+
* @param array<string, mixed> $options
2525
*
26-
* @return iterable<TextDocument>
26+
* @return iterable<EmbeddableDocumentInterface>
2727
*/
2828
public function filter(iterable $documents, array $options = []): iterable;
2929
}

src/store/src/Document/Loader/InMemoryLoader.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,19 @@
1111

1212
namespace Symfony\AI\Store\Document\Loader;
1313

14+
use Symfony\AI\Store\Document\EmbeddableDocumentInterface;
1415
use Symfony\AI\Store\Document\LoaderInterface;
15-
use Symfony\AI\Store\Document\TextDocument;
1616

1717
/**
18-
* Loader that returns pre-loaded TextDocuments from memory.
18+
* Loader that returns preloaded documents from memory.
1919
* Useful for testing or when documents are already available as objects.
2020
*
2121
* @author Oskar Stark <[email protected]>
2222
*/
2323
final readonly class InMemoryLoader implements LoaderInterface
2424
{
2525
/**
26-
* @param TextDocument[] $documents
26+
* @param EmbeddableDocumentInterface[] $documents
2727
*/
2828
public function __construct(
2929
private array $documents = [],

src/store/src/Document/LoaderInterface.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ interface LoaderInterface
2020
* @param string|null $source Identifier for the loader to load the documents from, e.g. file path, folder, or URL. Can be null for InMemoryLoader.
2121
* @param array<string, mixed> $options loader specific set of options to control the loading process
2222
*
23-
* @return iterable<TextDocument> iterable of TextDocuments loaded from the source
23+
* @return iterable<EmbeddableDocumentInterface> iterable of embeddable documents loaded from the source
2424
*/
2525
public function load(?string $source, array $options = []): iterable;
2626
}

src/store/src/Document/TextDocument.php

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717
/**
1818
* @author Christopher Hertel <[email protected]>
1919
*/
20-
final readonly class TextDocument
20+
final readonly class TextDocument implements EmbeddableDocumentInterface
2121
{
2222
public function __construct(
23-
public Uuid $id,
24-
public string $content,
25-
public Metadata $metadata = new Metadata(),
23+
private Uuid $id,
24+
private string $content,
25+
private Metadata $metadata = new Metadata(),
2626
) {
2727
if ('' === trim($this->content)) {
2828
throw new InvalidArgumentException('The content shall not be an empty string.');
@@ -33,4 +33,19 @@ public function withContent(string $content): self
3333
{
3434
return new self($this->id, $content, $this->metadata);
3535
}
36+
37+
public function getId(): Uuid
38+
{
39+
return $this->id;
40+
}
41+
42+
public function getContent(): string
43+
{
44+
return $this->content;
45+
}
46+
47+
public function getMetadata(): Metadata
48+
{
49+
return $this->metadata;
50+
}
3651
}

src/store/src/Document/Transformer/TextReplaceTransformer.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
namespace Symfony\AI\Store\Document\Transformer;
1313

14+
use Symfony\AI\Store\Document\TextDocument;
1415
use Symfony\AI\Store\Document\TransformerInterface;
1516
use Symfony\AI\Store\Exception\InvalidArgumentException;
1617

@@ -32,6 +33,7 @@ public function __construct(
3233
}
3334

3435
/**
36+
* @param iterable<TextDocument> $documents
3537
* @param array{search?: string, replace?: string} $options
3638
*/
3739
public function transform(iterable $documents, array $options = []): iterable
@@ -42,7 +44,7 @@ public function transform(iterable $documents, array $options = []): iterable
4244
self::validate($search, $replace);
4345

4446
foreach ($documents as $document) {
45-
yield $document->withContent(str_replace($search, $replace, $document->content));
47+
yield $document->withContent(str_replace($search, $replace, $document->getContent()));
4648
}
4749
}
4850

src/store/src/Document/Transformer/TextSplitTransformer.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,13 @@ public function transform(iterable $documents, array $options = []): iterable
5151
}
5252

5353
foreach ($documents as $document) {
54-
if (mb_strlen($document->content) <= $chunkSize) {
54+
if (mb_strlen($document->getContent()) <= $chunkSize) {
5555
yield $document;
5656

5757
continue;
5858
}
5959

60-
$text = $document->content;
60+
$text = $document->getContent();
6161
$length = mb_strlen($text);
6262
$start = 0;
6363

@@ -66,9 +66,9 @@ public function transform(iterable $documents, array $options = []): iterable
6666
$chunkText = mb_substr($text, $start, $end - $start);
6767

6868
yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
69-
Metadata::KEY_PARENT_ID => $document->id,
69+
Metadata::KEY_PARENT_ID => $document->getId(),
7070
Metadata::KEY_TEXT => $chunkText,
71-
...$document->metadata,
71+
...$document->getMetadata(),
7272
]));
7373

7474
$start += ($chunkSize - $overlap);

src/store/src/Document/Transformer/TextTrimTransformer.php

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
namespace Symfony\AI\Store\Document\Transformer;
1313

14+
use Symfony\AI\Store\Document\TextDocument;
1415
use Symfony\AI\Store\Document\TransformerInterface;
1516

1617
/**
@@ -20,10 +21,13 @@
2021
*/
2122
final readonly class TextTrimTransformer implements TransformerInterface
2223
{
24+
/**
25+
* @param iterable<TextDocument> $documents
26+
*/
2327
public function transform(iterable $documents, array $options = []): iterable
2428
{
2529
foreach ($documents as $document) {
26-
yield $document->withContent(trim($document->content));
30+
yield $document->withContent(trim($document->getContent()));
2731
}
2832
}
2933
}

0 commit comments

Comments
 (0)