Skip to content

Commit 17b3adb

Browse files
committed
Add constructor with configurable chunk size and overlap to TextSplitTransformer
- Add a constructor to TextSplitTransformer with default values (chunkSize=1000, overlap=200)
- Add validation that the overlap must be non-negative and less than the chunk size
- Use the constructor parameters as defaults when no options are provided to the transform method
- Add comprehensive tests for constructor parameter validation
- Fix code style with PHP CS Fixer
1 parent 2676547 commit 17b3adb

File tree

2 files changed

+68
-2
lines changed

2 files changed

+68
-2
lines changed

src/store/src/Document/Transformer/TextSplitTransformer.php

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,22 @@
2929
public const OPTION_CHUNK_SIZE = 'chunk_size';
3030
public const OPTION_OVERLAP = 'overlap';
3131

32+
/**
 * Stores the fallback split settings and rejects inconsistent combinations up front.
 *
 * @param int $chunkSize Fallback chunk size, used by transform() when the "chunk_size" option is not given
 * @param int $overlap   Fallback overlap, used by transform() when the "overlap" option is not given
 *
 * @throws InvalidArgumentException When the overlap is negative or not strictly smaller than the chunk size
 */
public function __construct(
    private int $chunkSize = 1000,
    private int $overlap = 200,
) {
    // Promoted constructor parameters are still available as plain locals here.
    $overlapIsValid = 0 <= $overlap && $overlap < $chunkSize;

    if (!$overlapIsValid) {
        throw new InvalidArgumentException(\sprintf('Overlap must be non-negative and less than chunk size. Got chunk size: %d, overlap: %d', $chunkSize, $overlap));
    }
}
40+
3241
/**
3342
* @param array{chunk_size?: int, overlap?: int} $options
3443
*/
3544
public function transform(iterable $documents, array $options = []): iterable
3645
{
37-
$chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? 1000;
38-
$overlap = $options[self::OPTION_OVERLAP] ?? 200;
46+
$chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? $this->chunkSize;
47+
$overlap = $options[self::OPTION_OVERLAP] ?? $this->overlap;
3948

4049
if ($overlap < 0 || $overlap >= $chunkSize) {
4150
throw new InvalidArgumentException('Overlap must be non-negative and less than chunk size.');

src/store/tests/Document/Transformer/TextSplitTransformerTest.php

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,63 @@ public function testSplitWithNegativeOverlap()
184184
]));
185185
}
186186

187+
/**
 * A transformer built with explicit, valid settings returns a short text as one unchanged chunk.
 */
public function testConstructorWithValidParameters()
{
    $splitter = new TextSplitTransformer(chunkSize: 500, overlap: 100);

    $result = iterator_to_array($splitter->transform([
        new TextDocument(Uuid::v4(), 'short text'),
    ]));

    $this->assertCount(1, $result);
    $this->assertSame('short text', $result[0]->content);
}
197+
198+
/**
 * A transformer built without arguments returns a short text as one unchanged chunk.
 */
public function testConstructorWithDefaultParameters()
{
    $input = new TextDocument(Uuid::v4(), 'short text');

    $result = iterator_to_array((new TextSplitTransformer())->transform([$input]));

    $this->assertCount(1, $result);
    $this->assertSame('short text', $result[0]->content);
}
208+
209+
/**
 * A negative overlap is rejected by the constructor, and the message reports both values.
 */
public function testConstructorWithNegativeOverlap()
{
    $this->expectExceptionMessage('Overlap must be non-negative and less than chunk size. Got chunk size: 1000, overlap: -1');
    $this->expectException(InvalidArgumentException::class);

    new TextSplitTransformer(chunkSize: 1000, overlap: -1);
}
216+
217+
/**
 * An overlap equal to the chunk size is rejected: it must be strictly smaller.
 */
public function testConstructorWithOverlapEqualToChunkSize()
{
    $this->expectExceptionMessage('Overlap must be non-negative and less than chunk size. Got chunk size: 500, overlap: 500');
    $this->expectException(InvalidArgumentException::class);

    new TextSplitTransformer(chunkSize: 500, overlap: 500);
}
224+
225+
/**
 * An overlap larger than the chunk size is rejected by the constructor.
 */
public function testConstructorWithOverlapGreaterThanChunkSize()
{
    $this->expectExceptionMessage('Overlap must be non-negative and less than chunk size. Got chunk size: 100, overlap: 200');
    $this->expectException(InvalidArgumentException::class);

    new TextSplitTransformer(chunkSize: 100, overlap: 200);
}
232+
233+
/**
 * With no options passed to transform(), the constructor values (150 / 25) drive the split:
 * the long fixture text is expected to yield 12 chunks, the first one 150 characters long.
 */
public function testConstructorParametersAreUsedAsDefaults()
{
    $splitter = new TextSplitTransformer(chunkSize: 150, overlap: 25);
    $input = new TextDocument(Uuid::v4(), $this->getLongText());

    $result = iterator_to_array($splitter->transform([$input]));
    $firstChunkLength = mb_strlen($result[0]->content);

    $this->assertCount(12, $result);
    $this->assertSame(150, $firstChunkLength);
}
243+
187244
private function getLongText(): string
188245
{
189246
return trim(file_get_contents(\dirname(__DIR__, 5).'/fixtures/lorem.txt'));

0 commit comments

Comments
 (0)