Skip to content

Commit 54f883e

Browse files
committed
Improved vector text chunking
1 parent e611b32 commit 54f883e

File tree

4 files changed

+140
-9
lines changed

4 files changed

+140
-9
lines changed

app/Search/SearchController.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,12 @@ public function searchSiblings(Request $request, SiblingFetcher $siblingFetcher)
141141
return view('entities.list-basic', ['entities' => $entities, 'style' => 'compact']);
142142
}
143143

144+
/**
145+
* Perform a vector/LLM-based query search.
146+
*/
144147
public function searchQuery(Request $request, VectorSearchRunner $runner)
145148
{
149+
// TODO - Validate if query system is active
146150
$query = $request->get('query', '');
147151

148152
if ($query) {

app/Search/Vectors/EntityVectorGenerator.php

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
namespace BookStack\Search\Vectors;
44

5+
use BookStack\Activity\Models\Tag;
56
use BookStack\Entities\Models\Entity;
67
use BookStack\Search\Vectors\Services\VectorQueryService;
78
use Illuminate\Support\Facades\DB;
@@ -47,8 +48,10 @@ protected function storeEmbeddings(array $embeddings, array $textChunks, Entity
4748
];
4849
}
4950

50-
// TODO - Chunk inserts
51-
SearchVector::query()->insert($toInsert);
51+
$chunks = array_chunk($toInsert, 500);
52+
foreach ($chunks as $chunk) {
53+
SearchVector::query()->insert($chunk);
54+
}
5255
}
5356

5457
/**
@@ -69,16 +72,16 @@ protected function chunksToEmbeddings(array $chunks, VectorQueryService $vectorQ
6972
*/
7073
protected function chunkText(string $text): array
{
    // Break the text down on progressively finer delimiters (lines, sentences,
    // words, then raw characters) so that each chunk fits the 500 size limit.
    $chunks = (new TextChunker(500, ["\n", '.', ' ', '']))->chunk($text);

    // Strip surrounding whitespace and discard chunks with no content, as the
    // previous line-based implementation did, so blank text is never embedded.
    $trimmed = array_map(static fn (string $chunk): string => trim($chunk), $chunks);
    $nonEmpty = array_filter($trimmed, static fn (string $chunk): bool => $chunk !== '');

    // Re-index so callers receive a sequential list.
    return array_values($nonEmpty);
}
7777

7878
/**
 * Build the plain text for an entity: its name, followed by one
 * "name: value" line per tag, followed by the content of the property
 * named by the entity's textField.
 */
protected function entityToPlainText(Entity $entity): string
{
    // Double quotes are required here: '\n' in single quotes is a literal
    // backslash followed by "n", not a newline.
    $tagText = $entity->tags()->get()
        ->map(fn (Tag $tag): string => $tag->name . ': ' . $tag->value)
        ->join("\n");

    return $entity->name . "\n{$tagText}\n" . $entity->{$entity->textField};
}
8487
}

app/Search/Vectors/TextChunker.php

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
<?php
2+
3+
namespace BookStack\Search\Vectors;
4+
5+
use InvalidArgumentException;
6+
7+
/**
8+
* Splits a given string into smaller chunks based on specified delimiters
9+
* and a predefined maximum chunk size. This will work through the given delimiters
10+
* to break down text further and further to fit into the chunk size.
11+
*
12+
* The last delimiter is always an empty string to ensure text can always be broken down.
13+
*/
14+
class TextChunker
15+
{
16+
public function __construct(
17+
protected int $chunkSize,
18+
protected array $delimiterOrder,
19+
) {
20+
if (count($this->delimiterOrder) === 0 || $this->delimiterOrder[count($this->delimiterOrder) - 1] !== '') {
21+
$this->delimiterOrder[] = '';
22+
}
23+
24+
if ($this->chunkSize < 1) {
25+
throw new InvalidArgumentException('Chunk size must be greater than 0');
26+
}
27+
}
28+
29+
public function chunk(string $text): array
30+
{
31+
$delimiter = $this->delimiterOrder[0];
32+
$delimiterLength = strlen($delimiter);
33+
$lines = ($delimiter === '') ? str_split($text, $this->chunkSize) : explode($delimiter, $text);
34+
35+
$cChunk = ''; // Current chunk
36+
$cLength = 0; // Current chunk length
37+
$chunks = []; // Chunks to return
38+
$lDelim = ''; // Last delimiter
39+
40+
foreach ($lines as $index => $line) {
41+
$lineLength = strlen($line);
42+
if ($cLength + $lineLength + $delimiterLength <= $this->chunkSize) {
43+
$cChunk .= $line . $delimiter;
44+
$cLength += $lineLength + $delimiterLength;
45+
$lDelim = $delimiter;
46+
} else if ($lineLength <= $this->chunkSize) {
47+
$chunks[] = trim($cChunk, $delimiter);
48+
$cChunk = $line . $delimiter;
49+
$cLength = $lineLength + $delimiterLength;
50+
$lDelim = $delimiter;
51+
} else {
52+
$subChunks = new static($this->chunkSize, array_slice($this->delimiterOrder, 1));
53+
$subDelimiter = $this->delimiterOrder[1] ?? '';
54+
$subDelimiterLength = strlen($subDelimiter);
55+
foreach ($subChunks->chunk($line) as $subChunk) {
56+
$chunkLength = strlen($subChunk);
57+
if ($cLength + $chunkLength + $subDelimiterLength <= $this->chunkSize) {
58+
$cChunk .= $subChunk . $subDelimiter;
59+
$cLength += $chunkLength + $subDelimiterLength;
60+
$lDelim = $subDelimiter;
61+
} else {
62+
$chunks[] = trim($cChunk, $lDelim);
63+
$cChunk = $subChunk . $subDelimiter;
64+
$cLength = $chunkLength + $subDelimiterLength;
65+
$lDelim = $subDelimiter;
66+
}
67+
}
68+
}
69+
}
70+
71+
if ($cChunk !== '') {
72+
$chunks[] = trim($cChunk, $lDelim);
73+
}
74+
75+
return $chunks;
76+
}
77+
}

tests/Search/TextChunkerTest.php

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<?php
2+
3+
namespace Search;
4+
5+
use BookStack\Search\Vectors\TextChunker;
6+
use Tests\TestCase;
7+
8+
class TextChunkerTest extends TestCase
{
    /**
     * With no delimiters given, text is cut into fixed-size pieces.
     */
    public function test_it_chunks_text()
    {
        $result = (new TextChunker(3, []))->chunk('123456789');

        $this->assertEquals(['123', '456', '789'], $result);
    }

    /**
     * A chunk size below one is rejected on construction.
     */
    public function test_chunk_size_must_be_greater_than_zero()
    {
        $this->expectException(\InvalidArgumentException::class);

        new TextChunker(-5, []);
    }

    /**
     * Each delimiter is applied in turn until the pieces fit.
     */
    public function test_it_works_through_given_delimiters()
    {
        $result = (new TextChunker(5, ['-', '.', '']))->chunk('12-3456.789abcdefg');

        $this->assertEquals(['12', '3456', '789ab', 'cdefg'], $result);
    }

    /**
     * Adjacent pieces are merged while they fit in the chunk size.
     */
    public function test_it_attempts_to_pack_chunks()
    {
        $result = (new TextChunker(8, [' ', '']))->chunk('123 456 789 abc def');

        $this->assertEquals(['123 456', '789 abc', 'def'], $result);
    }

    /**
     * Pieces produced by a finer delimiter are packed onto the current chunk.
     */
    public function test_it_attempts_to_pack_using_subchunks()
    {
        $result = (new TextChunker(8, [' ', '-', '']))->chunk('123 456-789abc');

        $this->assertEquals(['123 456', '789abc'], $result);
    }
}

0 commit comments

Comments
 (0)