Skip to content

Commit 8452099

Browse files
committed
Vectors: Built content vector indexing system
1 parent 0ec0913 commit 8452099

9 files changed

+269
-1
lines changed

app/Config/services.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,16 @@
2222
// Callback URL for social authentication methods
2323
'callback_url' => env('APP_URL', false),
2424

25+
// LLM Service
26+
// Options: openai
27+
'llm' => env('LLM_SERVICE', ''),
28+
29+
// OpenAI API-compatible service details
30+
'openai' => [
31+
'endpoint' => env('OPENAI_ENDPOINT', 'https://api.openai.com'),
32+
'key' => env('OPENAI_KEY', ''),
33+
],
34+
2535
'github' => [
2636
'client_id' => env('GITHUB_APP_ID', false),
2737
'client_secret' => env('GITHUB_APP_SECRET', false),

app/Search/SearchIndex.php

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
use BookStack\Entities\EntityProvider;
77
use BookStack\Entities\Models\Entity;
88
use BookStack\Entities\Models\Page;
9+
use BookStack\Search\Vectors\StoreEntityVectorsJob;
10+
use BookStack\Search\Vectors\VectorQueryServiceProvider;
911
use BookStack\Util\HtmlDocument;
1012
use DOMNode;
1113
use Illuminate\Database\Eloquent\Builder;
@@ -25,7 +27,7 @@ class SearchIndex
2527
public static string $softDelimiters = ".-";
2628

2729
public function __construct(
28-
protected EntityProvider $entityProvider
30+
protected EntityProvider $entityProvider,
2931
) {
3032
}
3133

@@ -37,6 +39,10 @@ public function indexEntity(Entity $entity): void
3739
$this->deleteEntityTerms($entity);
3840
$terms = $this->entityToTermDataArray($entity);
3941
$this->insertTerms($terms);
42+
43+
if (VectorQueryServiceProvider::isEnabled()) {
44+
dispatch(new StoreEntityVectorsJob($entity));
45+
}
4046
}
4147

4248
/**
@@ -47,9 +53,15 @@ public function indexEntity(Entity $entity): void
4753
public function indexEntities(array $entities): void
4854
{
4955
$terms = [];
56+
$vectorQueryEnabled = VectorQueryServiceProvider::isEnabled();
57+
5058
foreach ($entities as $entity) {
5159
$entityTerms = $this->entityToTermDataArray($entity);
5260
array_push($terms, ...$entityTerms);
61+
62+
if ($vectorQueryEnabled) {
63+
dispatch(new StoreEntityVectorsJob($entity));
64+
}
5365
}
5466

5567
$this->insertTerms($terms);
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
<?php
2+
3+
namespace BookStack\Search\Vectors;
4+
5+
use BookStack\Entities\Models\Entity;
6+
use BookStack\Search\Vectors\Services\VectorQueryService;
7+
use Illuminate\Support\Facades\DB;
8+
9+
/**
 * Builds embedding vectors for an entity's text content and stores them as
 * SearchVector rows, replacing any rows previously stored for that entity.
 */
class EntityVectorGenerator
{
    public function __construct(
        protected VectorQueryServiceProvider $vectorQueryServiceProvider
    ) {
    }

    /**
     * Generate embeddings for each text chunk of the given entity, then
     * replace the entity's stored vectors with the newly generated set.
     *
     * Embeddings are generated before any existing rows are removed, so a
     * failure while talking to the vector service leaves stored data untouched.
     */
    public function generateAndStore(Entity $entity): void
    {
        $vectorService = $this->vectorQueryServiceProvider->get();

        $text = $this->entityToPlainText($entity);
        $chunks = $this->chunkText($text);
        $embeddings = $this->chunksToEmbeddings($chunks, $vectorService);

        // Run the delete + insert as one unit so that a failed insert rolls back
        // the delete instead of leaving the entity without any vectors.
        // NOTE(review): assumes SearchVector uses the default DB connection.
        DB::transaction(function () use ($entity, $embeddings, $chunks) {
            $this->deleteExistingEmbeddingsForEntity($entity);
            $this->storeEmbeddings($embeddings, $chunks, $entity);
        });
    }

    /**
     * Remove all stored vector rows matching the entity's morph class and id.
     */
    protected function deleteExistingEmbeddingsForEntity(Entity $entity): void
    {
        SearchVector::query()
            ->where('entity_type', '=', $entity->getMorphClass())
            ->where('entity_id', '=', $entity->id)
            ->delete();
    }

    /**
     * Insert one row per embedding, pairing each embedding with the text
     * chunk found at the same array key.
     *
     * @param array<int, array<int, float|int|string>> $embeddings
     * @param array<int, string>                       $textChunks
     */
    protected function storeEmbeddings(array $embeddings, array $textChunks, Entity $entity): void
    {
        $toInsert = [];

        foreach ($embeddings as $index => $embedding) {
            // The components originate from a remote service response and end up inside
            // a raw SQL expression, so force each one to a float before building the literal.
            $components = array_map(static fn ($value): float => (float) $value, $embedding);

            $toInsert[] = [
                'entity_id' => $entity->id,
                'entity_type' => $entity->getMorphClass(),
                // Single-quoted SQL string literal so the expression stays valid
                // even when double quotes are treated as identifier quotes.
                'embedding' => DB::raw("STRING_TO_VECTOR('[" . implode(',', $components) . "]')"),
                'text' => $textChunks[$index],
            ];
        }

        // TODO - Chunk inserts
        SearchVector::query()->insert($toInsert);
    }

    /**
     * Request an embedding for every chunk, keeping the chunk's array key
     * as the key of its embedding in the result.
     *
     * @param string[] $chunks
     * @return array<int, float[]>
     */
    protected function chunksToEmbeddings(array $chunks, VectorQueryService $vectorQueryService): array
    {
        $embeddings = [];
        foreach ($chunks as $index => $chunk) {
            $embeddings[$index] = $vectorQueryService->generateEmbeddings($chunk);
        }
        return $embeddings;
    }

    /**
     * Split text into trimmed, non-empty lines. Original line positions are
     * kept as array keys, so the result may have gaps in its keys.
     *
     * @return string[]
     */
    protected function chunkText(string $text): array
    {
        // TODO - Join adjacent smaller chunks up
        $sections = array_map(
            static fn (string $section): string => trim($section),
            explode("\n", $text)
        );

        // Compare against '' explicitly: a plain array_filter() would also
        // discard a line whose whole content is "0".
        return array_filter($sections, static fn (string $section): bool => $section !== '');
    }

    /**
     * Combine the entity name and the content of its configured text field
     * into a single string, separated by a blank line.
     */
    protected function entityToPlainText(Entity $entity): string
    {
        $text = $entity->name . "\n\n" . $entity->{$entity->textField};
        // TODO - Add tags
        return $text;
    }
}

app/Search/Vectors/SearchVector.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?php
2+
3+
namespace BookStack\Search\Vectors;
4+
5+
use Illuminate\Database\Eloquent\Model;
6+
7+
/**
 * Eloquent model for a stored embedding of one chunk of entity text.
 *
 * @property string $entity_type Morph class of the entity the chunk belongs to.
 * @property int    $entity_id   ID of the entity the chunk belongs to.
 * @property string $text        The text chunk the embedding was generated from.
 * @property string $embedding   The stored embedding value.
 */
class SearchVector extends Model
{
    /**
     * Timestamp columns are not maintained for this model.
     */
    public $timestamps = false;
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<?php
2+
3+
namespace BookStack\Search\Vectors\Services;
4+
5+
use BookStack\Http\HttpRequestService;
6+
7+
/**
 * VectorQueryService implementation that requests embeddings from an
 * OpenAI API-compatible HTTP endpoint using bearer-token authentication.
 */
class OpenAiVectorQueryService implements VectorQueryService
{
    public function __construct(
        protected string $endpoint,
        protected string $key,
        protected HttpRequestService $http,
    ) {
    }

    /**
     * Send a JSON request to the given URI, relative to the configured
     * endpoint, and return the decoded JSON response body.
     *
     * @throws \RuntimeException If the response has a non-2xx status or
     *                           its body does not decode to a JSON array/object.
     */
    protected function jsonRequest(string $method, string $uri, array $data): array
    {
        // Normalise slashes so the endpoint and URI always join with exactly one "/".
        $fullUrl = rtrim($this->endpoint, '/') . '/' . ltrim($uri, '/');
        $client = $this->http->buildClient(10);
        $request = $this->http->jsonRequest($method, $fullUrl, $data)
            ->withHeader('Authorization', 'Bearer ' . $this->key);

        $response = $client->sendRequest($request);
        $status = $response->getStatusCode();
        $decoded = json_decode($response->getBody()->getContents(), true);

        if ($status < 200 || $status >= 300) {
            // Surface the service-provided error message when the body carries one.
            $detail = is_array($decoded) && is_string($decoded['error']['message'] ?? null)
                ? ': ' . $decoded['error']['message']
                : '';

            throw new \RuntimeException("LLM service request to {$fullUrl} failed with status {$status}{$detail}");
        }

        if (!is_array($decoded)) {
            throw new \RuntimeException("LLM service request to {$fullUrl} returned a response that is not valid JSON");
        }

        return $decoded;
    }

    /**
     * Generate an embedding for the given text via the "v1/embeddings" API,
     * using the "text-embedding-3-small" model.
     *
     * @return float[]
     *
     * @throws \RuntimeException If the response does not contain an embedding.
     */
    public function generateEmbeddings(string $text): array
    {
        $response = $this->jsonRequest('POST', 'v1/embeddings', [
            'input' => $text,
            'model' => 'text-embedding-3-small',
        ]);

        $embedding = $response['data'][0]['embedding'] ?? null;
        if (!is_array($embedding)) {
            throw new \RuntimeException('LLM service embeddings response did not contain an embedding');
        }

        return $embedding;
    }
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<?php
2+
3+
namespace BookStack\Search\Vectors\Services;
4+
5+
/**
 * Contract for services able to turn text into embedding vectors.
 */
interface VectorQueryService
{
    /**
     * Produce the embedding vector for a single chunk of text.
     *
     * @param string $text The text chunk to embed.
     * @return float[] The components of the generated embedding.
     */
    public function generateEmbeddings(string $text): array;
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
<?php
2+
3+
namespace BookStack\Search\Vectors;
4+
5+
use BookStack\Entities\Models\Entity;
6+
use Illuminate\Contracts\Queue\ShouldQueue;
7+
use Illuminate\Foundation\Queue\Queueable;
8+
9+
/**
 * Queueable job which generates and stores the embedding vectors
 * for a single entity.
 */
class StoreEntityVectorsJob implements ShouldQueue
{
    use Queueable;

    /**
     * The entity to generate vectors for.
     */
    protected Entity $entity;

    /**
     * Build a job instance targeting the given entity.
     */
    public function __construct(Entity $entity)
    {
        $this->entity = $entity;
    }

    /**
     * Run the job, delegating the work to the provided generator.
     */
    public function handle(EntityVectorGenerator $generator): void
    {
        $generator->generateAndStore($this->entity);
    }
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
<?php
2+
3+
namespace BookStack\Search\Vectors;
4+
5+
use BookStack\Http\HttpRequestService;
6+
use BookStack\Search\Vectors\Services\OpenAiVectorQueryService;
7+
use BookStack\Search\Vectors\Services\VectorQueryService;
8+
9+
/**
 * Resolves the vector query service selected via the "services.llm"
 * config value, and reports whether such a service has been configured.
 */
class VectorQueryServiceProvider
{
    public function __construct(
        protected HttpRequestService $http,
    ) {
    }

    /**
     * Build the configured vector query service.
     *
     * @throws \Exception If the configured service name is not a known service.
     */
    public function get(): VectorQueryService
    {
        $service = static::getServiceName();

        if ($service === 'openai') {
            // Cast to string since config values may be null (for example when the
            // option is unset), which the string-typed constructor would reject.
            $key = (string) config('services.openai.key');
            $endpoint = (string) config('services.openai.endpoint');
            return new OpenAiVectorQueryService($endpoint, $key, $this->http);
        }

        throw new \Exception("No '{$service}' LLM service found");
    }

    /**
     * Get the lower-cased name of the configured LLM service,
     * or an empty string when the config value is null or empty.
     */
    protected static function getServiceName(): string
    {
        // The cast avoids passing null into strtolower(), which is deprecated since PHP 8.1.
        return strtolower((string) config('services.llm'));
    }

    /**
     * Check whether a LLM service name has been configured.
     * This does not verify that the name maps to a known service.
     */
    public static function isEnabled(): bool
    {
        return !empty(static::getServiceName());
    }
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<?php
2+
3+
use Illuminate\Database\Migrations\Migration;
4+
use Illuminate\Database\Schema\Blueprint;
5+
use Illuminate\Support\Facades\Schema;
6+
7+
return new class extends Migration
{
    /**
     * Apply the migration: create the "search_vectors" table holding
     * one embedding per stored text chunk of an entity.
     */
    public function up(): void
    {
        // TODO - Handle compatibility with older databases that don't support vectors
        Schema::create('search_vectors', static function (Blueprint $table): void {
            // Polymorphic-style reference to the owning entity.
            $table->string('entity_type', 100);
            $table->integer('entity_id');

            // The source text chunk alongside its embedding.
            $table->text('text');
            $table->vector('embedding');

            // Composite index covering lookups by entity type + id.
            $table->index(['entity_type', 'entity_id']);
        });
    }

    /**
     * Revert the migration: drop the "search_vectors" table if present.
     */
    public function down(): void
    {
        Schema::dropIfExists('search_vectors');
    }
};

0 commit comments

Comments
 (0)