Skip to content

Commit 50a1094

Browse files
committed
Merge branch 'refactor'
* refactor: (32 commits) rebuild the index when the embedding model changed; fix info output on used models; auto style fixes; ask the rephrased question only if it has more context; print score in chat; set custom info text for simulate sub command; emit the INDEXER_PAGE_ADD event; make threshold configurable; mechanisms to override things on command line; animate button on first show; automatic stylefixes; better JSON exception handling in storages; small adjustments; prefer prompted user messages over system prompts; various refactoring and introduction of a simulate command; separate the rephrasing model from the chat model; do not hardcode dimensions in qdrant storage; fix syntax error in qdrant storage; fix info output; correctly use storage setting; ...
2 parents a55df08 + bae450a commit 50a1094

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1852
-533
lines changed

AbstractCLI.php

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
<?php
2+
3+
namespace dokuwiki\plugin\aichat;
4+
5+
use dokuwiki\Extension\CLIPlugin;
6+
use splitbrain\phpcli\Options;
7+
8+
/**
 * Shared base class for the plugin's command line tools
 *
 * Loads the aichat helper, registers the common `--lang` option and applies
 * the debug and language overrides in main().
 */
abstract class AbstractCLI extends CLIPlugin
{
    /** @var \helper_plugin_aichat */
    protected $helper;

    /**
     * Load the aichat helper, make this CLI its logger and load the plugin configuration
     *
     * @inheritdoc
     */
    public function __construct($autocatch = true)
    {
        parent::__construct($autocatch);
        $this->helper = plugin_load('helper', 'aichat');
        $this->helper->setLogger($this);
        $this->loadConfig();
        // -1 disables PHP's memory limit for this process
        ini_set('memory_limit', '-1');
    }

    /**
     * Register the options shared by all commands (compact help and `--lang`)
     *
     * @inheritdoc
     */
    protected function setup(Options $options)
    {
        $options->useCompactHelp();

        $options->registerOption(
            'lang',
            'When set to a language code, it overrides the lang and preferUIlanguage settings and asks the ' .
            'bot to always use this language instead. ' .
            'When set to "auto" the bot is asked to detect the language of the input falling back to the wiki lang.',
            '',
            'lang'
        );
    }

    /**
     * Apply the shared options before a command runs
     *
     * Turns on debugging in the helper's factory when debug logging is enabled
     * and applies the `--lang` override to the plugin and wiki configuration.
     *
     * @inheritdoc
     */
    protected function main(Options $options)
    {
        if ($this->loglevel['debug']['enabled']) {
            $this->helper->factory->setDebug(true);
        }

        $lc = $options->getOpt('lang');
        if ($lc === 'auto') {
            // "auto": switch off the UI language preference
            $this->helper->updateConfig(['preferUIlanguage' => 0]);
        } elseif ($lc) {
            // explicit language code: prefer the UI language and make the given code the wiki language
            $this->helper->updateConfig(['preferUIlanguage' => 1]);
            global $conf;
            $conf['lang'] = $lc;
        }
    }
}

Chunk.php

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@ public function __construct(
3232

3333
public function __toString(): string
3434
{
35-
return $this->page . '#' . $this->id;
35+
$string = $this->page . '#' . $this->id;
36+
if ($this->score) {
37+
$string .= sprintf(' (%.2f)', $this->score);
38+
}
39+
return $string;
3640
}
3741

3842
/**

Embeddings.php

Lines changed: 82 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22

33
namespace dokuwiki\plugin\aichat;
44

5+
use dokuwiki\Extension\Event;
56
use dokuwiki\Extension\PluginInterface;
6-
use dokuwiki\plugin\aichat\Model\AbstractModel;
7+
use dokuwiki\plugin\aichat\Model\ChatInterface;
8+
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
79
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
810
use dokuwiki\Search\Indexer;
911
use splitbrain\phpcli\CLI;
@@ -21,8 +23,12 @@ class Embeddings
2123
/** @var int maximum overlap between chunks in tokens */
2224
final public const MAX_OVERLAP_LEN = 200;
2325

24-
/** @var AbstractModel */
25-
protected $model;
26+
/** @var ChatInterface */
27+
protected $chatModel;
28+
29+
/** @var EmbeddingInterface */
30+
protected $embedModel;
31+
2632
/** @var CLI|null */
2733
protected $logger;
2834
/** @var Encoder */
@@ -34,10 +40,33 @@ class Embeddings
3440
/** @var array remember sentences when chunking */
3541
private $sentenceQueue = [];
3642

37-
public function __construct(AbstractModel $model, AbstractStorage $storage)
38-
{
39-
$this->model = $model;
43+
/** @var int the time spent for the last similar chunk retrieval */
44+
public $timeSpent = 0;
45+
46+
protected $configChunkSize;
47+
protected $configContextChunks;
48+
protected $similarityThreshold;
49+
50+
/**
 * Set up the embeddings handler
 *
 * Keeps references to the chat model, the embedding model and the storage
 * backend, and copies the chunk size, the number of context chunks and the
 * similarity threshold from the configuration. The `similarityThreshold`
 * setting is divided by 100 before it is stored.
 *
 * @param ChatInterface $chatModel
 * @param EmbeddingInterface $embedModel
 * @param AbstractStorage $storage
 * @param array $config The plugin configuration
 */
public function __construct(
    ChatInterface $chatModel,
    EmbeddingInterface $embedModel,
    AbstractStorage $storage,
    $config
) {
    // read the settings first, in the same order as they are stored below
    $chunkSize = $config['chunkSize'];
    $contextChunks = $config['contextChunks'];
    $threshold = $config['similarityThreshold'] / 100;

    // collaborators
    $this->storage = $storage;
    $this->embedModel = $embedModel;
    $this->chatModel = $chatModel;

    // settings
    $this->configChunkSize = $chunkSize;
    $this->configContextChunks = $contextChunks;
    $this->similarityThreshold = $threshold;
}
4271

4372
/**
@@ -73,6 +102,20 @@ public function getTokenEncoder()
73102
return $this->tokenEncoder;
74103
}
75104

105+
/**
 * Return the chunk size to use
 *
 * This is the smallest of three limits: a quarter of the chat model's maximum
 * input length, 90% of the embedding model's maximum input length, and the
 * configured chunk size.
 *
 * @return int
 */
public function getChunkSize()
{
    // floor() yields floats and the configured value is passed through as is,
    // so cast the winner to honour the documented int return type
    return (int) min(
        floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
        floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
        $this->configChunkSize, // this is usually the smallest
    );
}
118+
76119
/**
77120
* Update the embeddings storage
78121
*
@@ -95,7 +138,7 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
95138
!page_exists($page) ||
96139
isHiddenPage($page) ||
97140
filesize(wikiFN($page)) < 150 || // skip very small pages
98-
($skipRE && preg_match($skipRE, (string) $page)) ||
141+
($skipRE && preg_match($skipRE, (string)$page)) ||
99142
($matchRE && !preg_match($matchRE, ":$page"))
100143
) {
101144
// this page should not be in the index (anymore)
@@ -111,7 +154,8 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
111154
} else {
112155
// page is newer than the chunks we have, create new chunks
113156
$this->storage->deletePageChunks($page, $chunkID);
114-
$this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
157+
$chunks = $this->createPageChunks($page, $chunkID);
158+
if ($chunks) $this->storage->addPageChunks($chunks);
115159
}
116160
}
117161
$this->storage->finalizeCreation();
@@ -126,9 +170,10 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
126170
* @param string $page Name of the page to split
127171
* @param int $firstChunkID The ID of the first chunk of this page
128172
* @return Chunk[] A list of chunks created for this page
173+
* @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
129174
* @throws \Exception
130175
*/
131-
protected function createPageChunks($page, $firstChunkID)
176+
public function createPageChunks($page, $firstChunkID)
132177
{
133178
$chunkList = [];
134179

@@ -141,12 +186,25 @@ protected function createPageChunks($page, $firstChunkID)
141186
$text = rawWiki($page);
142187
}
143188

189+
// allow plugins to modify the text before splitting
190+
$eventData = [
191+
'page' => $page,
192+
'body' => '',
193+
'metadata' => ['title' => $page, 'relation_references' => []],
194+
];
195+
$event = new Event('INDEXER_PAGE_ADD', $eventData);
196+
if ($event->advise_before()) {
197+
$text = $eventData['body'] . ' ' . $text;
198+
} else {
199+
$text = $eventData['body'];
200+
}
201+
144202
$parts = $this->splitIntoChunks($text);
145203
foreach ($parts as $part) {
146-
if (trim((string) $part) == '') continue; // skip empty chunks
204+
if (trim((string)$part) == '') continue; // skip empty chunks
147205

148206
try {
149-
$embedding = $this->model->getEmbedding($part);
207+
$embedding = $this->embedModel->getEmbedding($part);
150208
} catch (\Exception $e) {
151209
if ($this->logger instanceof CLI) {
152210
$this->logger->error(
@@ -186,19 +244,20 @@ protected function createPageChunks($page, $firstChunkID)
186244
public function getSimilarChunks($query, $lang = '')
187245
{
188246
global $auth;
189-
$vector = $this->model->getEmbedding($query);
247+
$vector = $this->embedModel->getEmbedding($query);
190248

191-
$fetch = ceil(
192-
($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
193-
* 1.5 // fetch a few more than needed, since not all chunks are maximum length
249+
$fetch = min(
250+
($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
251+
$this->configContextChunks
194252
);
195253

196254
$time = microtime(true);
197255
$chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
256+
$this->timeSpent = round(microtime(true) - $time, 2);
198257
if ($this->logger instanceof CLI) {
199258
$this->logger->info(
200259
'Fetched {count} similar chunks from store in {time} seconds',
201-
['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
260+
['count' => count($chunks), 'time' => $this->timeSpent]
202261
);
203262
}
204263

@@ -207,9 +266,10 @@ public function getSimilarChunks($query, $lang = '')
207266
foreach ($chunks as $chunk) {
208267
// filter out chunks the user is not allowed to read
209268
if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
269+
if ($chunk->getScore() < $this->similarityThreshold) continue;
210270

211271
$chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
212-
if ($size + $chunkSize > $this->model->getMaxContextTokenLength()) break; // we have enough
272+
if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough
213273

214274
$result[] = $chunk;
215275
$size += $chunkSize;
@@ -224,7 +284,7 @@ public function getSimilarChunks($query, $lang = '')
224284
* @throws \Exception
225285
* @todo support splitting too long sentences
226286
*/
227-
public function splitIntoChunks($text)
287+
protected function splitIntoChunks($text)
228288
{
229289
$sentenceSplitter = new Sentence();
230290
$tiktok = $this->getTokenEncoder();
@@ -236,23 +296,24 @@ public function splitIntoChunks($text)
236296
$chunk = '';
237297
while ($sentence = array_shift($sentences)) {
238298
$slen = count($tiktok->encode($sentence));
239-
if ($slen > $this->model->getMaxEmbeddingTokenLength()) {
299+
if ($slen > $this->getChunkSize()) {
240300
// sentence is too long, we need to split it further
241301
if ($this->logger instanceof CLI) $this->logger->warning(
242302
'Sentence too long, splitting not implemented yet'
243303
);
244304
continue;
245305
}
246306

247-
if ($chunklen + $slen < $this->model->getMaxEmbeddingTokenLength()) {
307+
if ($chunklen + $slen < $this->getChunkSize()) {
248308
// add to current chunk
249309
$chunk .= $sentence;
250310
$chunklen += $slen;
251311
// remember sentence for overlap check
252312
$this->rememberSentence($sentence);
253313
} else {
254314
// add current chunk to result
255-
$chunks[] = $chunk;
315+
$chunk = trim($chunk);
316+
if ($chunk !== '') $chunks[] = $chunk;
256317

257318
// start new chunk with remembered sentences
258319
$chunk = implode(' ', $this->sentenceQueue);

0 commit comments

Comments
 (0)