22
33namespace dokuwiki \plugin \aichat ;
44
5+ use dokuwiki \Extension \Event ;
56use dokuwiki \Extension \PluginInterface ;
6- use dokuwiki \plugin \aichat \Model \AbstractModel ;
7+ use dokuwiki \plugin \aichat \Model \ChatInterface ;
8+ use dokuwiki \plugin \aichat \Model \EmbeddingInterface ;
79use dokuwiki \plugin \aichat \Storage \AbstractStorage ;
810use dokuwiki \Search \Indexer ;
911use splitbrain \phpcli \CLI ;
@@ -21,8 +23,12 @@ class Embeddings
2123 /** @var int maximum overlap between chunks in tokens */
2224 final public const MAX_OVERLAP_LEN = 200 ;
2325
24- /** @var AbstractModel */
25- protected $ model ;
26+ /** @var ChatInterface */
27+ protected $ chatModel ;
28+
29+ /** @var EmbeddingInterface */
30+ protected $ embedModel ;
31+
2632 /** @var CLI|null */
2733 protected $ logger ;
2834 /** @var Encoder */
@@ -34,10 +40,33 @@ class Embeddings
3440 /** @var array remember sentences when chunking */
3541 private $ sentenceQueue = [];
3642
37- public function __construct (AbstractModel $ model , AbstractStorage $ storage )
38- {
39- $ this ->model = $ model ;
43+ /** @var float the time in seconds spent on the last similar chunk retrieval */
44+ public $ timeSpent = 0 ;
45+
46+ protected $ configChunkSize ;
47+ protected $ configContextChunks ;
48+ protected $ similarityThreshold ;
49+
/**
 * Embeddings constructor.
 *
 * Keeps references to the two models and the storage backend, and copies the
 * settings this class needs out of the plugin configuration.
 *
 * @param ChatInterface $chatModel
 * @param EmbeddingInterface $embedModel
 * @param AbstractStorage $storage
 * @param array $config The plugin configuration; the keys chunkSize, contextChunks
 *                      and similarityThreshold are read
 */
public function __construct(
    ChatInterface $chatModel,
    EmbeddingInterface $embedModel,
    AbstractStorage $storage,
    $config
) {
    // settings taken from the plugin configuration
    $this->similarityThreshold = $config['similarityThreshold'] / 100; // stored as 1/100 of the configured value
    $this->configContextChunks = $config['contextChunks'];
    $this->configChunkSize = $config['chunkSize'];

    // collaborators
    $this->storage = $storage;
    $this->embedModel = $embedModel;
    $this->chatModel = $chatModel;
}
4271
4372 /**
@@ -73,6 +102,20 @@ public function getTokenEncoder()
73102 return $ this ->tokenEncoder ;
74103 }
75104
/**
 * Return the chunk size to use
 *
 * The result is the smallest of three limits: a quarter of the chat model's maximum
 * input token length, 90% of the embedding model's maximum input token length, and
 * the configured chunk size.
 *
 * @return int
 */
public function getChunkSize()
{
    // floor() returns a float and the configured value is passed through untouched,
    // so cast the result to honour the documented integer return type
    return (int) min(
        floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
        floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
        $this->configChunkSize, // this is usually the smallest
    );
}
118+
76119 /**
77120 * Update the embeddings storage
78121 *
@@ -95,7 +138,7 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
95138 !page_exists ($ page ) ||
96139 isHiddenPage ($ page ) ||
97140 filesize (wikiFN ($ page )) < 150 || // skip very small pages
98- ($ skipRE && preg_match ($ skipRE , (string ) $ page )) ||
141+ ($ skipRE && preg_match ($ skipRE , (string )$ page )) ||
99142 ($ matchRE && !preg_match ($ matchRE , ": $ page " ))
100143 ) {
101144 // this page should not be in the index (anymore)
@@ -111,7 +154,8 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
111154 } else {
112155 // page is newer than the chunks we have, create new chunks
113156 $ this ->storage ->deletePageChunks ($ page , $ chunkID );
114- $ this ->storage ->addPageChunks ($ this ->createPageChunks ($ page , $ chunkID ));
157+ $ chunks = $ this ->createPageChunks ($ page , $ chunkID );
158+ if ($ chunks ) $ this ->storage ->addPageChunks ($ chunks );
115159 }
116160 }
117161 $ this ->storage ->finalizeCreation ();
@@ -126,9 +170,10 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
126170 * @param string $page Name of the page to split
127171 * @param int $firstChunkID The ID of the first chunk of this page
128172 * @return Chunk[] A list of chunks created for this page
173+ * @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
129174 * @throws \Exception
130175 */
131- protected function createPageChunks ($ page , $ firstChunkID )
176+ public function createPageChunks ($ page , $ firstChunkID )
132177 {
133178 $ chunkList = [];
134179
@@ -141,12 +186,25 @@ protected function createPageChunks($page, $firstChunkID)
141186 $ text = rawWiki ($ page );
142187 }
143188
189+ // allow plugins to modify the text before splitting
190+ $ eventData = [
191+ 'page ' => $ page ,
192+ 'body ' => '' ,
193+ 'metadata ' => ['title ' => $ page , 'relation_references ' => []],
194+ ];
195+ $ event = new Event ('INDEXER_PAGE_ADD ' , $ eventData );
196+ if ($ event ->advise_before ()) {
197+ $ text = $ eventData ['body ' ] . ' ' . $ text ;
198+ } else {
199+ $ text = $ eventData ['body ' ];
200+ }
201+
144202 $ parts = $ this ->splitIntoChunks ($ text );
145203 foreach ($ parts as $ part ) {
146- if (trim ((string ) $ part ) == '' ) continue ; // skip empty chunks
204+ if (trim ((string )$ part ) == '' ) continue ; // skip empty chunks
147205
148206 try {
149- $ embedding = $ this ->model ->getEmbedding ($ part );
207+ $ embedding = $ this ->embedModel ->getEmbedding ($ part );
150208 } catch (\Exception $ e ) {
151209 if ($ this ->logger instanceof CLI ) {
152210 $ this ->logger ->error (
@@ -186,19 +244,20 @@ protected function createPageChunks($page, $firstChunkID)
186244 public function getSimilarChunks ($ query , $ lang = '' )
187245 {
188246 global $ auth ;
189- $ vector = $ this ->model ->getEmbedding ($ query );
247+ $ vector = $ this ->embedModel ->getEmbedding ($ query );
190248
191- $ fetch = ceil (
192- ($ this ->model -> getMaxContextTokenLength () / $ this ->model -> getMaxEmbeddingTokenLength ())
193- * 1.5 // fetch a few more than needed, since not all chunks are maximum length
249+ $ fetch = min (
250+ ($ this ->chatModel -> getMaxInputTokenLength () / $ this ->getChunkSize ()),
251+ $ this -> configContextChunks
194252 );
195253
196254 $ time = microtime (true );
197255 $ chunks = $ this ->storage ->getSimilarChunks ($ vector , $ lang , $ fetch );
256+ $ this ->timeSpent = round (microtime (true ) - $ time , 2 );
198257 if ($ this ->logger instanceof CLI ) {
199258 $ this ->logger ->info (
200259 'Fetched {count} similar chunks from store in {time} seconds ' ,
201- ['count ' => count ($ chunks ), 'time ' => round ( microtime ( true ) - $ time , 2 ) ]
260+ ['count ' => count ($ chunks ), 'time ' => $ this -> timeSpent ]
202261 );
203262 }
204263
@@ -207,9 +266,10 @@ public function getSimilarChunks($query, $lang = '')
207266 foreach ($ chunks as $ chunk ) {
208267 // filter out chunks the user is not allowed to read
209268 if ($ auth && auth_quickaclcheck ($ chunk ->getPage ()) < AUTH_READ ) continue ;
269+ if ($ chunk ->getScore () < $ this ->similarityThreshold ) continue ;
210270
211271 $ chunkSize = count ($ this ->getTokenEncoder ()->encode ($ chunk ->getText ()));
212- if ($ size + $ chunkSize > $ this ->model -> getMaxContextTokenLength ()) break ; // we have enough
272+ if ($ size + $ chunkSize > $ this ->chatModel -> getMaxInputTokenLength ()) break ; // we have enough
213273
214274 $ result [] = $ chunk ;
215275 $ size += $ chunkSize ;
@@ -224,7 +284,7 @@ public function getSimilarChunks($query, $lang = '')
224284 * @throws \Exception
225285 * @todo support splitting too long sentences
226286 */
227- public function splitIntoChunks ($ text )
287+ protected function splitIntoChunks ($ text )
228288 {
229289 $ sentenceSplitter = new Sentence ();
230290 $ tiktok = $ this ->getTokenEncoder ();
@@ -236,23 +296,24 @@ public function splitIntoChunks($text)
236296 $ chunk = '' ;
237297 while ($ sentence = array_shift ($ sentences )) {
238298 $ slen = count ($ tiktok ->encode ($ sentence ));
239- if ($ slen > $ this ->model -> getMaxEmbeddingTokenLength ()) {
299+ if ($ slen > $ this ->getChunkSize ()) {
240300 // sentence is too long, we need to split it further
241301 if ($ this ->logger instanceof CLI ) $ this ->logger ->warning (
242302 'Sentence too long, splitting not implemented yet '
243303 );
244304 continue ;
245305 }
246306
247- if ($ chunklen + $ slen < $ this ->model -> getMaxEmbeddingTokenLength ()) {
307+ if ($ chunklen + $ slen < $ this ->getChunkSize ()) {
248308 // add to current chunk
249309 $ chunk .= $ sentence ;
250310 $ chunklen += $ slen ;
251311 // remember sentence for overlap check
252312 $ this ->rememberSentence ($ sentence );
253313 } else {
254314 // add current chunk to result
255- $ chunks [] = $ chunk ;
315+ $ chunk = trim ($ chunk );
316+ if ($ chunk !== '' ) $ chunks [] = $ chunk ;
256317
257318 // start new chunk with remembered sentences
258319 $ chunk = implode (' ' , $ this ->sentenceQueue );
0 commit comments