Skip to content

Commit 661701e

Browse files
committed
Use custom renderer when creating embeddings
Rendering makes plugin output available and handles includes. It might also help with #15. The renderer uses Markdown-like output since all LLMs seem to be very familiar with its syntax. This might help them to understand the document structure better. This also adds a breadcrumb trail at the top of each chunk, which might help with contextualization as well.
1 parent bcaa910 commit 661701e

File tree

2 files changed

+660
-14
lines changed

2 files changed

+660
-14
lines changed

Embeddings.php

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
use dokuwiki\Extension\Event;
66
use dokuwiki\Extension\PluginInterface;
7+
use dokuwiki\File\PageResolver;
78
use dokuwiki\plugin\aichat\Model\ChatInterface;
89
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
910
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
@@ -177,23 +178,20 @@ public function createPageChunks($page, $firstChunkID)
177178
{
178179
$chunkList = [];
179180

180-
$textRenderer = plugin_load('renderer', 'text');
181-
if ($textRenderer instanceof PluginInterface) {
182-
global $ID;
183-
$ID = $page;
184-
try {
185-
$text = p_cached_output(wikiFN($page), 'text', $page);
186-
} catch (\Throwable $e) {
187-
if ($this->logger) $this->logger->error(
188-
'Failed to render page {page} using raw text instead. {msg}',
189-
['page' => $page, 'msg' => $e->getMessage()]
190-
);
191-
$text = rawWiki($page);
192-
}
193-
} else {
181+
global $ID;
182+
$ID = $page;
183+
try {
184+
$text = p_cached_output(wikiFN($page), 'aichat', $page);
185+
} catch (\Throwable $e) {
186+
if ($this->logger) $this->logger->error(
187+
'Failed to render page {page}. Using raw text instead. {msg}',
188+
['page' => $page, 'msg' => $e->getMessage()]
189+
);
194190
$text = rawWiki($page);
195191
}
196192

193+
$crumbs = $this->breadcrumbTrail($page);
194+
197195
// allow plugins to modify the text before splitting
198196
$eventData = [
199197
'page' => $page,
@@ -211,6 +209,8 @@ public function createPageChunks($page, $firstChunkID)
211209
foreach ($parts as $part) {
212210
if (trim((string)$part) == '') continue; // skip empty chunks
213211

212+
$part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk
213+
214214
try {
215215
$embedding = $this->embedModel->getEmbedding($part);
216216
} catch (\Exception $e) {
@@ -285,6 +285,37 @@ public function getSimilarChunks($query, $lang = '')
285285
return $result;
286286
}
287287

288+
/**
 * Create a breadcrumb trail for the given page
 *
 * Builds one crumb per namespace level of the page ID, followed by a crumb for the page
 * itself, and joins them with ' » '. Each crumb is "<first heading> (<id part>)" when a
 * first heading is available, otherwise just the ID part. The trail is added as a prefix
 * to each chunk to give the AI some context.
 *
 * Pages that live in the root namespace get a trail consisting of the page crumb only.
 *
 * @param string $id The full page ID
 * @return string The breadcrumb trail
 */
protected function breadcrumbTrail($id)
{
    $resolver = new PageResolver($id);
    $crumbs = [];

    // getNS() yields false for pages without a namespace. Exploding that would produce
    // a single empty namespace and thus a bogus empty crumb, so skip the loop entirely.
    $ns = (string)getNS($id);
    $namespaces = ($ns === '') ? [] : explode(':', $ns);

    // all namespaces
    $check = '';
    foreach ($namespaces as $namespace) {
        // accumulate the namespace path level by level, e.g. "a:", then "a:b:"
        $check .= $namespace . ':';
        // NOTE(review): presumably resolves the trailing-colon ID to the namespace's
        // start page - confirm against PageResolver
        $page = $resolver->resolveId($check);
        $title = p_get_first_heading($page);
        $crumbs[] = $title ? "$title ($namespace)" : $namespace;
    }

    // the page itself
    $title = p_get_first_heading($id);
    $page = noNS($id);
    $crumbs[] = $title ? "$title ($page)" : $page;

    return implode(' » ', $crumbs);
}
288319

289320
/**
290321
* @param $text

0 commit comments

Comments
 (0)