Skip to content

Commit d5c102b

Browse files
committed
Regular expressions to limit the indexed pages. Implements #5
Both regular expressions (when set) need to apply at the same time. E.g., a page MUST match the matchRegex and MUST NOT match the skipRegex to be indexed. The regular expressions are applied when running the `embed` command line command. Pages no longer adhering to a changed regex setup will be removed from the vector store. For the sqlite storage it is recommended to re-cluster the index when the regexes are changed by running the `maintenance` command.
1 parent db36318 commit d5c102b

File tree

5 files changed

+51
-4
lines changed

5 files changed

+51
-4
lines changed

Embeddings.php

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,12 @@ public function getTokenEncoder()
7777
* Update the embeddings storage
7878
*
7979
* @param string $skipRE Regular expression to filter out pages (full RE with delimiters)
80+
* @param string $matchRE Regular expression pages have to match to be included (full RE with delimiters)
8081
* @param bool $clear Should any existing storage be cleared before updating?
8182
* @return void
8283
* @throws \Exception
8384
*/
84-
public function createNewIndex($skipRE = '', $clear = false)
85+
public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
8586
{
8687
$indexer = new Indexer();
8788
$pages = $indexer->getPages();
@@ -94,7 +95,8 @@ public function createNewIndex($skipRE = '', $clear = false)
9495
!page_exists($page) ||
9596
isHiddenPage($page) ||
9697
filesize(wikiFN($page)) < 150 || // skip very small pages
97-
($skipRE && preg_match($skipRE, (string) $page))
98+
($skipRE && preg_match($skipRE, (string) $page)) ||
99+
($matchRE && !preg_match($matchRE, ":$page"))
98100
) {
99101
// this page should not be in the index (anymore)
100102
$this->storage->deletePageChunks($page, $chunkID);

cli.php

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,12 +265,13 @@ protected function runMaintenance()
265265
* Recreate chunks and embeddings for all pages
266266
*
267267
* @return void
268-
* @todo make skip regex configurable
269268
*/
270269
protected function createEmbeddings($clear)
271270
{
271+
[$skipRE, $matchRE] = $this->getRegexps();
272+
272273
$start = time();
273-
$this->helper->getEmbeddings()->createNewIndex('/(^|:)(playground|sandbox)(:|$)/', $clear);
274+
$this->helper->getEmbeddings()->createNewIndex($skipRE, $matchRE, $clear);
274275
$this->notice('Peak memory used: {memory}', ['memory' => filesize_h(memory_get_peak_usage(true))]);
275276
$this->notice('Spent time: {time}min', ['time' => round((time() - $start) / 60, 2)]);
276277
}
@@ -354,4 +355,42 @@ protected function readLine($prompt)
354355

355356
return $value;
356357
}
358+
359+
/**
 * Read the skip and match regex from the config
 *
 * The configured values are stored without delimiters. Each one is wrapped
 * in `/…/` and test-compiled; a pattern that does not compile is reported
 * on the console and replaced by an empty string.
 *
 * @return string[] [$skipRE, $matchRE]
 */
protected function getRegexps()
{
    $skipRE = $this->compileConfRegex('skipRegex', 'Skipping pages matching ');
    $matchRE = $this->compileConfRegex('matchRegex', 'Only indexing pages matching ');

    return [$skipRE, $matchRE];
}

/**
 * Turn one delimiter-less regex setting into a validated, delimited pattern
 *
 * @param string $confKey Name of the config setting holding the raw pattern
 * @param string $successPrefix Text printed in front of the pattern when it is valid
 * @return string The delimited pattern, or '' when the setting is empty or invalid
 */
private function compileConfRegex($confKey, $successPrefix)
{
    $raw = $this->getConf($confKey);
    if (!$raw) {
        return '';
    }

    $regex = '/' . $raw . '/';

    // Test-compile against an empty subject. The subject must be a string:
    // passing null is deprecated since PHP 8.1. The @ keeps the compile
    // warning of a broken pattern off the console; the failure is reported
    // through error() below instead.
    if (@preg_match($regex, '') === false) {
        $this->error(preg_last_error_msg());
        $this->error('Invalid regular expression in $conf[\'' . $confKey . '\']. Ignored.');
        return '';
    }

    $this->success($successPrefix . $regex);
    return $regex;
}
357396
}

conf/default.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,6 @@
2525

2626
$conf['logging'] = 0;
2727
$conf['restrict'] = '';
28+
// Anchor on the start of the page id OR a namespace separator, so that
// top-level playground/sandbox pages are skipped as well as nested ones.
$conf['skipRegex'] = '(^|:)(playground|sandbox)(:|$)';
29+
$conf['matchRegex'] = '';
2830
$conf['preferUIlanguage'] = 0;

conf/metadata.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232

3333
$meta['logging'] = array('onoff');
3434
$meta['restrict'] = array('string');
35+
$meta['skipRegex'] = array('string');
36+
$meta['matchRegex'] = array('string');
3537
$meta['preferUIlanguage'] = array('multichoice', '_choices' => array(
3638
\dokuwiki\plugin\aichat\AIChat::LANG_AUTO_ALL,
3739
\dokuwiki\plugin\aichat\AIChat::LANG_UI_ALL,

lang/en/settings.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424

2525
$lang['logging'] = 'Log all questions and answers. Use the <a href="?do=admin&page=logviewer&facility=aichat">Log Viewer</a> to access.';
2626
$lang['restrict'] = 'Restrict access to these users and groups (comma separated). Leave empty to allow all users.';
27+
$lang['skipRegex'] = 'Skip indexing pages matching this regular expression (no delimiters).';
28+
$lang['matchRegex'] = 'Only index pages matching this regular expression (no delimiters).';
2729
$lang['preferUIlanguage'] = 'How to work with multilingual wikis? (Requires the translation plugin)';
2830

2931
$lang['preferUIlanguage_o_0'] = 'Guess language, use all sources';

0 commit comments

Comments
 (0)