Skip to content

Commit 5e6dd16

Browse files
committed
Added ChromaDB vector storage support
1 parent dc355d5 commit 5e6dd16

File tree

5 files changed

+338
-1
lines changed

5 files changed

+338
-1
lines changed

Storage/ChromaStorage.php

Lines changed: 312 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,312 @@
1+
<?php
2+
3+
namespace dokuwiki\plugin\aichat\Storage;
4+
5+
use dokuwiki\HTTP\DokuHTTPClient;
6+
use dokuwiki\plugin\aichat\Chunk;
7+
8+
/**
 * Implements the storage backend using a Chroma DB in server mode
 *
 * All communication goes through runQuery() against the Chroma REST API (v1). Chunk IDs are
 * sent to Chroma as strings and cast back to integers when Chunk objects are built.
 */
class ChromaStorage extends AbstractStorage
{
    /** @var string URL to the chroma server instance (without trailing slash) */
    protected $baseurl;

    /** @var DokuHTTPClient http client */
    protected $http;

    /** @var string tenant passed as query parameter on every request */
    protected $tenant = 'default_tenant';

    /** @var string database passed as query parameter on every request */
    protected $database = 'default_database';

    /** @var string name of the configured collection */
    protected $collection = '';

    /** @var string cached collection ID, empty until resolved by getCollectionID() */
    protected $collectionID = '';

    /**
     * ChromaStorage constructor.
     *
     * Reads the connection settings from the aichat helper plugin and prepares the HTTP
     * client (JSON headers, optional bearer token) used for all API calls.
     */
    public function __construct()
    {
        $helper = plugin_load('helper', 'aichat');

        // strip trailing slashes so that appending '/api/v1' never produces '//'
        $this->baseurl = rtrim((string)$helper->getConf('chroma_baseurl'), '/');
        $this->tenant = $helper->getConf('chroma_tenant');
        $this->database = $helper->getConf('chroma_database');
        $this->collection = $helper->getConf('chroma_collection');

        $this->http = new DokuHTTPClient();
        $this->http->headers['Content-Type'] = 'application/json';
        $this->http->headers['Accept'] = 'application/json';
        $this->http->keep_alive = false;
        $this->http->timeout = 30;

        // authentication is optional, only send the header when a key is configured
        $apikey = $helper->getConf('chroma_apikey');
        if ($apikey) {
            $this->http->headers['Authorization'] = 'Bearer ' . $apikey;
        }
    }

    /**
     * Execute a query against the Chroma API
     *
     * @param string $endpoint API endpoint, will be added to the base URL
     * @param mixed $data The data to send, will be JSON encoded
     * @param string $method POST|GET
     * @return mixed The decoded JSON response (arrays for objects, scalars for plain values)
     * @throws \Exception when encoding fails, no/invalid response is received or the API reports an error
     */
    protected function runQuery($endpoint, $data, $method = 'POST')
    {
        // tenant and database come from configuration, encode them to keep the URL valid
        $url = $this->baseurl . '/api/v1' . $endpoint .
            '?tenant=' . rawurlencode((string)$this->tenant) .
            '&database=' . rawurlencode((string)$this->database);

        if ($data === []) {
            // an empty PHP array would be encoded as a JSON list, the API expects an object
            $json = '{}';
        } else {
            // throws \JsonException (an \Exception) instead of silently sending "false"
            $json = json_encode($data, JSON_THROW_ON_ERROR);
        }

        $this->http->sendRequest($url, $json, $method);
        $response = (string)$this->http->resp_body;

        // compare against the empty string: a body of "0" (e.g. an empty collection's count)
        // is a valid response and must not be treated as missing
        if ($response === '') {
            throw new \Exception('Chroma API returned no response. ' . $this->http->error);
        }

        try {
            $result = json_decode($response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('Chroma API returned invalid JSON. ' . $response, 0, $e);
        }

        if ((int)$this->http->status !== 200) {
            // the API reports errors in several different shapes, try them in order
            if (isset($result['detail'][0]['msg'])) {
                $error = $result['detail'][0]['msg'];
            } elseif (isset($result['detail']['msg'])) {
                $error = $result['detail']['msg'];
            } elseif (isset($result['detail']) && is_string($result['detail'])) {
                $error = $result['detail'];
            } elseif (isset($result['error'])) {
                $error = $result['error'];
            } else {
                $error = $this->http->error;
            }

            // make sure structured error data can be appended to the message
            if (!is_string($error)) {
                $error = json_encode($error);
            }

            throw new \Exception('Chroma API returned error. ' . $error);
        }

        return $result;
    }

    /**
     * Get the collection ID for the configured collection
     *
     * The collection is requested with get_or_create, the resulting ID is cached for
     * subsequent calls.
     *
     * @return string
     * @throws \Exception
     */
    protected function getCollectionID()
    {
        if ($this->collectionID) {
            return $this->collectionID;
        }

        $result = $this->runQuery(
            '/collections/',
            [
                'name' => $this->collection,
                'get_or_create' => true,
            ]
        );

        if (!is_array($result) || !isset($result['id'])) {
            throw new \Exception('Chroma API returned no collection ID for ' . $this->collection);
        }

        $this->collectionID = $result['id'];
        return $this->collectionID;
    }

    /**
     * Build the list of chunk IDs that may belong to a single page
     *
     * @param int $firstChunkID The first chunk ID of the page
     * @param int $count How many consecutive IDs to generate
     * @return string[] The IDs as strings, as sent to the API
     */
    protected function getPageChunkIDs($firstChunkID, $count = 100)
    {
        return array_map('strval', range($firstChunkID, $firstChunkID + $count - 1, 1));
    }

    /** @inheritdoc */
    public function getChunk($chunkID)
    {
        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => [(string)$chunkID],
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                ],
            ]
        );

        // nothing stored under this ID
        if (empty($data['ids'])) {
            return null;
        }

        return new Chunk(
            $data['metadatas'][0]['page'],
            (int)$data['ids'][0],
            $data['documents'][0],
            $data['embeddings'][0],
            $data['metadatas'][0]['language'] ?? '',
            $data['metadatas'][0]['created']
        );
    }

    /** @inheritdoc */
    public function startCreation($clear = false)
    {
        if ($clear) {
            // the collection is deleted by name; reset the cached ID so that the next
            // getCollectionID() call requests (and thereby recreates) the collection
            $this->runQuery('/collections/' . $this->collection, '', 'DELETE');
            $this->collectionID = '';
        }
    }

    /** @inheritdoc */
    public function reusePageChunks($page, $firstChunkID)
    {
        // no-op
    }

    /** @inheritdoc */
    public function deletePageChunks($page, $firstChunkID)
    {
        // delete all possible chunk IDs
        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/delete',
            [
                'ids' => $this->getPageChunkIDs($firstChunkID),
            ]
        );
    }

    /** @inheritdoc */
    public function addPageChunks($chunks)
    {
        $ids = [];
        $embeddings = [];
        $metadatas = [];
        $documents = [];

        foreach ($chunks as $chunk) {
            $ids[] = (string)$chunk->getId();
            $embeddings[] = $chunk->getEmbedding();
            $metadatas[] = [
                'page' => $chunk->getPage(),
                'created' => $chunk->getCreated(),
                'language' => $chunk->getLanguage(),
            ];
            $documents[] = $chunk->getText();
        }

        // nothing to store, skip the request
        if (!$ids) {
            return;
        }

        $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/upsert',
            [
                'ids' => $ids,
                'embeddings' => $embeddings,
                'metadatas' => $metadatas,
                'documents' => $documents,
            ]
        );
    }

    /** @inheritdoc */
    public function finalizeCreation()
    {
        // no-op
    }

    /** @inheritdoc */
    public function runMaintenance()
    {
        // no-op
    }

    /** @inheritdoc */
    public function getPageChunks($page, $firstChunkID)
    {
        $ids = $this->getPageChunkIDs($firstChunkID);

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/get',
            [
                'ids' => $ids,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                ],
                'limit' => count($ids),
            ]
        );

        // always return an array, callers should not have to deal with null here
        if (empty($data['ids'])) {
            return [];
        }

        $chunks = [];
        foreach ($data['ids'] as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][$idx]['page'],
                (int)$id,
                $data['documents'][$idx],
                $data['embeddings'][$idx],
                $data['metadatas'][$idx]['language'] ?? '',
                $data['metadatas'][$idx]['created']
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function getSimilarChunks($vector, $lang = '', $limit = 4)
    {
        $limit *= 2; // we can't check ACLs, so we return more than requested

        // the filter operator has to be a key => value map ({"$eq": "en"}), not a list
        $filter = $lang ? ['language' => ['$eq' => $lang]] : null;

        $data = $this->runQuery(
            '/collections/' . $this->getCollectionID() . '/query',
            [
                'query_embeddings' => [$vector],
                'n_results' => (int)$limit,
                'where' => $filter,
                'include' => [
                    'metadatas',
                    'documents',
                    'embeddings',
                    'distances',
                ],
            ]
        );

        // results are grouped per query embedding, we only ever send one
        if (empty($data['ids'][0])) {
            return [];
        }

        $chunks = [];
        foreach ($data['ids'][0] as $idx => $id) {
            $chunks[] = new Chunk(
                $data['metadatas'][0][$idx]['page'],
                (int)$id,
                $data['documents'][0][$idx],
                $data['embeddings'][0][$idx],
                $data['metadatas'][0][$idx]['language'] ?? '',
                $data['metadatas'][0][$idx]['created'],
                $data['distances'][0][$idx]
            );
        }
        return $chunks;
    }

    /** @inheritdoc */
    public function statistics()
    {
        $collectionID = $this->getCollectionID();
        $count = $this->runQuery('/collections/' . $collectionID . '/count', '', 'GET');
        $version = $this->runQuery('/version', '', 'GET');

        return [
            'chroma_version' => $version,
            'collection_id' => $collectionID,
            'chunks' => $count,
        ];
    }
}

conf/default.php

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313
$conf['pinecone_apikey'] = '';
1414
$conf['pinecone_baseurl'] = '';
1515

16+
$conf['chroma_baseurl'] = '';
17+
$conf['chroma_apikey'] = '';
18+
$conf['chroma_tenant'] = 'default_tenant';
19+
$conf['chroma_database'] = 'default_database';
20+
$conf['chroma_collection'] = 'aichat';
21+
1622
$conf['logging'] = 0;
1723
$conf['restrict'] = '';
1824
$conf['preferUIlanguage'] = 0;

conf/metadata.php

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@
2020
$meta['pinecone_apikey'] = array('string');
2121
$meta['pinecone_baseurl'] = array('string');
2222

23+
$meta['chroma_baseurl'] = array('string');
24+
$meta['chroma_apikey'] = array('string');
25+
$meta['chroma_tenant'] = array('string');
26+
$meta['chroma_database'] = array('string');
27+
$meta['chroma_collection'] = array('string');
28+
2329
$meta['logging'] = array('onoff');
2430
$meta['restrict'] = array('string');
2531
$meta['preferUIlanguage'] = array('multichoice', '_choices' => array(

helper.php

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
<?php
22

3-
use dokuwiki\Extension\Plugin;
43
use dokuwiki\Extension\CLIPlugin;
4+
use dokuwiki\Extension\Plugin;
55
use dokuwiki\plugin\aichat\AIChat;
66
use dokuwiki\plugin\aichat\Chunk;
77
use dokuwiki\plugin\aichat\Embeddings;
88
use dokuwiki\plugin\aichat\Model\AbstractModel;
99
use dokuwiki\plugin\aichat\Model\OpenAI\GPT35Turbo;
1010
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
11+
use dokuwiki\plugin\aichat\Storage\ChromaStorage;
1112
use dokuwiki\plugin\aichat\Storage\PineconeStorage;
1213
use dokuwiki\plugin\aichat\Storage\SQLiteStorage;
1314

@@ -115,6 +116,8 @@ public function getStorage()
115116
if (!$this->storage instanceof AbstractStorage) {
116117
if ($this->getConf('pinecone_apikey')) {
117118
$this->storage = new PineconeStorage();
119+
} elseif ($this->getConf('chroma_baseurl')) {
120+
$this->storage = new ChromaStorage();
118121
} else {
119122
$this->storage = new SQLiteStorage();
120123
}

lang/en/settings.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@
88
$lang['openaikey'] = 'Your OpenAI API key';
99
$lang['openaiorg'] = 'Your OpenAI organization ID (if any)';
1010
$lang['model'] = 'Which model to use. When changing models, be sure to run <code>php bin/plugin.php aichat embed -c</code> to rebuild the vector storage.';
11+
12+
$lang['pinecone_apikey'] = 'Your Pinecone API key if you want to use Pinecone as a storage backend.';
13+
$lang['pinecone_baseurl'] = 'Your Pinecone base URL if you want to use Pinecone as a storage backend.';
14+
15+
$lang['chroma_baseurl'] = 'Your Chroma base URL if you want to use Chroma as a storage backend.';
16+
$lang['chroma_apikey'] = 'Your Chroma API key. Empty if no authentication is required.';
17+
$lang['chroma_tenant'] = 'Your Chroma tenant name.';
18+
$lang['chroma_database'] = 'Your Chroma database name.';
19+
$lang['chroma_collection'] = 'The collection to use. Will be created.';
20+
1121
$lang['logging'] = 'Log all questions and answers. Use the <a href="?do=admin&page=logviewer&facility=aichat">Log Viewer</a> to access.';
1222
$lang['restrict'] = 'Restrict access to these users and groups (comma separated). Leave empty to allow all users.';
1323
$lang['preferUIlanguage'] = 'How to work with multilingual wikis? (Requires the translation plugin)';

0 commit comments

Comments
 (0)