Skip to content

Commit 3314eab

Browse files
committed
feat(kb): enhance knowledgebase functionality with metadata support and improve search handling
1 parent 7e0f5d2 commit 3314eab

File tree

7 files changed

+171
-34
lines changed

7 files changed

+171
-34
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,13 @@ cython_debug/
167167
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
168168
.idea/
169169

170+
# Visual Studio Code
171+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
172+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
173+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
174+
# you could uncomment the following to ignore the entire vscode folder
175+
.vscode/
176+
170177
# Ruff stuff:
171178
.ruff_cache/
172179

veadk/integrations/ve_tos/ve_tos.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,21 @@ def _set_cors_rules(self, bucket_name: str) -> bool:
178178
logger.error(f"Failed to set CORS rules for bucket {bucket_name}: {str(e)}")
179179
return False
180180

181-
def _build_object_key_for_file(self, data_path: str) -> str:
182-
"""generate TOS object key"""
181+
def build_tos_url(
182+
self, user_id: str, app_name: str, session_id: str, data_path: str
183+
) -> tuple[str, str]:
184+
"""Builds the TOS object key and URL for the given parameters.
185+
186+
Args:
187+
user_id (str): User ID
188+
app_name (str): App name
189+
session_id (str): Session ID
190+
data_path (str): Data path
191+
192+
Returns:
193+
tuple[str, str]: Object key and TOS URL.
194+
"""
195+
183196
parsed_url = urlparse(data_path)
184197

185198
# Generate object key
@@ -245,14 +258,24 @@ def build_tos_url(self, object_key: str, bucket_name: str = "") -> str:
245258

246259
# deprecated
247260
def upload(
248-
self, data: Union[str, bytes], bucket_name: str = "", object_key: str = ""
261+
self, object_key: str, data: Union[str, bytes], metadata: dict | None = None
249262
):
263+
"""Uploads data to TOS.
264+
265+
Args:
266+
object_key (str): The object key for the uploaded data.
267+
data (Union[str, bytes]): The data to upload, either as a file path or raw bytes.
268+
metadata (dict | None, optional): Metadata to associate with the object. Defaults to None.
269+
270+
Raises:
271+
ValueError: If the data type is unsupported.
272+
"""
250273
if isinstance(data, str):
251274
# data is a file path
252-
return asyncio.to_thread(self.upload_file, data, bucket_name, object_key)
275+
return asyncio.to_thread(self.upload_file, data, bucket_name, object_key, metadata)
253276
elif isinstance(data, bytes):
254277
# data is bytes content
255-
return asyncio.to_thread(self.upload_bytes, data, bucket_name, object_key)
278+
return asyncio.to_thread(self.upload_bytes, data, bucket_name, object_key, metadata)
256279
else:
257280
error_msg = f"Upload failed: data type error. Only str (file path) and bytes are supported, got {type(data)}"
258281
logger.error(error_msg)

veadk/knowledgebase/backends/base_backend.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,19 @@ def precheck_index_naming(self) -> None:
2929
"""
3030

3131
@abstractmethod
32-
def add_from_directory(self, directory: str, **kwargs) -> bool:
32+
def add_from_directory(self, directory: str, *args, **kwargs) -> bool:
3333
"""Add knowledge from a directory to the knowledgebase"""
3434

3535
@abstractmethod
36-
def add_from_files(self, files: list[str], **kwargs) -> bool:
36+
def add_from_files(self, files: list[str], *args, **kwargs) -> bool:
3737
"""Add knowledge (e.g., documents, strings, ...) to the knowledgebase"""
3838

3939
@abstractmethod
40-
def add_from_text(self, text: str | list[str], **kwargs) -> bool:
40+
def add_from_text(self, text: str | list[str], *args, **kwargs) -> bool:
4141
"""Add knowledge from text to knowledgebase"""
4242

4343
@abstractmethod
44-
def search(self, **kwargs) -> list:
44+
def search(self, *args, **kwargs) -> list:
4545
"""Search knowledge from knowledgebase"""
4646

4747
# Optional methods for future use:

veadk/knowledgebase/backends/vikingdb_knowledge_backend.py

Lines changed: 86 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import asyncio
16+
import json
1617
import re
1718
from pathlib import Path
1819
from typing import Any, Literal
@@ -26,6 +27,7 @@
2627
from veadk.configs.database_configs import NormalTOSConfig, TOSConfig
2728
from veadk.knowledgebase.backends.base_backend import BaseKnowledgebaseBackend
2829
from veadk.knowledgebase.backends.utils import build_vikingdb_knowledgebase_request
30+
from veadk.knowledgebase.entry import KnowledgebaseEntry
2931
from veadk.utils.logger import get_logger
3032
from veadk.utils.misc import formatted_timestamp
3133

@@ -106,6 +108,7 @@ def add_from_directory(
106108
directory: str,
107109
tos_bucket_name: str | None = None,
108110
tos_bucket_path: str = "knowledgebase",
111+
metadata: dict | None = None,
109112
**kwargs,
110113
) -> bool:
111114
"""Add knowledge from a directory to the knowledgebase.
@@ -114,6 +117,8 @@ def add_from_directory(
114117
directory (str): The directory to add to knowledgebase.
115118
tos_bucket_name (str | None, optional): The bucket name of TOS. Defaults to None.
116119
tos_bucket_path (str, optional): The path of TOS bucket. Defaults to "knowledgebase".
120+
metadata (dict | None, optional): The metadata of the files. Defaults to None.
121+
**kwargs: Additional keyword arguments.
117122
118123
Returns:
119124
bool: True if successful, False otherwise.
@@ -126,6 +131,7 @@ def add_from_directory(
126131
content,
127132
tos_bucket_name=tos_bucket_name,
128133
object_key=f"{tos_bucket_path}/{file_name}",
134+
metadata=metadata,
129135
)
130136
self._add_doc(tos_url=tos_url)
131137
return True
@@ -136,6 +142,7 @@ def add_from_files(
136142
files: list[str],
137143
tos_bucket_name: str | None = None,
138144
tos_bucket_path: str = "knowledgebase",
145+
metadata: dict | None = None,
139146
**kwargs,
140147
) -> bool:
141148
"""Add knowledge from files to the knowledgebase.
@@ -144,6 +151,8 @@ def add_from_files(
144151
files (list[str]): The files to add to knowledgebase.
145152
tos_bucket_name (str | None, optional): The bucket name of TOS. Defaults to None.
146153
tos_bucket_path (str, optional): The path of TOS bucket. Defaults to "knowledgebase".
154+
metadata (dict | None, optional): The metadata of the files. Defaults to None.
155+
**kwargs: Additional keyword arguments.
147156
148157
Returns:
149158
bool: True if successful, False otherwise.
@@ -155,6 +164,7 @@ def add_from_files(
155164
content,
156165
tos_bucket_name=tos_bucket_name,
157166
object_key=f"{tos_bucket_path}/{file_name}",
167+
metadata=metadata,
158168
)
159169
self._add_doc(tos_url=tos_url)
160170
return True
@@ -165,6 +175,7 @@ def add_from_text(
165175
text: str | list[str],
166176
tos_bucket_name: str | None = None,
167177
tos_bucket_path: str = "knowledgebase",
178+
metadata: dict | None = None,
168179
**kwargs,
169180
) -> bool:
170181
"""Add knowledge from text to the knowledgebase.
@@ -189,7 +200,7 @@ def add_from_text(
189200
for _text, _object_key in zip(text, object_keys):
190201
_content = _text.encode("utf-8")
191202
tos_url = self._upload_bytes_to_tos(
192-
_content, tos_bucket_name, _object_key
203+
_content, tos_bucket_name, _object_key, metadata=metadata
193204
)
194205
self._add_doc(tos_url=tos_url)
195206
return True
@@ -198,7 +209,9 @@ def add_from_text(
198209
object_key = kwargs.get(
199210
"object_key", f"veadk/knowledgebase/{formatted_timestamp()}.txt"
200211
)
201-
tos_url = self._upload_bytes_to_tos(content, tos_bucket_name, object_key)
212+
tos_url = self._upload_bytes_to_tos(
213+
content, tos_bucket_name, object_key, metadata=metadata
214+
)
202215
self._add_doc(tos_url=tos_url)
203216
else:
204217
raise ValueError("text must be str or list[str]")
@@ -210,6 +223,7 @@ def add_from_bytes(
210223
file_name: str,
211224
tos_bucket_name: str | None = None,
212225
tos_bucket_path: str = "knowledgebase",
226+
metadata: dict | None = None,
213227
**kwargs,
214228
) -> bool:
215229
"""Add knowledge from bytes to the knowledgebase.
@@ -219,6 +233,8 @@ def add_from_bytes(
219233
file_name (str): The file name of the content.
220234
tos_bucket_name (str | None, optional): The bucket name of TOS. Defaults to None.
221235
tos_bucket_path (str, optional): The path of TOS bucket. Defaults to "knowledgebase".
236+
metadata (dict | None, optional): The metadata of the files. Defaults to None.
237+
**kwargs: Additional keyword arguments.
222238
223239
Returns:
224240
bool: True if successful, False otherwise.
@@ -228,15 +244,24 @@ def add_from_bytes(
228244
content,
229245
tos_bucket_name=tos_bucket_name,
230246
object_key=f"{tos_bucket_path}/{file_name}",
247+
metadata=metadata,
231248
)
232249
response = self._add_doc(tos_url=tos_url)
233250
if response["code"] == 0:
234251
return True
235252
return False
236253

237254
@override
238-
def search(self, query: str, top_k: int = 5) -> list:
239-
return self._search_knowledge(query=query, top_k=top_k)
255+
def search(
256+
self,
257+
query: str,
258+
top_k: int = 5,
259+
metadata: dict | None = None,
260+
rerank: bool = True,
261+
) -> list:
262+
return self._search_knowledge(
263+
query=query, top_k=top_k, metadata=metadata, rerank=rerank
264+
)
240265

241266
def delete_collection(self) -> bool:
242267
DELETE_COLLECTION_PATH = "/api/knowledge/collection/delete"
@@ -359,7 +384,7 @@ def create_collection(self) -> None:
359384
response = self._do_request(
360385
body={
361386
"name": self.index,
362-
"project": "default",
387+
"project": self.volcengine_project,
363388
"description": "Created by Volcengine Agent Development Kit (VeADK).",
364389
},
365390
path=CREATE_COLLECTION_PATH,
@@ -372,10 +397,17 @@ def create_collection(self) -> None:
372397
)
373398

374399
def _upload_bytes_to_tos(
375-
self, content: bytes, tos_bucket_name: str, object_key: str
400+
self,
401+
content: bytes,
402+
tos_bucket_name: str,
403+
object_key: str,
404+
metadata: dict | None = None,
376405
) -> str:
406+
# Here, we set the metadata via the TOS object, ref: https://www.volcengine.com/docs/84313/1254624
377407
self._tos_client.bucket_name = tos_bucket_name
378-
coro = self._tos_client.upload(object_key=object_key, data=content)
408+
coro = self._tos_client.upload(
409+
object_key=object_key, data=content, metadata=metadata
410+
)
379411
try:
380412
loop = asyncio.get_running_loop()
381413
loop.run_until_complete(
@@ -391,7 +423,7 @@ def _add_doc(self, tos_url: str) -> Any:
391423
response = self._do_request(
392424
body={
393425
"collection_name": self.index,
394-
"project": "default",
426+
"project": self.volcengine_project,
395427
"add_type": "tos",
396428
"tos_path": tos_url,
397429
},
@@ -400,14 +432,43 @@ def _add_doc(self, tos_url: str) -> Any:
400432
)
401433
return response
402434

403-
def _search_knowledge(self, query: str, top_k: int = 5) -> list[str]:
435+
def _search_knowledge(
436+
self,
437+
query: str,
438+
top_k: int = 5,
439+
metadata: dict | None = None,
440+
rerank: bool = True,
441+
chunk_diffusion_count: int | None = 3,
442+
) -> list[KnowledgebaseEntry]:
404443
SEARCH_KNOWLEDGE_PATH = "/api/knowledge/collection/search_knowledge"
405444

445+
query_param = (
446+
{
447+
"doc_filter": {
448+
"op": "and",
449+
"conds": [
450+
{"op": "must", "field": str(k), "conds": [str(v)]}
451+
for k, v in metadata.items()
452+
],
453+
}
454+
}
455+
if metadata
456+
else None
457+
)
458+
459+
post_precessing = {
460+
"rerank_swich": rerank,
461+
"chunk_diffusion_count": chunk_diffusion_count,
462+
}
463+
406464
response = self._do_request(
407465
body={
408466
"name": self.index,
467+
"project": self.volcengine_project,
409468
"query": query,
410469
"limit": top_k,
470+
"query_param": query_param,
471+
"post_processing": post_precessing,
411472
},
412473
path=SEARCH_KNOWLEDGE_PATH,
413474
method="POST",
@@ -418,11 +479,19 @@ def _search_knowledge(self, query: str, top_k: int = 5) -> list[str]:
418479
f"Error during knowledge search: {response.get('code')}, message: {response.get('message')}"
419480
)
420481

421-
search_result_list = response.get("data", {}).get("result_list", [])
482+
entries = []
483+
for result in response.get("data", {}).get("result_list", []):
484+
doc_meta_raw_str = result.get("doc_info", {}).get("doc_meta")
485+
doc_meta_list = json.loads(doc_meta_raw_str) if doc_meta_raw_str else []
486+
metadata = {}
487+
for meta in doc_meta_list:
488+
metadata[meta["field_name"]] = meta["field_value"]
422489

423-
return [
424-
search_result.get("content", "") for search_result in search_result_list
425-
]
490+
entries.append(
491+
KnowledgebaseEntry(content=result.get("content", ""), metadata=metadata)
492+
)
493+
494+
return entries
426495

427496
def _do_request(
428497
self,
@@ -445,4 +514,8 @@ def _do_request(
445514
headers=request.headers,
446515
data=request.body,
447516
)
517+
if not response.ok:
518+
logger.error(
519+
f"VikingDBKnowledgeBackend error during request: {response.json()}"
520+
)
448521
return response.json()

veadk/knowledgebase/entry.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from pydantic import BaseModel
16+
17+
18+
class KnowledgebaseEntry(BaseModel):
19+
"""Represents a single entry in the knowledgebase."""
20+
21+
# The main content of the knowledgebase entry.
22+
content: str
23+
24+
# Optional metadata associated with the entry.
25+
metadata: dict | None = None

0 commit comments

Comments
 (0)