1313# limitations under the License.
1414
1515import asyncio
16+ import json
1617import re
1718from pathlib import Path
1819from typing import Any , Literal
2627from veadk .configs .database_configs import NormalTOSConfig , TOSConfig
2728from veadk .knowledgebase .backends .base_backend import BaseKnowledgebaseBackend
2829from veadk .knowledgebase .backends .utils import build_vikingdb_knowledgebase_request
30+ from veadk .knowledgebase .entry import KnowledgebaseEntry
2931from veadk .utils .logger import get_logger
3032from veadk .utils .misc import formatted_timestamp
3133
@@ -106,6 +108,7 @@ def add_from_directory(
106108 directory : str ,
107109 tos_bucket_name : str | None = None ,
108110 tos_bucket_path : str = "knowledgebase" ,
111+ metadata : dict | None = None ,
109112 ** kwargs ,
110113 ) -> bool :
111114 """Add knowledge from a directory to the knowledgebase.
@@ -114,6 +117,8 @@ def add_from_directory(
114117 directory (str): The directory to add to knowledgebase.
115118 tos_bucket_name (str | None, optional): The bucket name of TOS. Defaults to None.
116119 tos_bucket_path (str, optional): The path of TOS bucket. Defaults to "knowledgebase".
120+ metadata (dict | None, optional): The metadata of the files. Defaults to None.
121+ **kwargs: Additional keyword arguments.
117122
118123 Returns:
119124 bool: True if successful, False otherwise.
@@ -126,6 +131,7 @@ def add_from_directory(
126131 content ,
127132 tos_bucket_name = tos_bucket_name ,
128133 object_key = f"{ tos_bucket_path } /{ file_name } " ,
134+ metadata = metadata ,
129135 )
130136 self ._add_doc (tos_url = tos_url )
131137 return True
@@ -136,6 +142,7 @@ def add_from_files(
136142 files : list [str ],
137143 tos_bucket_name : str | None = None ,
138144 tos_bucket_path : str = "knowledgebase" ,
145+ metadata : dict | None = None ,
139146 ** kwargs ,
140147 ) -> bool :
141148 """Add knowledge from a directory to the knowledgebase.
@@ -144,6 +151,8 @@ def add_from_files(
144151 files (list[str]): The files to add to knowledgebase.
145152 tos_bucket_name (str | None, optional): The bucket name of TOS. Defaults to None.
146153 tos_bucket_path (str, optional): The path of TOS bucket. Defaults to "knowledgebase".
154+ metadata (dict | None, optional): The metadata of the files. Defaults to None.
155+ **kwargs: Additional keyword arguments.
147156
148157 Returns:
149158 bool: True if successful, False otherwise.
@@ -155,6 +164,7 @@ def add_from_files(
155164 content ,
156165 tos_bucket_name = tos_bucket_name ,
157166 object_key = f"{ tos_bucket_path } /{ file_name } " ,
167+ metadata = metadata ,
158168 )
159169 self ._add_doc (tos_url = tos_url )
160170 return True
@@ -165,6 +175,7 @@ def add_from_text(
165175 text : str | list [str ],
166176 tos_bucket_name : str | None = None ,
167177 tos_bucket_path : str = "knowledgebase" ,
178+ metadata : dict | None = None ,
168179 ** kwargs ,
169180 ) -> bool :
170181 """Add knowledge from text to the knowledgebase.
@@ -189,7 +200,7 @@ def add_from_text(
189200 for _text , _object_key in zip (text , object_keys ):
190201 _content = _text .encode ("utf-8" )
191202 tos_url = self ._upload_bytes_to_tos (
192- _content , tos_bucket_name , _object_key
203+ _content , tos_bucket_name , _object_key , metadata = metadata
193204 )
194205 self ._add_doc (tos_url = tos_url )
195206 return True
@@ -198,7 +209,9 @@ def add_from_text(
198209 object_key = kwargs .get (
199210 "object_key" , f"veadk/knowledgebase/{ formatted_timestamp ()} .txt"
200211 )
201- tos_url = self ._upload_bytes_to_tos (content , tos_bucket_name , object_key )
212+ tos_url = self ._upload_bytes_to_tos (
213+ content , tos_bucket_name , object_key , metadata = metadata
214+ )
202215 self ._add_doc (tos_url = tos_url )
203216 else :
204217 raise ValueError ("text must be str or list[str]" )
@@ -210,6 +223,7 @@ def add_from_bytes(
210223 file_name : str ,
211224 tos_bucket_name : str | None = None ,
212225 tos_bucket_path : str = "knowledgebase" ,
226+ metadata : dict | None = None ,
213227 ** kwargs ,
214228 ) -> bool :
215229 """Add knowledge from bytes to the knowledgebase.
@@ -219,6 +233,8 @@ def add_from_bytes(
219233 file_name (str): The file name of the content.
220234 tos_bucket_name (str | None, optional): The bucket name of TOS. Defaults to None.
221235 tos_bucket_path (str, optional): The path of TOS bucket. Defaults to "knowledgebase".
236+ metadata (dict | None, optional): The metadata of the files. Defaults to None.
237+ **kwargs: Additional keyword arguments.
222238
223239 Returns:
224240 bool: True if successful, False otherwise.
@@ -228,15 +244,24 @@ def add_from_bytes(
228244 content ,
229245 tos_bucket_name = tos_bucket_name ,
230246 object_key = f"{ tos_bucket_path } /{ file_name } " ,
247+ metadata = metadata ,
231248 )
232249 response = self ._add_doc (tos_url = tos_url )
233250 if response ["code" ] == 0 :
234251 return True
235252 return False
236253
237254 @override
238- def search (self , query : str , top_k : int = 5 ) -> list :
239- return self ._search_knowledge (query = query , top_k = top_k )
255+ def search (
256+ self ,
257+ query : str ,
258+ top_k : int = 5 ,
259+ metadata : dict | None = None ,
260+ rerank : bool = True ,
261+ ) -> list :
262+ return self ._search_knowledge (
263+ query = query , top_k = top_k , metadata = metadata , rerank = rerank
264+ )
240265
241266 def delete_collection (self ) -> bool :
242267 DELETE_COLLECTION_PATH = "/api/knowledge/collection/delete"
@@ -359,7 +384,7 @@ def create_collection(self) -> None:
359384 response = self ._do_request (
360385 body = {
361386 "name" : self .index ,
362- "project" : "default" ,
387+ "project" : self . volcengine_project ,
363388 "description" : "Created by Volcengine Agent Development Kit (VeADK)." ,
364389 },
365390 path = CREATE_COLLECTION_PATH ,
@@ -372,10 +397,17 @@ def create_collection(self) -> None:
372397 )
373398
374399 def _upload_bytes_to_tos (
375- self , content : bytes , tos_bucket_name : str , object_key : str
400+ self ,
401+ content : bytes ,
402+ tos_bucket_name : str ,
403+ object_key : str ,
404+ metadata : dict | None = None ,
376405 ) -> str :
406+ # Here, we set the metadata via the TOS object, ref: https://www.volcengine.com/docs/84313/1254624
377407 self ._tos_client .bucket_name = tos_bucket_name
378- coro = self ._tos_client .upload (object_key = object_key , data = content )
408+ coro = self ._tos_client .upload (
409+ object_key = object_key , data = content , metadata = metadata
410+ )
379411 try :
380412 loop = asyncio .get_running_loop ()
381413 loop .run_until_complete (
@@ -391,7 +423,7 @@ def _add_doc(self, tos_url: str) -> Any:
391423 response = self ._do_request (
392424 body = {
393425 "collection_name" : self .index ,
394- "project" : "default" ,
426+ "project" : self . volcengine_project ,
395427 "add_type" : "tos" ,
396428 "tos_path" : tos_url ,
397429 },
@@ -400,14 +432,43 @@ def _add_doc(self, tos_url: str) -> Any:
400432 )
401433 return response
402434
403- def _search_knowledge (self , query : str , top_k : int = 5 ) -> list [str ]:
435+ def _search_knowledge (
436+ self ,
437+ query : str ,
438+ top_k : int = 5 ,
439+ metadata : dict | None = None ,
440+ rerank : bool = True ,
441+ chunk_diffusion_count : int | None = 3 ,
442+ ) -> list [KnowledgebaseEntry ]:
404443 SEARCH_KNOWLEDGE_PATH = "/api/knowledge/collection/search_knowledge"
405444
445+ query_param = (
446+ {
447+ "doc_filter" : {
448+ "op" : "and" ,
449+ "conds" : [
450+ {"op" : "must" , "field" : str (k ), "conds" : [str (v )]}
451+ for k , v in metadata .items ()
452+ ],
453+ }
454+ }
455+ if metadata
456+ else None
457+ )
458+
459+ post_precessing = {
460+ "rerank_swich" : rerank ,
461+ "chunk_diffusion_count" : chunk_diffusion_count ,
462+ }
463+
406464 response = self ._do_request (
407465 body = {
408466 "name" : self .index ,
467+ "project" : self .volcengine_project ,
409468 "query" : query ,
410469 "limit" : top_k ,
470+ "query_param" : query_param ,
471+ "post_processing" : post_precessing ,
411472 },
412473 path = SEARCH_KNOWLEDGE_PATH ,
413474 method = "POST" ,
@@ -418,11 +479,19 @@ def _search_knowledge(self, query: str, top_k: int = 5) -> list[str]:
418479 f"Error during knowledge search: { response .get ('code' )} , message: { response .get ('message' )} "
419480 )
420481
421- search_result_list = response .get ("data" , {}).get ("result_list" , [])
482+ entries = []
483+ for result in response .get ("data" , {}).get ("result_list" , []):
484+ doc_meta_raw_str = result .get ("doc_info" , {}).get ("doc_meta" )
485+ doc_meta_list = json .loads (doc_meta_raw_str ) if doc_meta_raw_str else []
486+ metadata = {}
487+ for meta in doc_meta_list :
488+ metadata [meta ["field_name" ]] = meta ["field_value" ]
422489
423- return [
424- search_result .get ("content" , "" ) for search_result in search_result_list
425- ]
490+ entries .append (
491+ KnowledgebaseEntry (content = result .get ("content" , "" ), metadata = metadata )
492+ )
493+
494+ return entries
426495
427496 def _do_request (
428497 self ,
@@ -445,4 +514,8 @@ def _do_request(
445514 headers = request .headers ,
446515 data = request .body ,
447516 )
517+ if not response .ok :
518+ logger .error (
519+ f"VikingDBKnowledgeBackend error during request: { response .json ()} "
520+ )
448521 return response .json ()
0 commit comments