Skip to content

Commit 110d02d

Browse files
committed
refine(kb): reconstruct knowledgebase
1 parent bde4785 commit 110d02d

File tree

9 files changed

+822
-143
lines changed

9 files changed

+822
-143
lines changed

veadk/configs/model_configs.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,20 @@ class ModelConfig(BaseSettings):
4040
@cached_property
def api_key(self) -> str:
    """Return the API key used for the agent model.

    The ``MODEL_AGENT_API_KEY`` environment variable wins when it is set to
    a non-empty value; otherwise a token is obtained from ``ARKVeAuth``.
    The result is cached on the instance by ``cached_property``.
    """
    env_key = os.getenv("MODEL_AGENT_API_KEY")
    if env_key:
        return env_key
    # Only reach out to ARKVeAuth when the environment gives us nothing.
    return ARKVeAuth().token
43+
44+
45+
class EmbeddingModelConfig(BaseSettings):
    """Settings describing the embedding model used by knowledgebase backends.

    Fields are read through ``BaseSettings`` with the ``MODEL_EMBEDDING_``
    environment prefix (e.g. ``MODEL_EMBEDDING_NAME``).
    """

    model_config = SettingsConfigDict(env_prefix="MODEL_EMBEDDING_")

    name: str = "doubao-embedding-text-240715"
    """Model name for embedding."""

    dim: int = 2560
    """Embedding dim is different from different models."""

    # NOTE(review): this default already ends in ``/embeddings``; OpenAI-style
    # clients commonly append that path themselves -- confirm the resulting
    # request URL is the intended one.
    api_base: str = "https://ark.cn-beijing.volces.com/api/v3/embeddings"
    """The api base of the model for embedding."""

    @cached_property
    def api_key(self) -> str:
        """Return the API key used for the embedding model.

        A non-empty ``MODEL_EMBEDDING_API_KEY`` environment variable takes
        precedence; otherwise a token is obtained from ``ARKVeAuth``. The
        result is cached on the instance by ``cached_property``.
        """
        env_key = os.getenv("MODEL_EMBEDDING_API_KEY")
        if env_key:
            return env_key
        # Only reach out to ARKVeAuth when the environment gives us nothing.
        return ARKVeAuth().token
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from abc import ABC, abstractmethod
16+
17+
from pydantic import BaseModel
18+
19+
20+
class BaseKnowledgebaseBackend(ABC, BaseModel):
    """Abstract pydantic model describing a knowledgebase storage backend.

    Concrete backends must implement the three ``add_from_*`` methods and
    ``search``. ``delete``, ``list_docs`` and ``list_chunks`` are not
    abstract: the versions defined here do nothing and return ``None``.
    """

    index: str
    """Index or collection name of the vector storage."""

    @abstractmethod
    def add_from_directory(self, directory: str, **kwargs) -> bool:
        """Add knowledge from the files found under ``directory``."""

    @abstractmethod
    def add_from_files(self, files: list[str], **kwargs) -> bool:
        """Add knowledge from the given list of file paths."""

    @abstractmethod
    def add_from_text(self, text: str | list[str], **kwargs) -> bool:
        """Add knowledge from one raw text string or a list of them."""

    @abstractmethod
    def search(self, **kwargs) -> list:
        """Search the knowledgebase and return the matching entries."""

    def delete(self, **kwargs) -> bool:
        """Delete knowledge from the knowledgebase.

        Not abstract, so subclasses may leave it unimplemented. This default
        does nothing and returns ``None`` (despite the ``bool`` annotation).
        """

    def list_docs(self, **kwargs) -> None:
        """List original documents in the knowledgebase (no-op here)."""
        return None

    def list_chunks(self, **kwargs) -> None:
        """List embedded document chunks in the knowledgebase (no-op here)."""
        return None
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from llama_index.core import Document, SimpleDirectoryReader, VectorStoreIndex
16+
from llama_index.core.schema import BaseNode
17+
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
18+
from pydantic import Field
19+
from typing_extensions import Any, override
20+
21+
from veadk.configs.model_configs import EmbeddingModelConfig
22+
from veadk.knowledgebase.backends.base_backend import BaseKnowledgebaseBackend
23+
from veadk.knowledgebase.backends.utils import get_llama_index_splitter
24+
25+
26+
class InMemoryKnowledgeBackend(BaseKnowledgebaseBackend):
    """Knowledgebase backend backed by an in-process ``VectorStoreIndex``.

    Documents are split into nodes, embedded with an OpenAI-compatible
    embedding endpoint and inserted into an index that is created empty in
    ``model_post_init``; no external vector store is configured.
    """

    embedding_config: EmbeddingModelConfig = Field(default_factory=EmbeddingModelConfig)
    """Embedding model configs"""

    def model_post_init(self, __context: Any) -> None:
        """Create the embedding client and an empty vector index.

        Pydantic calls this hook once the model fields have been validated.
        """
        self._embed_model = OpenAILikeEmbedding(
            model_name=self.embedding_config.name,
            api_key=self.embedding_config.api_key,
            api_base=self.embedding_config.api_base,
        )
        self._vector_index = VectorStoreIndex([], embed_model=self._embed_model)
        # Kept for backward compatibility; `search` builds its own retriever
        # so that `top_k` can be honoured per call.
        self._retriever = self._vector_index.as_retriever()

    @override
    def add_from_directory(self, directory: str, **kwargs: Any) -> bool:
        """Load every readable file under ``directory`` and index it.

        Args:
            directory: Directory handed to ``SimpleDirectoryReader``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        documents = SimpleDirectoryReader(input_dir=directory).load_data()
        return self._index_documents(documents)

    @override
    def add_from_files(self, files: list[str], **kwargs: Any) -> bool:
        """Load the given files and index them.

        Args:
            files: File paths handed to ``SimpleDirectoryReader``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        documents = SimpleDirectoryReader(input_files=files).load_data()
        return self._index_documents(documents)

    @override
    def add_from_text(self, text: str | list[str], **kwargs: Any) -> bool:
        """Index one raw text string or a list of them.

        Args:
            text: A single string, or a list where each item becomes its own
                ``Document``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        texts = [text] if isinstance(text, str) else text
        documents = [Document(text=t) for t in texts]
        return self._index_documents(documents)

    @override
    def search(self, query: str, top_k: int = 5, **kwargs: Any) -> list[str]:
        """Return the text of the nodes most similar to ``query``.

        Args:
            query: Natural-language query string.
            top_k: Maximum number of nodes to retrieve.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            The ``text`` of each retrieved node, in retriever order.
        """
        # `retrieve()` takes only the query; the result count is a property
        # of the retriever itself, so build one configured for this call.
        retriever = self._vector_index.as_retriever(similarity_top_k=top_k)
        return [node.text for node in retriever.retrieve(query)]

    def _index_documents(self, documents: list[Document]) -> bool:
        """Split ``documents`` into nodes and insert them into the index."""
        nodes = self._split_documents(documents)
        self._vector_index.insert_nodes(nodes)
        return True

    def _split_documents(self, documents: list[Document]) -> list[BaseNode]:
        """Split each document into chunks.

        The splitter is chosen per document from its ``file_path`` metadata
        (an empty string when the document did not come from a file).
        """
        nodes: list[BaseNode] = []
        for document in documents:
            splitter = get_llama_index_splitter(document.metadata.get("file_path", ""))
            nodes.extend(splitter.get_nodes_from_documents([document]))
        return nodes
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from llama_index.core import (
16+
Document,
17+
SimpleDirectoryReader,
18+
StorageContext,
19+
VectorStoreIndex,
20+
)
21+
from llama_index.core.schema import BaseNode
22+
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
23+
from llama_index.vector_stores.opensearch import (
24+
OpensearchVectorClient,
25+
OpensearchVectorStore,
26+
)
27+
from pydantic import Field
28+
from typing_extensions import Any, override
29+
30+
from veadk.configs.database_configs import OpensearchConfig
31+
from veadk.configs.model_configs import EmbeddingModelConfig
32+
from veadk.knowledgebase.backends.base_backend import BaseKnowledgebaseBackend
33+
from veadk.knowledgebase.backends.utils import get_llama_index_splitter
34+
35+
36+
class OpensearchKnowledgeBackend(BaseKnowledgebaseBackend):
    """Knowledgebase backend that stores vectors in OpenSearch.

    ``index`` (inherited from the base class) is used as the OpenSearch
    index name, and ``embedding_config.dim`` as the vector dimension.
    """

    opensearch_config: OpensearchConfig = Field(default_factory=OpensearchConfig)
    """Opensearch client configs"""

    embedding_config: EmbeddingModelConfig = Field(default_factory=EmbeddingModelConfig)
    """Embedding model configs"""

    def model_post_init(self, __context: Any) -> None:
        """Create the OpenSearch client, embedding client and vector index.

        Pydantic calls this hook once the model fields have been validated.
        """
        # NOTE(review): TLS is on but certificate verification is disabled,
        # which leaves the connection open to man-in-the-middle attacks.
        # Consider making this configurable for production deployments.
        self._opensearch_client = OpensearchVectorClient(
            endpoint=self.opensearch_config.host,
            port=self.opensearch_config.port,
            http_auth=(
                self.opensearch_config.username,
                self.opensearch_config.password,
            ),
            use_ssl=True,
            verify_certs=False,
            dim=self.embedding_config.dim,
            index=self.index,  # collection name
        )

        self._vector_store = OpensearchVectorStore(client=self._opensearch_client)

        self._storage_context = StorageContext.from_defaults(
            vector_store=self._vector_store
        )

        self._embed_model = OpenAILikeEmbedding(
            model_name=self.embedding_config.name,
            api_key=self.embedding_config.api_key,
            api_base=self.embedding_config.api_base,
        )

        self._vector_index = VectorStoreIndex.from_documents(
            documents=[],
            storage_context=self._storage_context,
            embed_model=self._embed_model,
        )
        # Kept for backward compatibility; `search` builds its own retriever
        # so that `top_k` can be honoured per call.
        self._retriever = self._vector_index.as_retriever()

    @override
    def add_from_directory(self, directory: str, **kwargs: Any) -> bool:
        """Load every readable file under ``directory`` and index it.

        Args:
            directory: Directory handed to ``SimpleDirectoryReader``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        documents = SimpleDirectoryReader(input_dir=directory).load_data()
        return self._index_documents(documents)

    @override
    def add_from_files(self, files: list[str], **kwargs: Any) -> bool:
        """Load the given files and index them.

        Args:
            files: File paths handed to ``SimpleDirectoryReader``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        documents = SimpleDirectoryReader(input_files=files).load_data()
        return self._index_documents(documents)

    @override
    def add_from_text(self, text: str | list[str], **kwargs: Any) -> bool:
        """Index one raw text string or a list of them.

        Args:
            text: A single string, or a list where each item becomes its own
                ``Document``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        texts = [text] if isinstance(text, str) else text
        documents = [Document(text=t) for t in texts]
        return self._index_documents(documents)

    @override
    def search(self, query: str, top_k: int = 5, **kwargs: Any) -> list[str]:
        """Return the text of the nodes most similar to ``query``.

        Args:
            query: Natural-language query string.
            top_k: Maximum number of nodes to retrieve.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            The ``text`` of each retrieved node, in retriever order.
        """
        # `retrieve()` takes only the query; the result count is a property
        # of the retriever itself, so build one configured for this call.
        retriever = self._vector_index.as_retriever(similarity_top_k=top_k)
        return [node.text for node in retriever.retrieve(query)]

    def _index_documents(self, documents: list[Document]) -> bool:
        """Split ``documents`` into nodes and insert them into the index."""
        nodes = self._split_documents(documents)
        self._vector_index.insert_nodes(nodes)
        return True

    def _split_documents(self, documents: list[Document]) -> list[BaseNode]:
        """Split each document into chunks.

        The splitter is chosen per document from its ``file_path`` metadata
        (an empty string when the document did not come from a file).
        """
        nodes: list[BaseNode] = []
        for document in documents:
            splitter = get_llama_index_splitter(document.metadata.get("file_path", ""))
            nodes.extend(splitter.get_nodes_from_documents([document]))
        return nodes
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from llama_index.core import (
16+
Document,
17+
SimpleDirectoryReader,
18+
StorageContext,
19+
VectorStoreIndex,
20+
)
21+
from llama_index.core.schema import BaseNode
22+
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
23+
from llama_index.vector_stores.redis import RedisVectorStore
24+
from pydantic import Field
25+
from redis import Redis
26+
from typing_extensions import Any, override
27+
28+
from veadk.configs.database_configs import RedisConfig
29+
from veadk.configs.model_configs import EmbeddingModelConfig
30+
from veadk.knowledgebase.backends.base_backend import BaseKnowledgebaseBackend
31+
from veadk.knowledgebase.backends.utils import get_llama_index_splitter
32+
33+
34+
class RedisKnowledgeBackend(BaseKnowledgebaseBackend):
    """Knowledgebase backend that stores vectors in Redis.

    NOTE(review): neither ``index`` nor ``embedding_config.dim`` is passed to
    ``RedisVectorStore`` here, so the store presumably uses its default
    schema (index name and vector dimension) -- confirm this matches the
    configured embedding model.
    """

    redis_config: RedisConfig = Field(default_factory=RedisConfig)
    """Redis client configs"""

    # Defaulted like the other backends so the model can be built from
    # environment settings alone.
    embedding_config: EmbeddingModelConfig = Field(default_factory=EmbeddingModelConfig)
    """Embedding model configs"""

    def model_post_init(self, __context: Any) -> None:
        """Create the Redis client, embedding client and vector index.

        Pydantic calls this hook once the model fields have been validated.
        """
        # We will use `from_url` to init Redis client once the
        # AK/SK -> STS token is ready.
        # self._redis_client = Redis.from_url(url=...)

        self._redis_client = Redis(
            host=self.redis_config.host,
            port=self.redis_config.port,
            db=self.redis_config.db,
            password=self.redis_config.password,
        )

        self._embed_model = OpenAILikeEmbedding(
            model_name=self.embedding_config.name,
            api_key=self.embedding_config.api_key,
            api_base=self.embedding_config.api_base,
        )

        self._vector_store = RedisVectorStore(
            redis_client=self._redis_client, overwrite=True
        )

        self._storage_context = StorageContext.from_defaults(
            vector_store=self._vector_store
        )

        # Pass the configured embedding model explicitly; without it the
        # index would fall back to LlamaIndex's global default embedder.
        self._vector_index = VectorStoreIndex.from_documents(
            documents=[],
            storage_context=self._storage_context,
            embed_model=self._embed_model,
        )
        # Kept for backward compatibility; `search` builds its own retriever
        # so that `top_k` can be honoured per call.
        self._retriever = self._vector_index.as_retriever()

    @override
    def add_from_directory(self, directory: str, **kwargs: Any) -> bool:
        """Load every readable file under ``directory`` and index it.

        Args:
            directory: Directory handed to ``SimpleDirectoryReader``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        documents = SimpleDirectoryReader(input_dir=directory).load_data()
        return self._index_documents(documents)

    @override
    def add_from_files(self, files: list[str], **kwargs: Any) -> bool:
        """Load the given files and index them.

        Args:
            files: File paths handed to ``SimpleDirectoryReader``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        documents = SimpleDirectoryReader(input_files=files).load_data()
        return self._index_documents(documents)

    @override
    def add_from_text(self, text: str | list[str], **kwargs: Any) -> bool:
        """Index one raw text string or a list of them.

        Args:
            text: A single string, or a list where each item becomes its own
                ``Document``.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            ``True`` once the nodes have been inserted.
        """
        texts = [text] if isinstance(text, str) else text
        documents = [Document(text=t) for t in texts]
        return self._index_documents(documents)

    @override
    def search(self, query: str, top_k: int = 5, **kwargs: Any) -> list[str]:
        """Return the text of the nodes most similar to ``query``.

        Args:
            query: Natural-language query string.
            top_k: Maximum number of nodes to retrieve.
            **kwargs: Accepted for compatibility with the base interface; ignored.

        Returns:
            The ``text`` of each retrieved node, in retriever order.
        """
        # `retrieve()` takes only the query; the result count is a property
        # of the retriever itself, so build one configured for this call.
        retriever = self._vector_index.as_retriever(similarity_top_k=top_k)
        return [node.text for node in retriever.retrieve(query)]

    def _index_documents(self, documents: list[Document]) -> bool:
        """Split ``documents`` into nodes and insert them into the index."""
        nodes = self._split_documents(documents)
        self._vector_index.insert_nodes(nodes)
        return True

    def _split_documents(self, documents: list[Document]) -> list[BaseNode]:
        """Split each document into chunks.

        The splitter is chosen per document from its ``file_path`` metadata
        (an empty string when the document did not come from a file).
        """
        nodes: list[BaseNode] = []
        for document in documents:
            splitter = get_llama_index_splitter(document.metadata.get("file_path", ""))
            nodes.extend(splitter.get_nodes_from_documents([document]))
        return nodes

0 commit comments

Comments
 (0)