Skip to content

Commit 43adf6e

Browse files
committed
Use chroma for temp vectordb
1 parent dc09458 commit 43adf6e

File tree

3 files changed

+35
-46
lines changed

3 files changed

+35
-46
lines changed

llmstack/common/blocks/data/store/vectorstore/chroma.py

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
from typing import Any, Tuple
33
from typing import List
44
from uuid import uuid4
5+
import uuid
56

67
import chromadb
78
from pydantic import BaseModel
@@ -13,6 +14,7 @@
1314
class ChromaConfiguration(BaseModel):
1415
_type = 'Chroma'
1516
anonymized_telemetry = False
17+
is_persistent = True
1618

1719

1820
class Chroma(VectorStoreInterface):
@@ -23,8 +25,12 @@ class Chroma(VectorStoreInterface):
2325
def __init__(self, *args, **kwargs) -> None:
2426
configuration = ChromaConfiguration(**kwargs)
2527
db_settings = chromadb.config.Settings(**configuration.dict())
26-
self._client = chromadb.PersistentClient(
27-
path=settings.DEFAULT_VECTOR_DATABASE_PATH, settings=db_settings) if settings.DEFAULT_VECTOR_DATABASE_PATH else chromadb.Client(settings=db_settings)
28+
29+
if db_settings.is_persistent:
30+
self._client = chromadb.PersistentClient(
31+
path=settings.DEFAULT_VECTOR_DATABASE_PATH, settings=db_settings) if settings.DEFAULT_VECTOR_DATABASE_PATH else chromadb.Client(settings=db_settings)
32+
else:
33+
self._client = chromadb.EphemeralClient(settings=db_settings)
2834

2935
def add_text(self, index_name: str, document: Document, **kwargs: Any):
3036
content_key = document.page_content_key
@@ -101,3 +107,9 @@ def similarity_search(self, index_name: str, document_query: DocumentQuery, **kw
101107
)
102108

103109
return result
110+
111+
def create_temp_index(self):
112+
index_name = 'Temp_{}'.format(str(uuid.uuid4())).replace('-', '_')
113+
self.create_index(schema='', index_name=index_name)
114+
115+
return index_name

llmstack/processors/providers/promptly/data_uri_text_extract.py

Lines changed: 9 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,8 @@
88
from pydantic import conint
99
from pydantic import Field
1010

11-
from llmstack.common.blocks.data.store.vectorstore import Document
12-
from llmstack.common.blocks.data.store.vectorstore.temp_weaviate import TempWeaviate
11+
from llmstack.common.blocks.data.store.vectorstore import Document, DocumentQuery
12+
from llmstack.common.blocks.data.store.vectorstore.chroma import Chroma
1313
from llmstack.common.utils.text_extract import extract_text_from_b64_json, ExtraParams
1414
from llmstack.common.utils.splitter import SpacyTextSplitter
1515
from llmstack.common.utils.utils import validate_parse_data_uri
@@ -93,22 +93,8 @@ def session_data_to_persist(self) -> dict:
9393

9494
def process(self) -> str:
9595
openai_api_key = self._env.get('openai_api_key', None)
96-
weaviate_url = self._env['weaviate_url']
97-
weaviate_api_key = self._env.get('weaviate_api_key', None)
98-
azure_openai_api_key = self._env.get('azure_openai_api_key', None)
99-
weaviate_embedding_endpoint = self._env['weaviate_embedding_endpoint']
100-
weaviate_text2vec_config = self._env['weaviate_text2vec_config']
101-
10296
query = self._input.query
103-
104-
self.temp_store = TempWeaviate(
105-
url=weaviate_url,
106-
openai_key=openai_api_key,
107-
azure_openai_key=azure_openai_api_key,
108-
weaviate_rw_api_key=weaviate_api_key,
109-
weaviate_embedding_endpoint=weaviate_embedding_endpoint,
110-
weaviate_text2vec_config=weaviate_text2vec_config,
111-
)
97+
self.temp_store = Chroma(is_persistent=False)
11298

11399
file = self._input.file or None
114100
if (file is None or file == '') and self._input.file_data:
@@ -157,13 +143,15 @@ def process(self) -> str:
157143
).split_text(text)
158144
futures = [
159145
executor.submit(
160-
self.temp_store.add_content,
161-
index_name, text_chunk, source=file_name,
146+
self.temp_store.add_text,
147+
index_name, Document(page_content_key="content", page_content=text_chunk, metadata={
148+
'source': file_name}),
162149
) for text_chunk in text_chunks
163150
]
164151
concurrent.futures.wait(futures)
165-
documents: List[Document] = self.temp_store.search_temp_index(
166-
self.storage_index_name, query, self._config.document_limit,
152+
documents: List[Document] = self.temp_store.hybrid_search(
153+
self.storage_index_name, document_query=DocumentQuery(
154+
query=query, limit=self._config.document_limit),
167155
)
168156

169157
async_to_sync(self._output_stream.write)(

llmstack/processors/providers/promptly/http_uri_text_extract.py

Lines changed: 12 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -5,10 +5,9 @@
55
from asgiref.sync import async_to_sync
66
from pydantic import conint
77
from pydantic import Field
8-
from pydantic import HttpUrl
98

10-
from llmstack.common.blocks.data.store.vectorstore import Document
11-
from llmstack.common.blocks.data.store.vectorstore.temp_weaviate import TempWeaviate
9+
from llmstack.common.blocks.data.store.vectorstore import Document, DocumentQuery
10+
from llmstack.common.blocks.data.store.vectorstore.chroma import Chroma
1211
from llmstack.common.utils.text_extract import extract_text_from_url, ExtraParams
1312
from llmstack.common.utils.splitter import SpacyTextSplitter
1413
from llmstack.processors.providers.api_processor_interface import ApiProcessorInterface, ApiProcessorSchema
@@ -74,23 +73,10 @@ def session_data_to_persist(self) -> dict:
7473

7574
def process(self) -> HttpUriTextExtractorOutput:
7675
openai_api_key = self._env.get('openai_api_key', None)
77-
weaviate_url = self._env['weaviate_url']
78-
weaviate_api_key = self._env.get('weaviate_api_key', None)
79-
azure_openai_api_key = self._env.get('azure_openai_api_key', None)
80-
weaviate_embedding_endpoint = self._env['weaviate_embedding_endpoint']
81-
weaviate_text2vec_config = self._env['weaviate_text2vec_config']
8276

8377
query = self._input.query
8478
url = self._input.url.strip().rstrip()
85-
86-
self.temp_store = TempWeaviate(
87-
url=weaviate_url,
88-
openai_key=openai_api_key,
89-
weaviate_rw_api_key=weaviate_api_key,
90-
azure_openai_key=azure_openai_api_key,
91-
weaviate_embedding_endpoint=weaviate_embedding_endpoint,
92-
weaviate_text2vec_config=weaviate_text2vec_config,
93-
)
79+
self.temp_store = Chroma(is_persistent=False)
9480

9581
if (query is None or query == '') and url == self.url and self.extracted_text is not None:
9682
async_to_sync(self._output_stream.write)(
@@ -100,8 +86,9 @@ def process(self) -> HttpUriTextExtractorOutput:
10086
return output
10187

10288
if query and self.storage_index_name and url == self.url:
103-
documents: List[Document] = self.temp_store.search_temp_index(
104-
self.storage_index_name, query, self._config.document_limit,
89+
documents: List[Document] = self.temp_store.hybrid_search(
90+
self.storage_index_name, document_query=DocumentQuery(
91+
query=query, limit=self._config.document_limit),
10592
)
10693
for document in documents:
10794
async_to_sync(self._output_stream.write)(
@@ -124,11 +111,13 @@ def process(self) -> HttpUriTextExtractorOutput:
124111
index_name = self.temp_store.create_temp_index()
125112
self.storage_index_name = index_name
126113
for text_chunk in SpacyTextSplitter(separator='\n', chunk_size=self._config.text_chunk_size).split_text(text):
127-
self.temp_store.add_content(
128-
index_name, text_chunk, source=self.url,
114+
self.temp_store.add_text(
115+
index_name, Document(page_content_key="content", page_content=text_chunk, metadata={
116+
'source': self.url}),
129117
)
130-
documents: List[Document] = self.temp_store.search_temp_index(
131-
self.storage_index_name, query, self._config.document_limit,
118+
documents: List[Document] = self.temp_store.hybrid_search(
119+
self.storage_index_name, document_query=DocumentQuery(
120+
query=query, limit=self._config.document_limit),
132121
)
133122

134123
for document in documents:

0 commit comments

Comments
 (0)