55from asgiref .sync import async_to_sync
66from pydantic import conint
77from pydantic import Field
8- from pydantic import HttpUrl
98
10- from llmstack .common .blocks .data .store .vectorstore import Document
11- from llmstack .common .blocks .data .store .vectorstore .temp_weaviate import TempWeaviate
9+ from llmstack .common .blocks .data .store .vectorstore import Document , DocumentQuery
10+ from llmstack .common .blocks .data .store .vectorstore .chroma import Chroma
1211from llmstack .common .utils .text_extract import extract_text_from_url , ExtraParams
1312from llmstack .common .utils .splitter import SpacyTextSplitter
1413from llmstack .processors .providers .api_processor_interface import ApiProcessorInterface , ApiProcessorSchema
@@ -74,23 +73,10 @@ def session_data_to_persist(self) -> dict:
7473
7574 def process (self ) -> HttpUriTextExtractorOutput :
7675 openai_api_key = self ._env .get ('openai_api_key' , None )
77- weaviate_url = self ._env ['weaviate_url' ]
78- weaviate_api_key = self ._env .get ('weaviate_api_key' , None )
79- azure_openai_api_key = self ._env .get ('azure_openai_api_key' , None )
80- weaviate_embedding_endpoint = self ._env ['weaviate_embedding_endpoint' ]
81- weaviate_text2vec_config = self ._env ['weaviate_text2vec_config' ]
8276
8377 query = self ._input .query
8478 url = self ._input .url .strip ().rstrip ()
85-
86- self .temp_store = TempWeaviate (
87- url = weaviate_url ,
88- openai_key = openai_api_key ,
89- weaviate_rw_api_key = weaviate_api_key ,
90- azure_openai_key = azure_openai_api_key ,
91- weaviate_embedding_endpoint = weaviate_embedding_endpoint ,
92- weaviate_text2vec_config = weaviate_text2vec_config ,
93- )
79+ self .temp_store = Chroma (is_persistent = False )
9480
9581 if (query is None or query == '' ) and url == self .url and self .extracted_text is not None :
9682 async_to_sync (self ._output_stream .write )(
@@ -100,8 +86,9 @@ def process(self) -> HttpUriTextExtractorOutput:
10086 return output
10187
10288 if query and self .storage_index_name and url == self .url :
103- documents : List [Document ] = self .temp_store .search_temp_index (
104- self .storage_index_name , query , self ._config .document_limit ,
89+ documents : List [Document ] = self .temp_store .hybrid_search (
90+ self .storage_index_name , document_query = DocumentQuery (
91+ query = query , limit = self ._config .document_limit ),
10592 )
10693 for document in documents :
10794 async_to_sync (self ._output_stream .write )(
@@ -124,11 +111,13 @@ def process(self) -> HttpUriTextExtractorOutput:
124111 index_name = self .temp_store .create_temp_index ()
125112 self .storage_index_name = index_name
126113 for text_chunk in SpacyTextSplitter (separator = '\n ' , chunk_size = self ._config .text_chunk_size ).split_text (text ):
127- self .temp_store .add_content (
128- index_name , text_chunk , source = self .url ,
114+ self .temp_store .add_text (
115+ index_name , Document (page_content_key = "content" , page_content = text_chunk , metadata = {
116+ 'source' : self .url }),
129117 )
130- documents : List [Document ] = self .temp_store .search_temp_index (
131- self .storage_index_name , query , self ._config .document_limit ,
118+ documents : List [Document ] = self .temp_store .hybrid_search (
119+ self .storage_index_name , document_query = DocumentQuery (
120+ query = query , limit = self ._config .document_limit ),
132121 )
133122
134123 for document in documents :
0 commit comments