 
 from llama_index import Document, StorageContext, VectorStoreIndex
 from llama_index.node_parser import SimpleNodeParser
-from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult
+from llama_index.vector_stores import (
+    FilterOperator,
+    MetadataFilter,
+    MetadataFilters,
+    VectorStoreQuery,
+    VectorStoreQueryResult,
+)
 from unstract.adapters.exceptions import AdapterError
 from unstract.adapters.x2text.x2text_adapter import X2TextAdapter
 
 from unstract.sdk.constants import LogLevel, ToolEnv
 from unstract.sdk.embedding import ToolEmbedding
-from unstract.sdk.exceptions import SdkError
+from unstract.sdk.exceptions import IndexingError, SdkError
 from unstract.sdk.tool.base import BaseTool
 from unstract.sdk.utils import ToolUtils
 from unstract.sdk.utils.service_context import ServiceContext
@@ -172,18 +178,21 @@ def index_file(
             )
             raise SdkError(f"Error loading {vector_db}")
 
-        filter = [{"field_name": "doc_id", "operator": "=", "value": doc_id}]
+        doc_id_eq_filter = MetadataFilter.from_dict(
+            {"key": "doc_id", "operator": FilterOperator.EQ, "value": doc_id}
+        )
+        filters = MetadataFilters(filters=[doc_id_eq_filter])
         q = VectorStoreQuery(
             query_embedding=embedding_li.get_query_embedding(" "),
             doc_ids=[doc_id],
-            filters=filter,
+            filters=filters,
         )
 
-        doc_id_not_found = True
+        doc_id_found = False
         try:
             n: VectorStoreQueryResult = vector_db_li.query(query=q)
             if len(n.nodes) > 0:
-                doc_id_not_found = False
+                doc_id_found = True
                 self.tool.stream_log(f"Found {len(n.nodes)} nodes for {doc_id}")
             else:
                 self.tool.stream_log(f"No nodes found for {doc_id}")
@@ -192,7 +201,7 @@ def index_file(
                 f"Error querying {vector_db}: {e}", level=LogLevel.ERROR
             )
 
-        if doc_id_not_found is False and reindex:
+        if doc_id_found and reindex:
             # Delete the nodes for the doc_id
             try:
                 vector_db_li.delete(ref_doc_id=doc_id)
@@ -203,87 +212,91 @@ def index_file(
                     level=LogLevel.ERROR,
                 )
                 raise SdkError(f"Error deleting nodes for {doc_id}: {e}")
-            doc_id_not_found = True
+            doc_id_found = False
 
-        if doc_id_not_found:
-            self.tool.stream_log("Extracting text from input file")
-            full_text = []
-            extracted_text = ""
-            try:
-                x2text = X2Text(tool=self.tool)
-                x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
-                    adapter_instance_id=x2text_adapter
-                )
-                extracted_text = x2text_adapter_inst.process(
-                    input_file_path=file_path, output_file_path=output_file_path
-                )
-            except AdapterError as e:
-                # Wrapping AdapterErrors with SdkError
-                raise SdkError(str(e)) from e
-            full_text.append(
-                {
-                    "section": "full",
-                    "text_contents": self._cleanup_text(extracted_text),
-                }
+        if doc_id_found:
+            self.tool.stream_log(f"File was indexed already under {doc_id}")
+            return doc_id
+
+        # Extract text and index
+        self.tool.stream_log("Extracting text from input file")
+        full_text = []
+        extracted_text = ""
+        try:
+            x2text = X2Text(tool=self.tool)
+            x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
+                adapter_instance_id=x2text_adapter
+            )
+            extracted_text = x2text_adapter_inst.process(
+                input_file_path=file_path, output_file_path=output_file_path
             )
+        except AdapterError as e:
+            # Wrapping AdapterErrors with SdkError
+            raise IndexingError(str(e)) from e
+        full_text.append(
+            {
+                "section": "full",
+                "text_contents": self._cleanup_text(extracted_text),
+            }
+        )
 
-            # Check if chunking is required
-            documents = []
-            for item in full_text:
-                text = item["text_contents"]
-                self.tool.stream_log("Indexing file...")
-                document = Document(
-                    text=text,
-                    doc_id=doc_id,
-                    metadata={"section": item["section"]},
-                )
-                document.id_ = doc_id
-                documents.append(document)
-            self.tool.stream_log(f"Number of documents: {len(documents)}")
-            if chunk_size == 0:
-                parser = SimpleNodeParser.from_defaults(
-                    chunk_size=len(documents[0].text) + 10, chunk_overlap=0
-                )
-                nodes = parser.get_nodes_from_documents(
-                    documents, show_progress=True
-                )
-                node = nodes[0]
-                node.embedding = embedding_li.get_query_embedding(" ")
-                vector_db_li.add(nodes=[node])
-                self.tool.stream_log("Added node to vector db")
-            else:
-                storage_context = StorageContext.from_defaults(
-                    vector_store=vector_db_li
-                )
-                parser = SimpleNodeParser.from_defaults(
-                    chunk_size=chunk_size, chunk_overlap=chunk_overlap
-                )
+        # Check if chunking is required
+        documents = []
+        for item in full_text:
+            text = item["text_contents"]
+            self.tool.stream_log("Indexing file...")
+            document = Document(
+                text=text,
+                doc_id=doc_id,
+                metadata={"section": item["section"]},
+            )
+            document.id_ = doc_id
+            documents.append(document)
+        self.tool.stream_log(f"Number of documents: {len(documents)}")
+        if chunk_size == 0:
+            parser = SimpleNodeParser.from_defaults(
+                chunk_size=len(documents[0].text) + 10, chunk_overlap=0
+            )
+            nodes = parser.get_nodes_from_documents(
+                documents, show_progress=True
+            )
+            node = nodes[0]
+            node.embedding = embedding_li.get_query_embedding(" ")
+            vector_db_li.add(nodes=[node])
+            self.tool.stream_log("Added node to vector db")
+        else:
+            storage_context = StorageContext.from_defaults(
+                vector_store=vector_db_li
+            )
+            parser = SimpleNodeParser.from_defaults(
+                chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            )
 
-                service_context = ServiceContext.get_service_context(
-                    platform_api_key=self.tool.get_env_or_die(
-                        ToolEnv.PLATFORM_API_KEY
-                    ),
-                    embed_model=embedding_li,
-                    node_parser=parser,
-                )
+            service_context = ServiceContext.get_service_context(
+                platform_api_key=self.tool.get_env_or_die(
+                    ToolEnv.PLATFORM_API_KEY
+                ),
+                embed_model=embedding_li,
+                node_parser=parser,
+            )
 
-                self.tool.stream_log("Adding nodes to vector db...")
-                try:
-                    VectorStoreIndex.from_documents(
-                        documents,
-                        storage_context=storage_context,
-                        show_progress=True,
-                        service_context=service_context,
-                    )
-                except Exception as e:
-                    self.tool.stream_log(
-                        f"Error adding nodes to vector db: {e}",
-                        level=LogLevel.ERROR,
-                    )
-                    raise SdkError(f"Error adding nodes to vector db: {e}")
-                self.tool.stream_log("Added nodes to vector db")
+            self.tool.stream_log("Adding nodes to vector db...")
+            try:
+                VectorStoreIndex.from_documents(
+                    documents,
+                    storage_context=storage_context,
+                    show_progress=True,
+                    service_context=service_context,
+                )
+            except Exception as e:
+                self.tool.stream_log(
+                    f"Error adding nodes to vector db: {e}",
+                    level=LogLevel.ERROR,
+                )
+                raise SdkError(f"Error adding nodes to vector db: {e}")
+            self.tool.stream_log("Added nodes to vector db")
 
-            self.tool.stream_log("File has been indexed successfully")
+        self.tool.stream_log("File has been indexed successfully")
         return doc_id
 
     @staticmethod
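
For reference, the core of this change is the switch from a raw dict filter to the MetadataFilter/MetadataFilters API for the doc_id existence check. The sketch below distills that pattern on its own, assuming the legacy llama_index vector store API imported in this commit; the helper name doc_id_exists and the vector_store / embed_model parameters are hypothetical stand-ins for the vector_db_li and embedding_li instances that index_file() builds earlier.

from llama_index.vector_stores import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
    VectorStoreQuery,
    VectorStoreQueryResult,
)


def doc_id_exists(vector_store, embed_model, doc_id: str) -> bool:
    # Hypothetical helper: match only nodes whose "doc_id" metadata equals doc_id.
    doc_id_eq_filter = MetadataFilter.from_dict(
        {"key": "doc_id", "operator": FilterOperator.EQ, "value": doc_id}
    )
    filters = MetadataFilters(filters=[doc_id_eq_filter])
    # A near-empty query embedding suffices; the metadata filter does the matching.
    q = VectorStoreQuery(
        query_embedding=embed_model.get_query_embedding(" "),
        doc_ids=[doc_id],
        filters=filters,
    )
    result: VectorStoreQueryResult = vector_store.query(query=q)
    return len(result.nodes) > 0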