@@ -178,7 +178,7 @@ def extract_text(
             logger.error(f"Error occurred inside function 'process_text': {e}")
         return extracted_text

-    @log_elapsed(operation="INDEXING(might include EXTRACTION)")
+    @log_elapsed(operation="CHECK_AND_INDEX(overall)")
     def index(
         self,
         tool_id: str,
@@ -293,82 +293,101 @@ def index(
             if not extracted_text:
                 raise IndexingError("No text available to index")

-            full_text = [
-                {
-                    "section": "full",
-                    "text_contents": extracted_text,
-                }
-            ]
-
-            # Check if chunking is required
-            documents = []
-            for item in full_text:
-                text = item["text_contents"]
-                self.tool.stream_log("Indexing file...")
-                document = Document(
-                    text=text,
-                    doc_id=doc_id,
-                    metadata={"section": item["section"]},
-                )
-                document.id_ = doc_id
-                documents.append(document)
-            self.tool.stream_log(f"Number of documents: {len(documents)}")
-
-            if doc_id_found:
-                # Delete the nodes for the doc_id
-                try:
-                    vector_db.delete(ref_doc_id=doc_id)
-                    self.tool.stream_log(f"Deleted nodes for {doc_id}")
-                except Exception as e:
-                    self.tool.stream_log(
-                        f"Error deleting nodes for {doc_id}: {e}",
-                        level=LogLevel.ERROR,
-                    )
-                    raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e
+            self.index_to_vector_db(
+                vector_db=vector_db,
+                embedding=embedding,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                doc_id=doc_id,
+                text_to_idx=extracted_text,
+                doc_id_found=doc_id_found,
+            )
+            return doc_id
+        finally:
+            vector_db.close()
+
+    @log_elapsed(operation="INDEXING")
+    def index_to_vector_db(
+        self,
+        vector_db: VectorDB,
+        embedding: Embedding,
+        chunk_size: int,
+        chunk_overlap: int,
+        text_to_idx: str,
+        doc_id: str,
+        doc_id_found: bool,
+    ):
+        self.tool.stream_log("Indexing file...")
+        full_text = [
+            {
+                "section": "full",
+                "text_contents": text_to_idx,
+            }
+        ]
+        # Check if chunking is required
+        documents = []
+        for item in full_text:
+            text = item["text_contents"]
+            document = Document(
+                text=text,
+                doc_id=doc_id,
+                metadata={"section": item["section"]},
+            )
+            document.id_ = doc_id
+            documents.append(document)
+        self.tool.stream_log(f"Number of documents: {len(documents)}")

+        if doc_id_found:
+            # Delete the nodes for the doc_id
             try:
-                if chunk_size == 0:
-                    parser = SentenceSplitter.from_defaults(
-                        chunk_size=len(documents[0].text) + 10,
-                        chunk_overlap=0,
-                        callback_manager=embedding.get_callback_manager(),
-                    )
-                    nodes = parser.get_nodes_from_documents(
-                        documents, show_progress=True
-                    )
-                    node = nodes[0]
-                    node.embedding = embedding.get_query_embedding(" ")
-                    vector_db.add(doc_id, nodes=[node])
-                    self.tool.stream_log("Added node to vector db")
-                else:
-                    self.tool.stream_log("Adding nodes to vector db...")
-                    # TODO: Phase 2:
-                    # Post insertion to VDB, use query using doc_id and
-                    # store all the VDB ids to a table against the doc_id
-                    # During deletion for cases where metadata filtering
-                    # does not work, these ids can be used for direct deletion
-                    # This new table will also act like an audit trail for
-                    # all nodes that were added to the VDB by Unstract
-                    # Once this is in place, the overridden implementation
-                    # of prefixing ids with doc_id before adding to VDB
-                    # can be removed
-                    vector_db.index_document(
-                        documents,
-                        chunk_size=chunk_size,
-                        chunk_overlap=chunk_overlap,
-                        show_progress=True,
-                    )
+                vector_db.delete(ref_doc_id=doc_id)
+                self.tool.stream_log(f"Deleted nodes for {doc_id}")
             except Exception as e:
                 self.tool.stream_log(
-                    f"Error adding nodes to vector db : {e}",
+                    f"Error deleting nodes for {doc_id}: {e}",
                     level=LogLevel.ERROR,
                 )
-                raise IndexingError(str(e)) from e
+                raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e

-            self.tool.stream_log("File has been indexed successfully")
-            return doc_id
-        finally:
-            vector_db.close()
+        try:
+            if chunk_size == 0:
+                parser = SentenceSplitter.from_defaults(
+                    chunk_size=len(documents[0].text) + 10,
+                    chunk_overlap=0,
+                    callback_manager=embedding.get_callback_manager(),
+                )
+                nodes = parser.get_nodes_from_documents(documents, show_progress=True)
+                node = nodes[0]
+                node.embedding = embedding.get_query_embedding(" ")
+                vector_db.add(doc_id, nodes=[node])
+                self.tool.stream_log("Added node to vector db")
+            else:
+                self.tool.stream_log("Adding nodes to vector db...")
+                # TODO: Phase 2:
+                # Post insertion to VDB, use query using doc_id and
+                # store all the VDB ids to a table against the doc_id
+                # During deletion for cases where metadata filtering
+                # does not work, these ids can be used for direct deletion
+                # This new table will also act like an audit trail for
+                # all nodes that were added to the VDB by Unstract
+                # Once this is in place, the overridden implementation
+                # of prefixing ids with doc_id before adding to VDB
+                # can be removed
+                vector_db.index_document(
+                    documents,
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    show_progress=True,
+                )
+        except Exception as e:
+            self.tool.stream_log(
+                f"Error adding nodes to vector db: {e}",
+                level=LogLevel.ERROR,
+            )
+            raise IndexingError(str(e)) from e
+
+        self.tool.stream_log("File has been indexed successfully")
+        return

     def generate_index_key(
         self,
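
For context, the `chunk_size == 0` branch above implements "no chunking" by asking the splitter for chunks larger than the text itself, so the whole document lands in the vector DB as a single node. Below is a minimal standalone sketch of that trick, assuming the llama-index library that `Document` and `SentenceSplitter` come from; the SDK's `VectorDB` and `Embedding` wrappers are omitted.

```python
# Sketch of the chunk_size == 0 path above, not the SDK's actual code.
# Assumes llama-index >= 0.10 import paths; older releases use llama_index.*
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

text = "Some extracted text that should be indexed as one node."
document = Document(text=text, metadata={"section": "full"})
document.id_ = "doc-123"  # pin the doc id, as index_to_vector_db does

# A chunk size larger than the raw text guarantees a single chunk,
# which is exactly what chunk_size == 0 is mapped to in the diff.
parser = SentenceSplitter.from_defaults(
    chunk_size=len(document.text) + 10,
    chunk_overlap=0,
)
nodes = parser.get_nodes_from_documents([document])
assert len(nodes) == 1  # the whole document became one node
```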
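The Phase 2 TODO in the diff proposes tracking the vector-DB ids written for each `doc_id` in a separate table, so deletion can work by direct id even where metadata filtering does not, with the table doubling as an audit trail. A rough sketch of that bookkeeping follows, with an in-memory dict standing in for the proposed table; every name here is hypothetical, not SDK API.

```python
# Hypothetical illustration of the Phase 2 plan from the TODO comment.
from collections import defaultdict

# Stand-in for the proposed table: doc_id -> vector-DB node ids.
node_ids_by_doc: dict[str, list[str]] = defaultdict(list)

def record_indexed_nodes(doc_id: str, node_ids: list[str]) -> None:
    """Called after inserting nodes, to remember their vector-DB ids."""
    node_ids_by_doc[doc_id].extend(node_ids)

def delete_doc_nodes(vector_db, doc_id: str) -> None:
    """Delete by recorded ids instead of relying on a metadata filter."""
    for node_id in node_ids_by_doc.pop(doc_id, []):
        vector_db.delete_node(node_id)  # hypothetical direct-delete call
```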