11import json
2- from typing import Any , Optional
2+ import logging
3+ from typing import Any , Callable , Optional
34
45from llama_index .core import Document
56from llama_index .core .node_parser import SimpleNodeParser
2526from unstract .sdk .vector_db import VectorDB
2627from unstract .sdk .x2txt import X2Text
2728
29+ logger = logging .getLogger (__name__ )
30+
2831
2932class Constants :
3033 TOP_K = 5
@@ -101,27 +104,6 @@ def query_index(
101104 finally :
102105 vector_db .close ()
103106
def _cleanup_text(self, full_text, max_blank_lines=3):
    """Collapse long runs of blank lines and strip trailing whitespace.

    The text is split on newlines. Every line that is empty or contains
    only whitespace counts as blank. A run of blank lines that precedes a
    non-blank line is emitted as at most ``max_blank_lines`` empty lines;
    blank lines after the last non-blank line are dropped entirely. Each
    non-blank line is right-stripped (leading whitespace is preserved).

    The line counts before and after the cleanup are reported through
    ``self.tool.stream_log``.

    Args:
        full_text: The text to clean up.
        max_blank_lines: Upper bound on consecutive blank lines kept
            between two non-blank lines. Defaults to 3.

    Returns:
        The cleaned text, with lines joined by a newline.
    """
    full_text_lines = full_text.split("\n")
    new_context_lines = []
    pending_blank_lines = 0
    for line in full_text_lines:
        if not line.strip():
            # Defer emitting blanks until a non-blank line follows, so
            # that trailing blank lines never reach the output.
            pending_blank_lines += 1
            continue
        new_context_lines.extend(
            [""] * min(pending_blank_lines, max_blank_lines)
        )
        pending_blank_lines = 0
        new_context_lines.append(line.rstrip())
    self.tool.stream_log(
        f"Old context length: {len(full_text_lines)}, "
        f"New context length: {len(new_context_lines)}"
    )
    return "\n".join(new_context_lines)
125107 def index (
126108 self ,
127109 tool_id : str ,
@@ -136,6 +118,7 @@ def index(
136118 output_file_path : Optional [str ] = None ,
137119 enable_highlight : bool = False ,
138120 usage_kwargs : dict [Any , Any ] = {},
121+ process_text : Optional [Callable [[str ], str ]] = None ,
139122 ) -> str :
140123 """Indexes an individual file using the passed arguments.
141124
@@ -276,10 +259,17 @@ def index(
276259 except AdapterError as e :
277260 # Wrapping AdapterErrors with SdkError
278261 raise IndexingError (str (e )) from e
262+ if process_text :
263+ try :
264+ result = process_text (extracted_text )
265+ if isinstance (result , str ):
266+ extracted_text = result
267+ except Exception as e :
268+ logger .error (f"Error occured inside function 'process_text': { e } " )
279269 full_text .append (
280270 {
281271 "section" : "full" ,
282- "text_contents" : self . _cleanup_text ( extracted_text ) ,
272+ "text_contents" : extracted_text ,
283273 }
284274 )
285275
0 commit comments