|
23 | 23 | from unstract.sdk.exceptions import IndexingError, SdkError |
24 | 24 | from unstract.sdk.tool.base import BaseTool |
25 | 25 | from unstract.sdk.utils import ToolUtils |
| 26 | +from unstract.sdk.utils.common_utils import log_elapsed |
26 | 27 | from unstract.sdk.vector_db import VectorDB |
27 | 28 | from unstract.sdk.x2txt import X2Text |
28 | 29 |
|
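For context, `log_elapsed` (imported above from `unstract.sdk.utils.common_utils`) times the decorated call and logs the elapsed duration under the given `operation` label. A minimal sketch of the general pattern, assuming a standard timing decorator; this is illustrative only, not the SDK's actual implementation:

```python
# Illustrative sketch only; the real decorator lives in
# unstract.sdk.utils.common_utils and may differ in detail.
import functools
import logging
import time

logger = logging.getLogger(__name__)

def log_elapsed(operation: str):
    """Log the wall-clock time taken by the wrapped callable."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                elapsed = time.perf_counter() - start
                logger.info("Time taken for %s: %.3fs", operation, elapsed)
        return wrapper
    return decorator
```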
@@ -104,6 +105,80 @@ def query_index( |
104 | 105 | finally: |
105 | 106 | vector_db.close() |
106 | 107 |
|
| 108 | + @log_elapsed(operation="EXTRACTION") |
| 109 | + def extract_text( |
| 110 | + self, |
| 111 | + x2text_instance_id: str, |
| 112 | + file_path: str, |
| 113 | + output_file_path: Optional[str] = None, |
| 114 | + enable_highlight: bool = False, |
| 115 | + usage_kwargs: Optional[dict[Any, Any]] = None,
| 116 | + process_text: Optional[Callable[[str], str]] = None, |
| 117 | + ) -> str: |
| 118 | + """Extracts text from a document. |
| 119 | +
|
| 120 | + Uses the configured service to perform the extraction:
| 121 | + - LLM Whisperer |
| 122 | + - Unstructured IO Community / Enterprise |
| 123 | + - Llama Parse |
| 124 | +
|
| 125 | + Args: |
| 126 | + x2text_instance_id (str): UUID of the text extractor |
| 127 | + file_path (str): Path to the file |
| 128 | + output_file_path (Optional[str], optional): File path to write |
| 129 | + the extracted contents into. Defaults to None. |
| 130 | + enable_highlight (bool, optional): Flag to provide highlighting metadata. |
| 131 | + Defaults to False. |
| 132 | + usage_kwargs (Optional[dict[Any, Any]], optional): Dict to capture usage.
| 133 | + Defaults to None.
| 134 | + process_text (Optional[Callable[[str], str]], optional): Optional function |
| 135 | + to post-process the text. Defaults to None. |
| 136 | +
|
| 137 | + Raises: |
| 138 | + IndexingError: Errors during text extraction |
| 139 | + """ |
| 140 | + self.tool.stream_log("Extracting text from input file") |
| 141 | + extracted_text = "" |
| 142 | + try: |
| 143 | + x2text = X2Text( |
| 144 | + tool=self.tool, |
| 145 | + adapter_instance_id=x2text_instance_id, |
| 146 | + usage_kwargs=usage_kwargs or {},
| 147 | + ) |
| 148 | + if enable_highlight and isinstance(x2text._x2text_instance, LLMWhisperer): |
| 149 | + process_response: TextExtractionResult = x2text.process( |
| 150 | + input_file_path=file_path, |
| 151 | + output_file_path=output_file_path, |
| 152 | + enable_highlight=enable_highlight, |
| 153 | + ) |
| 154 | + whisper_hash_value = process_response.extraction_metadata.whisper_hash |
| 155 | + |
| 156 | + metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value} |
| 157 | + |
| 158 | + self.tool.update_exec_metadata(metadata) |
| 159 | + |
| 160 | + else: |
| 161 | + process_response: TextExtractionResult = x2text.process( |
| 162 | + input_file_path=file_path, |
| 163 | + output_file_path=output_file_path, |
| 164 | + ) |
| 165 | + |
| 166 | + extracted_text = process_response.extracted_text |
| 167 | + except AdapterError as e: |
| 168 | + # Wrapping AdapterErrors with SdkError |
| 169 | + raise IndexingError(str(e)) from e |
| 170 | + if process_text: |
| 171 | + try: |
| 172 | + result = process_text(extracted_text) |
| 173 | + if isinstance(result, str): |
| 174 | + extracted_text = result |
| 175 | + else: |
| 176 | + logger.warning("'process_text' is expected to return a 'str'")
| 177 | + except Exception as e: |
| 178 | + logger.error(f"Error occurred inside function 'process_text': {e}")
| 179 | + return extracted_text |
| 180 | + |
| 181 | + @log_elapsed(operation="INDEXING (may include EXTRACTION)")
107 | 182 | def index( |
108 | 183 | self, |
109 | 184 | tool_id: str, |
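A hedged usage sketch of the new `extract_text` helper follows; the instance name `indexer`, the adapter UUID, and the file paths are placeholders for illustration, not values from this PR. Note that `process_text` must return a `str`, otherwise its result is ignored with a warning:

```python
def strip_page_headers(text: str) -> str:
    """Example post-processor for `process_text`; must return a str."""
    return "\n".join(
        line for line in text.splitlines() if not line.startswith("Page ")
    )

# `indexer` is assumed to be an instance of the class this diff modifies.
extracted = indexer.extract_text(
    x2text_instance_id="00000000-0000-0000-0000-000000000000",  # placeholder UUID
    file_path="/tmp/in/contract.pdf",          # placeholder input path
    output_file_path="/tmp/out/contract.txt",  # optional; omit to skip writing
    process_text=strip_page_headers,
)
```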
@@ -207,58 +282,23 @@ def index( |
207 | 282 | self.tool.stream_log(f"File was indexed already under {doc_id}") |
208 | 283 | return doc_id |
209 | 284 |
|
210 | | - # Extract text and index |
211 | | - self.tool.stream_log("Extracting text from input file") |
212 | | - full_text = [] |
213 | | - extracted_text = "" |
214 | | - try: |
215 | | - x2text = X2Text( |
216 | | - tool=self.tool, |
217 | | - adapter_instance_id=x2text_instance_id, |
218 | | - usage_kwargs=usage_kwargs, |
219 | | - ) |
220 | | - if enable_highlight and isinstance( |
221 | | - x2text._x2text_instance, LLMWhisperer |
222 | | - ): |
223 | | - process_response: TextExtractionResult = x2text.process( |
224 | | - input_file_path=file_path, |
225 | | - output_file_path=output_file_path, |
226 | | - enable_highlight=enable_highlight, |
227 | | - ) |
228 | | - whisper_hash_value = ( |
229 | | - process_response.extraction_metadata.whisper_hash |
230 | | - ) |
231 | | - |
232 | | - metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value} |
233 | | - |
234 | | - self.tool.update_exec_metadata(metadata) |
235 | | - |
236 | | - else: |
237 | | - process_response: TextExtractionResult = x2text.process( |
238 | | - input_file_path=file_path, |
239 | | - output_file_path=output_file_path, |
240 | | - ) |
| 285 | + extracted_text = self.extract_text( |
| 286 | + x2text_instance_id=x2text_instance_id, |
| 287 | + file_path=file_path, |
| 288 | + output_file_path=output_file_path, |
| 289 | + enable_highlight=enable_highlight, |
| 290 | + usage_kwargs=usage_kwargs, |
| 291 | + process_text=process_text, |
| 292 | + ) |
| 293 | + if not extracted_text: |
| 294 | + raise IndexingError("No text available to index") |
241 | 295 |
|
242 | | - extracted_text = process_response.extracted_text |
243 | | - except AdapterError as e: |
244 | | - # Wrapping AdapterErrors with SdkError |
245 | | - raise IndexingError(str(e)) from e |
246 | | - if process_text: |
247 | | - try: |
248 | | - result = process_text(extracted_text) |
249 | | - if isinstance(result, str): |
250 | | - extracted_text = result |
251 | | - except Exception as e: |
252 | | - logger.error(f"Error occured inside function 'process_text': {e}") |
253 | | - full_text.append( |
| 296 | + full_text = [ |
254 | 297 | { |
255 | 298 | "section": "full", |
256 | 299 | "text_contents": extracted_text, |
257 | 300 | } |
258 | | - ) |
259 | | - |
260 | | - if not extracted_text: |
261 | | - raise IndexingError("No text available to index") |
| 301 | + ] |
262 | 302 |
|
263 | 303 | # Check if chunking is required |
264 | 304 | documents = [] |
@@ -324,7 +364,6 @@ def index( |
324 | 364 | level=LogLevel.ERROR, |
325 | 365 | ) |
326 | 366 | raise IndexingError(str(e)) from e |
327 | | - self.tool.stream_log("Added nodes to vector db") |
328 | 367 |
|
329 | 368 | self.tool.stream_log("File has been indexed successfully") |
330 | 369 | return doc_id |
|
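The diff wraps `AdapterError` in `IndexingError` via `raise ... from e`, which preserves the original exception as `__cause__` so callers can still inspect the adapter failure. A minimal self-contained illustration of that pattern, with stand-in exception classes defined locally (the real ones come from `unstract.sdk`):

```python
# Stand-ins for the SDK's exception types, for illustration only.
class AdapterError(Exception): ...
class IndexingError(Exception): ...

def extract() -> str:
    raise AdapterError("adapter exploded")

try:
    try:
        extract()
    except AdapterError as e:
        # Same wrapping pattern as the diff: keep `e` as __cause__.
        raise IndexingError(str(e)) from e
except IndexingError as err:
    assert isinstance(err.__cause__, AdapterError)
    print(f"caught: {err}, caused by: {err.__cause__!r}")
```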