Skip to content

Commit 4b44c02

Browse files
feat: Changes to measure time taken in index (#106)
* Changes to measure time taken in index and support for other operations * Minor docstring fix * Updated debug log to info for logging timing
1 parent dc91d79 commit 4b44c02

File tree

2 files changed

+113
-48
lines changed

2 files changed

+113
-48
lines changed

src/unstract/sdk/index.py

Lines changed: 87 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from unstract.sdk.exceptions import IndexingError, SdkError
2424
from unstract.sdk.tool.base import BaseTool
2525
from unstract.sdk.utils import ToolUtils
26+
from unstract.sdk.utils.common_utils import log_elapsed
2627
from unstract.sdk.vector_db import VectorDB
2728
from unstract.sdk.x2txt import X2Text
2829

@@ -104,6 +105,80 @@ def query_index(
104105
finally:
105106
vector_db.close()
106107

108+
@log_elapsed(operation="EXTRACTION")
def extract_text(
    self,
    x2text_instance_id: str,
    file_path: str,
    output_file_path: Optional[str] = None,
    enable_highlight: bool = False,
    usage_kwargs: Optional[dict[Any, Any]] = None,
    process_text: Optional[Callable[[str], str]] = None,
) -> str:
    """Extracts text from a document.

    Uses the configured service to perform the extraction
    - LLM Whisperer
    - Unstructured IO Community / Enterprise
    - Llama Parse

    When ``enable_highlight`` is set and the adapter instance is an
    ``LLMWhisperer``, the whisper hash from the extraction metadata is
    also recorded in the tool's execution metadata.

    Args:
        x2text_instance_id (str): UUID of the text extractor
        file_path (str): Path to the file
        output_file_path (Optional[str], optional): File path to write
            the extracted contents into. Defaults to None.
        enable_highlight (bool, optional): Flag to provide highlighting
            metadata. Defaults to False.
        usage_kwargs (Optional[dict[Any, Any]], optional): Dict to capture
            usage. Defaults to None, in which case a fresh empty dict is
            used for this call.
        process_text (Optional[Callable[[str], str]], optional): Optional
            function to post-process the text. Its result is used only
            when it is a ``str``; errors raised inside it are logged and
            the unprocessed text is returned. Defaults to None.

    Returns:
        str: The extracted (and optionally post-processed) text.

    Raises:
        IndexingError: Errors during text extraction
    """
    # A `{}` default would be created once and shared by every call; build
    # a new dict per call so one call's usage data cannot leak into another.
    if usage_kwargs is None:
        usage_kwargs = {}

    self.tool.stream_log("Extracting text from input file")
    extracted_text = ""
    try:
        x2text = X2Text(
            tool=self.tool,
            adapter_instance_id=x2text_instance_id,
            usage_kwargs=usage_kwargs,
        )
        if enable_highlight and isinstance(x2text._x2text_instance, LLMWhisperer):
            process_response: TextExtractionResult = x2text.process(
                input_file_path=file_path,
                output_file_path=output_file_path,
                enable_highlight=enable_highlight,
            )
            # Record the whisper hash in the tool's execution metadata.
            whisper_hash_value = process_response.extraction_metadata.whisper_hash
            metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
            self.tool.update_exec_metadata(metadata)
        else:
            process_response = x2text.process(
                input_file_path=file_path,
                output_file_path=output_file_path,
            )

        extracted_text = process_response.extracted_text
    except AdapterError as e:
        # Re-raise adapter failures as IndexingError, keeping the cause chained
        raise IndexingError(str(e)) from e

    if process_text:
        # Post-processing is best-effort: a failure or a non-str result
        # leaves the extracted text untouched.
        try:
            result = process_text(extracted_text)
            if isinstance(result, str):
                extracted_text = result
            else:
                logger.warning("'process_text' is expected to return an 'str'")
        except Exception as e:
            logger.error(f"Error occured inside function 'process_text': {e}")
    return extracted_text
180+
181+
@log_elapsed(operation="INDEXING(might include EXTRACTION)")
107182
def index(
108183
self,
109184
tool_id: str,
@@ -207,58 +282,23 @@ def index(
207282
self.tool.stream_log(f"File was indexed already under {doc_id}")
208283
return doc_id
209284

210-
# Extract text and index
211-
self.tool.stream_log("Extracting text from input file")
212-
full_text = []
213-
extracted_text = ""
214-
try:
215-
x2text = X2Text(
216-
tool=self.tool,
217-
adapter_instance_id=x2text_instance_id,
218-
usage_kwargs=usage_kwargs,
219-
)
220-
if enable_highlight and isinstance(
221-
x2text._x2text_instance, LLMWhisperer
222-
):
223-
process_response: TextExtractionResult = x2text.process(
224-
input_file_path=file_path,
225-
output_file_path=output_file_path,
226-
enable_highlight=enable_highlight,
227-
)
228-
whisper_hash_value = (
229-
process_response.extraction_metadata.whisper_hash
230-
)
231-
232-
metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
233-
234-
self.tool.update_exec_metadata(metadata)
235-
236-
else:
237-
process_response: TextExtractionResult = x2text.process(
238-
input_file_path=file_path,
239-
output_file_path=output_file_path,
240-
)
285+
extracted_text = self.extract_text(
286+
x2text_instance_id=x2text_instance_id,
287+
file_path=file_path,
288+
output_file_path=output_file_path,
289+
enable_highlight=enable_highlight,
290+
usage_kwargs=usage_kwargs,
291+
process_text=process_text,
292+
)
293+
if not extracted_text:
294+
raise IndexingError("No text available to index")
241295

242-
extracted_text = process_response.extracted_text
243-
except AdapterError as e:
244-
# Wrapping AdapterErrors with SdkError
245-
raise IndexingError(str(e)) from e
246-
if process_text:
247-
try:
248-
result = process_text(extracted_text)
249-
if isinstance(result, str):
250-
extracted_text = result
251-
except Exception as e:
252-
logger.error(f"Error occured inside function 'process_text': {e}")
253-
full_text.append(
296+
full_text = [
254297
{
255298
"section": "full",
256299
"text_contents": extracted_text,
257300
}
258-
)
259-
260-
if not extracted_text:
261-
raise IndexingError("No text available to index")
301+
]
262302

263303
# Check if chunking is required
264304
documents = []
@@ -324,7 +364,6 @@ def index(
324364
level=LogLevel.ERROR,
325365
)
326366
raise IndexingError(str(e)) from e
327-
self.tool.stream_log("Added nodes to vector db")
328367

329368
self.tool.stream_log("File has been indexed successfully")
330369
return doc_id

src/unstract/sdk/utils/common_utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import functools
12
import logging
3+
import time
24
import uuid
35

46
from unstract.sdk.constants import LogLevel
@@ -20,3 +22,27 @@ def generate_uuid() -> str:
2022
logging.WARNING: LogLevel.WARN,
2123
logging.ERROR: LogLevel.ERROR,
2224
}
25+
26+
27+
def log_elapsed(operation):
    """Adds an elapsed time log.

    Builds a decorator that times each call of the wrapped callable and
    logs the duration through the module-level ``logger`` at info level.
    The log line is emitted whether the callable returns normally or
    raises; an exception raised by the callable propagates unchanged.

    Args:
        operation (str): Operation being measured; interpolated into the
            log message.

    Returns:
        Callable: Decorator that wraps a function with elapsed-time logging.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter() is a monotonic, high-resolution clock, so the
            # measured duration is not skewed by system clock adjustments
            # the way a time.time() difference can be.
            start_time = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                elapsed_time = time.perf_counter() - start_time
                logger.info(f"Time taken for '{operation}': {elapsed_time:.3f}s")

        return wrapper

    return decorator

0 commit comments

Comments
 (0)