Skip to content

Commit dd2e4dd

Browse files
Fix/adding whisper hash in metadata (#64)
* Update metadata with whisper hash * Update metadata with whisper hash * Update metadata with whisper hash * Removed unwanted comments * trying editable install * reverted the pytoml dependency * Updated the upstream function * Enabling condition in the upstream * Updated the handling using data class * reverting the PDM lock file --------- Signed-off-by: Rahul Johny <[email protected]> Co-authored-by: Gayathri <[email protected]>
1 parent f7f1427 commit dd2e4dd

File tree

3 files changed

+60
-34
lines changed

3 files changed

+60
-34
lines changed

src/unstract/sdk/index.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
)
1313
from typing_extensions import deprecated
1414
from unstract.adapters.exceptions import AdapterError
15+
from unstract.adapters.x2text.constants import X2TextConstants
16+
from unstract.adapters.x2text.dto import TextExtractionResult
17+
from unstract.adapters.x2text.llm_whisperer.src import LLMWhisperer
1518

1619
from unstract.sdk.adapters import ToolAdapter
1720
from unstract.sdk.constants import LogLevel
@@ -131,6 +134,7 @@ def index(
131134
reindex: bool = False,
132135
file_hash: Optional[str] = None,
133136
output_file_path: Optional[str] = None,
137+
enable_highlight: bool = False,
134138
usage_kwargs: dict[Any, Any] = {},
135139
) -> str:
136140
"""Indexes an individual file using the passed arguments.
@@ -246,9 +250,29 @@ def index(
246250
x2text = X2Text(
247251
tool=self.tool, adapter_instance_id=x2text_instance_id
248252
)
249-
extracted_text = x2text.process(
250-
input_file_path=file_path, output_file_path=output_file_path
251-
)
253+
if enable_highlight and isinstance(
254+
x2text._x2text_instance, LLMWhisperer
255+
):
256+
process_response: TextExtractionResult = x2text.process(
257+
input_file_path=file_path,
258+
output_file_path=output_file_path,
259+
enable_highlight=enable_highlight,
260+
)
261+
whisper_hash_value = (
262+
process_response.extraction_metadata.whisper_hash
263+
)
264+
265+
metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
266+
267+
self.tool.update_exec_metadata(metadata)
268+
269+
else:
270+
process_response: TextExtractionResult = x2text.process(
271+
input_file_path=file_path,
272+
output_file_path=output_file_path,
273+
)
274+
275+
extracted_text = process_response.extracted_text
252276
except AdapterError as e:
253277
# Wrapping AdapterErrors with SdkError
254278
raise IndexingError(str(e)) from e

src/unstract/sdk/tool/base.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,7 @@ def from_tool_args(cls, args: list[str]) -> "BaseTool":
5656
if parsed_args.command not in Command.static_commands():
5757
tool._exec_metadata = tool._get_exec_metadata()
5858
tool.workflow_id = tool._exec_metadata.get(MetadataKey.WORKFLOW_ID)
59-
tool.execution_id = tool._exec_metadata.get(
60-
MetadataKey.EXECUTION_ID
61-
)
59+
tool.execution_id = tool._exec_metadata.get(MetadataKey.EXECUTION_ID)
6260
tool.org_id = tool._exec_metadata.get(MetadataKey.ORG_ID)
6361
return tool
6462

@@ -108,15 +106,11 @@ def _get_data_dir(self) -> Path:
108106
self.stream_error_and_exit(f"{data_dir} is not a directory")
109107
return base_path.absolute()
110108

111-
def _get_file_from_data_dir(
112-
self, file_to_get: str, raise_err: bool = False
113-
) -> str:
109+
def _get_file_from_data_dir(self, file_to_get: str, raise_err: bool = False) -> str:
114110
base_path: Path = self._get_data_dir()
115111
file_path = base_path / file_to_get
116112
if raise_err and not file_path.exists():
117-
self.stream_error_and_exit(
118-
f"{file_to_get} is missing in TOOL_DATA_DIR"
119-
)
113+
self.stream_error_and_exit(f"{file_to_get} is missing in TOOL_DATA_DIR")
120114
return str(file_path)
121115

122116
def get_source_file(self) -> str:
@@ -171,17 +165,11 @@ def _get_exec_metadata(self) -> dict[str, Any]:
171165
with open(metadata_path, encoding="utf-8") as f:
172166
metadata_json = loads(f.read())
173167
except JSONDecodeError as e:
174-
self.stream_error_and_exit(
175-
f"JSON decode error for {metadata_path}: {e}"
176-
)
168+
self.stream_error_and_exit(f"JSON decode error for {metadata_path}: {e}")
177169
except FileNotFoundError:
178-
self.stream_error_and_exit(
179-
f"Metadata file not found at {metadata_path}"
180-
)
170+
self.stream_error_and_exit(f"Metadata file not found at {metadata_path}")
181171
except OSError as e:
182-
self.stream_error_and_exit(
183-
f"OS Error while opening {metadata_path}: {e}"
184-
)
172+
self.stream_error_and_exit(f"OS Error while opening {metadata_path}: {e}")
185173
return metadata_json
186174

187175
def _write_exec_metadata(self, metadata: dict[str, Any]) -> None:
@@ -204,18 +192,12 @@ def _update_exec_metadata(self) -> None:
204192
tool_metadata = {
205193
MetadataKey.TOOL_NAME: self.properties[PropKey.FUNCTION_NAME],
206194
MetadataKey.ELAPSED_TIME: self.elapsed_time(),
207-
MetadataKey.OUTPUT_TYPE: self.properties[PropKey.RESULT][
208-
PropKey.TYPE
209-
],
195+
MetadataKey.OUTPUT_TYPE: self.properties[PropKey.RESULT][PropKey.TYPE],
210196
}
211197
if MetadataKey.TOTAL_ELA_TIME not in self._exec_metadata:
212-
self._exec_metadata[
213-
MetadataKey.TOTAL_ELA_TIME
214-
] = self.elapsed_time()
198+
self._exec_metadata[MetadataKey.TOTAL_ELA_TIME] = self.elapsed_time()
215199
else:
216-
self._exec_metadata[
217-
MetadataKey.TOTAL_ELA_TIME
218-
] += self.elapsed_time()
200+
self._exec_metadata[MetadataKey.TOTAL_ELA_TIME] += self.elapsed_time()
219201

220202
if MetadataKey.TOOL_META not in self._exec_metadata:
221203
self._exec_metadata[MetadataKey.TOOL_META] = [tool_metadata]
@@ -224,16 +206,35 @@ def _update_exec_metadata(self) -> None:
224206

225207
self._write_exec_metadata(metadata=self._exec_metadata)
226208

209+
def update_exec_metadata(self, metadata: dict[str, Any]) -> None:
    """Merge the given entries into the execution metadata and persist it.

    Every key-value pair in ``metadata`` is written into the tool's
    in-memory ``_exec_metadata`` dictionary, replacing the value of any
    key that is already present. The complete, updated dictionary is then
    handed to ``_write_exec_metadata``, which is expected to store it in
    the ``METADATA.json`` file in the tool's data directory.

    Args:
        metadata (dict[str, Any]): Key-value pairs to add to, or
            overwrite in, the execution metadata.
    """
    # dict.update performs the same per-key assignment as an explicit
    # loop over metadata.items(), in a single call.
    self._exec_metadata.update(metadata)
    self._write_exec_metadata(metadata=self._exec_metadata)
229+
227230
def write_tool_result(self, data: Union[str, dict[str, Any]]) -> None:
228231
"""Helps write contents of the tool result into TOOL_DATA_DIR.
229232
230233
Args:
231234
data (Union[str, dict[str, Any]]): Data to be written
232235
"""
233236
output_type = self.properties[PropKey.RESULT][PropKey.TYPE]
234-
if output_type is PropKey.OutputType.JSON and not isinstance(
235-
data, dict
236-
):
237+
if output_type is PropKey.OutputType.JSON and not isinstance(data, dict):
237238
# TODO: Validate JSON type output with output schema as well
238239
self.stream_error_and_exit(
239240
f"Expected result to have type {PropKey.OutputType.JSON} "

src/unstract/sdk/x2txt.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from unstract.adapters.constants import Common
66
from unstract.adapters.x2text import adapters
77
from unstract.adapters.x2text.constants import X2TextConstants
8+
from unstract.adapters.x2text.dto import TextExtractionResult
89
from unstract.adapters.x2text.x2text_adapter import X2TextAdapter
910

1011
from unstract.sdk.adapters import ToolAdapter
@@ -67,7 +68,7 @@ def process(
6768
input_file_path: str,
6869
output_file_path: Optional[str] = None,
6970
**kwargs: dict[Any, Any],
70-
) -> str:
71+
) -> TextExtractionResult:
7172
return self._x2text_instance.process(
7273
input_file_path, output_file_path, **kwargs
7374
)

0 commit comments

Comments
 (0)