Skip to content

Commit 9a9b6a8

Browse files
passing tags to x2text adapter in kwargs (#150)
* passing tags to x2text adapter in kwargs * redefined the DTO for params * Update src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/dto.py Signed-off-by: Chandrasekharan M <[email protected]> --------- Signed-off-by: Chandrasekharan M <[email protected]> Co-authored-by: Chandrasekharan M <[email protected]>
1 parent 80d3561 commit 9a9b6a8

File tree

7 files changed

+53
-5
lines changed

7 files changed

+53
-5
lines changed

src/unstract/sdk/adapters/x2text/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ class X2TextConstants:
33
X2TEXT_HOST = "X2TEXT_HOST"
44
X2TEXT_PORT = "X2TEXT_PORT"
55
ENABLE_HIGHLIGHT = "enable_highlight"
6+
TAGS = "tags"
67
EXTRACTED_TEXT = "extracted_text"
78
WHISPER_HASH = "whisper-hash"
89
WHISPER_HASH_V2 = "whisper_hash"
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from dataclasses import dataclass
2+
from typing import Optional
3+
4+
5+
@dataclass
class WhispererRequestParams:
    """DTO for extra LLM Whisperer API request parameters.

    Args:
        tag: Tag value for the request. May be given as a ``str``, a list
            or tuple of tags, or ``None``. Multiple tags are not yet
            supported by LLMW v2 (see the TODO in ``__post_init__``), so a
            list/tuple is reduced to its first element, or to ``None``
            when it is empty. After initialization ``tag`` is expected to
            be ``str | None``.
    """

    # TODO: Extend this DTO to include all Whisperer API parameters
    tag: Optional[str] = None

    def __post_init__(self) -> None:
        """Collapse a list/tuple of tags into a single tag or ``None``."""
        # TODO: Allow list of tags once its supported in LLMW v2
        # Tuples are normalized as well as lists so that any plain sequence
        # of tags ends up as a single value rather than leaking through.
        if isinstance(self.tag, (list, tuple)):
            self.tag = self.tag[0] if self.tag else None

src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
WhispererHeader,
2121
WhisperStatus,
2222
)
23+
from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.dto import WhispererRequestParams
2324
from unstract.sdk.constants import MimeType
2425
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
2526

@@ -108,7 +109,9 @@ def make_request(
108109
return response
109110

110111
@staticmethod
111-
def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]:
112+
def get_whisperer_params(
113+
config: dict[str, Any], extra_params: WhispererRequestParams
114+
) -> dict[str, Any]:
112115
"""Gets query params meant for /whisper endpoint.
113116
114117
The params is filled based on the configuration passed.
@@ -152,7 +155,8 @@ def get_whisperer_params(config: dict[str, Any]) -> dict[str, Any]:
152155
),
153156
# Not providing default value to maintain legacy compatibility
154157
# these are optional params and identifiers for audit
155-
WhispererConfig.TAG: config.get(
158+
WhispererConfig.TAG: extra_params.tag
159+
or config.get(
156160
WhispererConfig.TAG,
157161
WhispererDefaults.TAG,
158162
),
@@ -292,11 +296,14 @@ def extract_async(config: dict[str, Any], whisper_hash: str) -> dict[Any, Any]:
292296
def send_whisper_request(
293297
input_file_path: str,
294298
config: dict[str, Any],
299+
extra_params: WhispererRequestParams,
295300
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
296301
) -> requests.Response:
297302
headers = LLMWhispererHelper.get_request_headers(config)
298303
headers["Content-Type"] = "application/octet-stream"
299-
params = LLMWhispererHelper.get_whisperer_params(config)
304+
params = LLMWhispererHelper.get_whisperer_params(
305+
config=config, extra_params=extra_params
306+
)
300307

301308
response: requests.Response
302309
try:

src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
HTTPMethod,
1515
WhispererEndpoint,
1616
)
17+
from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.dto import WhispererRequestParams
1718
from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.helper import LLMWhispererHelper
1819
from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
1920
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
@@ -76,8 +77,12 @@ def process(
7677
str: Extracted text
7778
"""
7879

80+
extra_params = WhispererRequestParams(tag=kwargs.get(X2TextConstants.TAGS))
7981
response: requests.Response = LLMWhispererHelper.send_whisper_request(
80-
input_file_path, self.config, fs=fs
82+
input_file_path=input_file_path,
83+
config=self.config,
84+
fs=fs,
85+
extra_params=extra_params,
8186
)
8287
response_text = response.text
8388
reponse_dict = json.loads(response_text)

src/unstract/sdk/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ class MetadataKey:
128128
WORKFLOW_ID = "workflow_id"
129129
EXECUTION_ID = "execution_id"
130130
FILE_EXECUTION_ID = "file_execution_id"
131+
TAGS = "tags"
131132
ORG_ID = "organization_id"
132133
TOOL_META = "tool_metadata"
133134
TOOL_NAME = "tool_name"

src/unstract/sdk/index.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ def extract_text(
128128
usage_kwargs: dict[Any, Any] = {},
129129
process_text: Optional[Callable[[str], str]] = None,
130130
fs: FileStorage = FileStorage(FileStorageProvider.LOCAL),
131+
tags: Optional[list[str]] = None,
131132
) -> str:
132133
"""Extracts text from a document.
133134
@@ -147,6 +148,7 @@ def extract_text(
147148
Defaults to {}.
148149
process_text (Optional[Callable[[str], str]], optional): Optional function
149150
to post-process the text. Defaults to None.
151+
tags (Optional[list[str]], optional): Tags passed on to the X2Text
    adapter's process call. Defaults to None.
150152
151153
Raises:
152154
IndexingError: Errors during text extraction
@@ -164,14 +166,18 @@ def extract_text(
164166
input_file_path=file_path,
165167
output_file_path=output_file_path,
166168
enable_highlight=enable_highlight,
169+
tags=tags,
167170
fs=fs,
168171
)
169172
whisper_hash_value = process_response.extraction_metadata.whisper_hash
170173
metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
171174
self.tool.update_exec_metadata(metadata)
172175
else:
173176
process_response: TextExtractionResult = x2text.process(
174-
input_file_path=file_path, output_file_path=output_file_path, fs=fs
177+
input_file_path=file_path,
178+
output_file_path=output_file_path,
179+
tags=tags,
180+
fs=fs,
175181
)
176182
extracted_text = process_response.extracted_text
177183
# TODO: Handle prepend of context where error is raised and remove this
@@ -193,6 +199,7 @@ def extract_text(
193199
)
194200
return extracted_text
195201

202+
# TODO: Reduce the number of params by some dataclass
196203
@log_elapsed(operation="CHECK_AND_INDEX(overall)")
197204
@capture_metrics
198205
def index(
@@ -211,6 +218,7 @@ def index(
211218
usage_kwargs: dict[Any, Any] = {},
212219
process_text: Optional[Callable[[str], str]] = None,
213220
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
221+
tags: Optional[list[str]] = None,
214222
) -> str:
215223
"""Indexes an individual file using the passed arguments.
216224
@@ -231,6 +239,8 @@ def index(
231239
output_file_path (Optional[str], optional): File path to write
232240
the extracted contents into. Defaults to None.
233241
fs (FileStorage): file storage object to perform file operations
242+
tags (Optional[list[str]], optional): List of tags to be associated with
243+
the indexed document.
234244
235245
Returns:
236246
str: A unique ID for the file and indexing arguments combination
@@ -300,6 +310,7 @@ def index(
300310
usage_kwargs=usage_kwargs,
301311
process_text=process_text,
302312
fs=fs,
313+
tags=tags,
303314
)
304315
return doc_id
305316

@@ -310,6 +321,7 @@ def index(
310321
enable_highlight=enable_highlight,
311322
usage_kwargs=usage_kwargs,
312323
process_text=process_text,
324+
tags=tags,
313325
fs=fs,
314326
)
315327
if not extracted_text:

src/unstract/sdk/tool/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def __init__(self, log_level: LogLevel = LogLevel.INFO) -> None:
4040
self.workflow_id = ""
4141
self.execution_id = ""
4242
self.file_execution_id = ""
43+
self.tags = []
4344
self.source_file_name = ""
4445
self.org_id = ""
4546
self._exec_metadata = {}
@@ -90,6 +91,7 @@ def from_tool_args(cls, args: list[str]) -> "BaseTool":
9091
tool.file_execution_id = tool._exec_metadata.get(
9192
MetadataKey.FILE_EXECUTION_ID, ""
9293
)
94+
tool.tags = tool._exec_metadata.get(MetadataKey.TAGS, [])
9395
tool.source_file_name = tool._exec_metadata.get(MetadataKey.SOURCE_NAME, "")
9496
tool.org_id = tool._exec_metadata.get(MetadataKey.ORG_ID)
9597
return tool

0 commit comments

Comments
 (0)