Skip to content

Commit be328d6

Browse files
Remove feature flag related code changes for remote storage (#157)
* Remove feature flag related code changes
* Fix summary issue
* Read entire length for mime type detection
* Add MINIO as permanent file storage provider
* Bump SDK version
* Fix SDK for OSS use cases
* Promote to GA

---------

Signed-off-by: Gayathri <[email protected]>
1 parent abd949b commit be328d6

File tree

13 files changed

+135
-106
lines changed

13 files changed

+135
-106
lines changed

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.57.0rc5"
1+
__version__ = "0.58.0"
22

33

44
def get_sdk_version():

src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ def send_whisper_request(
225225

226226
response: requests.Response
227227
try:
228-
input_file_data = BytesIO(fs.read(input_file_path, "rb"))
228+
input_file_data = BytesIO(fs.read(path=input_file_path, mode="rb"))
229229
response = LLMWhispererHelper.make_request(
230230
config=config,
231231
params=params,
@@ -276,7 +276,10 @@ def write_output_to_file(
276276
text_output = output_json.get("result_text", "")
277277
logger.info(f"Writing output to {output_file_path}")
278278
fs.write(
279-
path=str(output_file_path), mode="w", data=text_output, encoding="utf-8"
279+
path=str(output_file_path),
280+
mode="w",
281+
data=text_output,
282+
encoding="utf-8",
280283
)
281284
except Exception as e:
282285
logger.error(f"Error while writing {output_file_path}: {e}")

src/unstract/sdk/constants.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,13 @@ class ToolEnv:
1111
PLATFORM_API_KEY (str): Platform service API key.
1212
PLATFORM_HOST (str): Platform service host.
1313
PLATFORM_PORT (str): Platform service port.
14-
DATA_DIR (str): The environment variable for the tool data directory.
1514
EXECUTION_BY_TOOL (str): Implicitly set to 1 by the SDK if its executed
1615
by a tool.
1716
"""
1817

1918
PLATFORM_API_KEY = "PLATFORM_SERVICE_API_KEY"
2019
PLATFORM_HOST = "PLATFORM_SERVICE_HOST"
2120
PLATFORM_PORT = "PLATFORM_SERVICE_PORT"
22-
DATA_DIR = "TOOL_DATA_DIR"
2321
EXECUTION_BY_TOOL = "EXECUTION_BY_TOOL"
2422
EXECUTION_DATA_DIR = "EXECUTION_DATA_DIR"
2523
WORKFLOW_EXECUTION_FILE_STORAGE_CREDENTIALS = (

src/unstract/sdk/file_storage/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
class FileOperationParams:
55
READ_ENTIRE_LENGTH = -1
6-
MIME_TYPE_DEFAULT_READ_LENGTH = 100
76
EXTENSION_DEFAULT_READ_LENGTH = 100
87
DEFAULT_ENCODING = "utf-8"
98

src/unstract/sdk/file_storage/impl.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def read(
4141
4242
Args:
4343
path (str): Path to the file to be opened
44-
mode (str): Mode in whicg the file is to be opened. Usual options
44+
mode (str): Mode in which the file is to be opened. Usual options
4545
include r, rb, w and wb
4646
encoding (str): Encoding type like utf-8 or utf-16
4747
seek_position (int): Position to start reading from
@@ -212,19 +212,18 @@ def modification_time(self, path: str) -> datetime:
212212
file_mtime = datetime.fromtimestamp(file_mtime)
213213
return file_mtime
214214

215-
@skip_local_cache
216215
def mime_type(
217216
self,
218217
path: str,
219-
read_length: int = FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
218+
read_length: int = FileOperationParams.READ_ENTIRE_LENGTH,
220219
) -> str:
221220
"""Gets the file MIME type for an input file. Uses libmagic to perform
222221
the same.
223222
224223
Args:
225224
path (str): Path of the input file
226225
read_length (int): Length(bytes) to be read from the file for in
227-
order to identify the mime type
226+
order to identify the mime type. Defaults to read the entire length.
228227
229228
Returns:
230229
str: MIME type of the file
@@ -364,7 +363,6 @@ def yaml_load(
364363
data: dict[str, Any] = yaml.safe_load(f)
365364
return data
366365

367-
@skip_local_cache
368366
def guess_extension(self, path: str) -> str:
369367
"""Returns the extension of the file passed.
370368

src/unstract/sdk/file_storage/interface.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def modification_time(self, path: str) -> datetime:
7777
def mime_type(
7878
self,
7979
path: str,
80-
read_length: int = FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
80+
read_length: int = FileOperationParams.READ_ENTIRE_LENGTH,
8181
) -> str:
8282
pass
8383

src/unstract/sdk/file_storage/permanent.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import logging
22
from typing import Any, Optional, Union
33

4+
import filetype
5+
import magic
6+
47
from unstract.sdk.exceptions import FileOperationError, FileStorageError
58
from unstract.sdk.file_storage.constants import FileOperationParams
69
from unstract.sdk.file_storage.impl import FileStorage
@@ -13,6 +16,7 @@ class PermanentFileStorage(FileStorage):
1316
SUPPORTED_FILE_STORAGE_TYPES = [
1417
FileStorageProvider.GCS.value,
1518
FileStorageProvider.S3.value,
19+
FileStorageProvider.MINIO.value,
1620
FileStorageProvider.LOCAL.value,
1721
]
1822

@@ -27,7 +31,11 @@ def __init__(
2731
f"supported in Permanent mode. "
2832
f"Supported providers: {self.SUPPORTED_FILE_STORAGE_TYPES}"
2933
)
30-
if provider == FileStorageProvider.GCS or provider == FileStorageProvider.LOCAL:
34+
if (
35+
provider == FileStorageProvider.GCS
36+
or provider == FileStorageProvider.LOCAL
37+
or provider == FileStorageProvider.MINIO
38+
):
3139
super().__init__(provider, **storage_config)
3240
else:
3341
raise NotImplementedError(f"Provider {provider.value} is not implemented")
@@ -53,6 +61,11 @@ def _copy_on_read(self, path: str, legacy_storage_path: str):
5361
# to remote storage
5462
if local_file_storage.exists(local_file_path):
5563
self.upload(local_file_path, path)
64+
logger.info(
65+
f"Uploading {local_file_path} from "
66+
f"{local_file_storage.provider} to remote "
67+
f"storage {self.provider} in the path {path}"
68+
)
5669

5770
def read(
5871
self,
@@ -87,3 +100,54 @@ def read(
87100
if isinstance(e, FileNotFoundError) or isinstance(e, FileOperationError):
88101
raise e
89102
raise FileOperationError(str(e)) from e
103+
104+
def mime_type(
105+
self,
106+
path: str,
107+
read_length: int = FileOperationParams.READ_ENTIRE_LENGTH,
108+
legacy_storage_path: Optional[str] = None,
109+
) -> str:
110+
"""Gets the file MIME type for an input file. Uses libmagic to perform
111+
the same.
112+
113+
Args:
114+
path (str): Path of the input file
115+
read_length (int): Length(bytes) to be read from the file for in
116+
order to identify the mime type. Defaults to read the entire length.
117+
legacy_storage_path (str): Legacy path to the same file
118+
119+
Returns:
120+
str: MIME type of the file
121+
"""
122+
sample_contents = self.read(
123+
path=path,
124+
mode="rb",
125+
length=read_length,
126+
legacy_storage_path=legacy_storage_path,
127+
)
128+
mime_type = magic.from_buffer(sample_contents, mime=True)
129+
return mime_type
130+
131+
def guess_extension(
132+
self, path: str, legacy_storage_path: Optional[str] = None
133+
) -> str:
134+
"""Returns the extension of the file passed.
135+
136+
Args:
137+
path (str): String holding the path
138+
legacy_storage_path (str): Legacy path to the same file
139+
140+
Returns:
141+
str: File extension
142+
"""
143+
file_extension = ""
144+
sample_contents = self.read(
145+
path=path,
146+
mode="rb",
147+
length=FileOperationParams.EXTENSION_DEFAULT_READ_LENGTH,
148+
legacy_storage_path=legacy_storage_path,
149+
)
150+
if sample_contents:
151+
file_type = filetype.guess(sample_contents)
152+
file_extension = file_type.EXTENSION
153+
return file_extension

src/unstract/sdk/index.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,7 @@ def index(
297297

298298
if doc_id_found and not reindex:
299299
self.tool.stream_log(f"File was indexed already under {doc_id}")
300-
301-
if not fs.exists(output_file_path):
300+
if output_file_path and not fs.exists(output_file_path):
302301
# Added this as a workaround to handle extraction
303302
# for documents uploaded twice in different projects.
304303
# to be reconsidered after permanent fixes.

src/unstract/sdk/tool/base.py

Lines changed: 25 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import datetime
2-
import json
32
import os
43
from abc import ABC, abstractmethod
5-
from json import JSONDecodeError, loads
4+
from json import JSONDecodeError
65
from pathlib import Path
76
from typing import Any, Union
87

@@ -127,34 +126,14 @@ def handle_static_command(self, command: str) -> None:
127126
else:
128127
raise ValueError(f"Unknown command {command}")
129128

130-
def _get_data_dir(self) -> Path:
131-
"""Gets the TOOL_DATA_DIR that houses the input and output files of
132-
tool execution.
133-
134-
Returns:
135-
Path: Path object of the TOOL_DATA_DIR that's configured.
136-
"""
137-
data_dir = self.get_env_or_die(ToolEnv.DATA_DIR)
138-
base_path = Path(data_dir)
139-
if not base_path.exists():
140-
self.stream_error_and_exit(f"{data_dir} does not exist")
141-
if not base_path.is_dir():
142-
self.stream_error_and_exit(f"{data_dir} is not a directory")
143-
return base_path.absolute()
144-
145129
def _get_file_from_data_dir(self, file_to_get: str, raise_err: bool = False) -> str:
146-
if self.workflow_filestorage:
147-
base_path = self.execution_dir
148-
file_path = base_path / file_to_get
149-
if raise_err and not self.workflow_filestorage.exists(path=file_path):
150-
self.stream_error_and_exit(
151-
f"{file_to_get} is missing in EXECUTION_DATA_DIR"
152-
)
153-
else:
154-
base_path: Path = self._get_data_dir()
155-
file_path = base_path / file_to_get
156-
if raise_err and not file_path.exists():
157-
self.stream_error_and_exit(f"{file_to_get} is missing in TOOL_DATA_DIR")
130+
base_path = self.execution_dir
131+
file_path = base_path / file_to_get
132+
if raise_err and not self.workflow_filestorage.exists(path=file_path):
133+
self.stream_error_and_exit(
134+
f"{file_to_get} is missing in EXECUTION_DATA_DIR"
135+
)
136+
158137
return str(file_path)
159138

160139
def get_source_file(self) -> str:
@@ -183,10 +162,7 @@ def get_output_dir(self) -> str:
183162
Returns:
184163
str: Absolute path to the output directory.
185164
"""
186-
if self.workflow_filestorage:
187-
base_path = self.execution_dir
188-
else:
189-
base_path: Path = self._get_data_dir()
165+
base_path = self.execution_dir
190166
return str(base_path / ToolExecKey.OUTPUT_DIR)
191167

192168
@property
@@ -205,20 +181,13 @@ def _get_exec_metadata(self) -> dict[str, Any]:
205181
Returns:
206182
dict[str, Any]: Contents of METADATA.json
207183
"""
208-
if self.workflow_filestorage:
209-
base_path = self.execution_dir
210-
else:
211-
base_path: Path = self._get_data_dir()
184+
base_path = self.execution_dir
212185
metadata_path = base_path / ToolExecKey.METADATA_FILE
213186
metadata_json = {}
214187
try:
215-
if self.workflow_filestorage:
216-
metadata_json = ToolUtils.load_json(
217-
file_to_load=metadata_path, fs=self.workflow_filestorage
218-
)
219-
else:
220-
with open(metadata_path, encoding="utf-8") as f:
221-
metadata_json = loads(f.read())
188+
metadata_json = ToolUtils.load_json(
189+
file_to_load=metadata_path, fs=self.workflow_filestorage
190+
)
222191
except JSONDecodeError as e:
223192
self.stream_error_and_exit(f"JSON decode error for {metadata_path}: {e}")
224193
except FileNotFoundError:
@@ -233,19 +202,13 @@ def _write_exec_metadata(self, metadata: dict[str, Any]) -> None:
233202
Args:
234203
metadata (dict[str, Any]): Metadata to write
235204
"""
236-
if self.workflow_filestorage:
237-
base_path = self.execution_dir
238-
metadata_path = base_path / ToolExecKey.METADATA_FILE
239-
ToolUtils.dump_json(
240-
file_to_dump=metadata_path,
241-
json_to_dump=metadata,
242-
fs=self.workflow_filestorage,
243-
)
244-
else:
245-
base_path: Path = self._get_data_dir()
246-
metadata_path = base_path / ToolExecKey.METADATA_FILE
247-
with metadata_path.open("w", encoding="utf-8") as f:
248-
f.write(ToolUtils.json_to_str(metadata))
205+
base_path = self.execution_dir
206+
metadata_path = base_path / ToolExecKey.METADATA_FILE
207+
ToolUtils.dump_json(
208+
file_to_dump=metadata_path,
209+
json_to_dump=metadata,
210+
fs=self.workflow_filestorage,
211+
)
249212

250213
def _update_exec_metadata(self) -> None:
251214
"""Updates the execution metadata after a tool executes.
@@ -314,18 +277,13 @@ def write_tool_result(self, data: Union[str, dict[str, Any]]) -> None:
314277
self.stream_result(result)
315278

316279
self._update_exec_metadata()
317-
json_data = json.dumps(data)
318280
# INFILE is overwritten for next tool to run
319281
input_file_path: Path = Path(self.get_input_file())
320-
if self.workflow_filestorage:
321-
ToolUtils.dump_json(
322-
file_to_dump=input_file_path,
323-
json_to_dump=data,
324-
fs=self.workflow_filestorage,
325-
)
326-
else:
327-
with input_file_path.open("w", encoding="utf-8") as f:
328-
f.write(json_data)
282+
ToolUtils.dump_json(
283+
file_to_dump=input_file_path,
284+
json_to_dump=data,
285+
fs=self.workflow_filestorage,
286+
)
329287

330288
def validate(self, input_file: str, settings: dict[str, Any]) -> None:
331289
"""Override to implement custom validation for the tool.

src/unstract/sdk/tool/executor.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,6 @@ def execute_run(self, args: argparse.Namespace) -> None:
5757
f"Execution ID: {self.tool.execution_id}, "
5858
f"SDK Version: {get_sdk_version()}"
5959
)
60-
if not self.tool.workflow_filestorage:
61-
self._setup_for_run()
6260
validator = ToolValidator(self.tool)
6361
settings = validator.validate_pre_execution(settings=settings)
6462

0 commit comments

Comments (0)