Skip to content

Commit ef00f81

Browse files
gaya3-zipstackchandrasekharan-zipstackmuhammad-ali-e
authored
File Storage interface and implementation (#112)
* contextSizeChanges * contextSizeChanges * Version roll and test folder check in * Fix enum values * Fix test cases, address review comments * Address review comments * Update pyproject.toml Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Gayathri <[email protected]> * Address mypy issues * Change class design and implementation * Remove unused definitions * Add cp() and function refactoring * Check-in sample env * Default value of dict changed to None * Add size() * Refactor for using FileStorage * Refactor to use FileStorage * Fix issues * Add mime_type, download functions * change comments * Refactor get_hash_from_file * Add return types * Remove permanent file storage from sdk * Fix SDK functional issues * Support minio * Test cases for Minio * Bring file variants back to sdk * Fix copy_on_write * Add new test cases for upload/download * Add new functions to support platform-service * Change modification_time return type to datetime * Refactor env pick-up logic * Sample env * contextSizeChanges * Remove commented code and some improvisations * contextSizeChanges * Add right JSON formatted string * Update src/unstract/sdk/file_storage/fs_permanent.py Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Gayathri <[email protected]> * Address review comments * Address review comments * Update src/unstract/sdk/file_storage/fs_shared_temporary.py Co-authored-by: ali <[email protected]> Signed-off-by: Gayathri <[email protected]> * Refactor for change in enum value * Add return type --------- Signed-off-by: Gayathri <[email protected]> Co-authored-by: Chandrasekharan M <[email protected]> Co-authored-by: ali <[email protected]>
1 parent 8c4610f commit ef00f81

File tree

30 files changed

+2903
-1169
lines changed

30 files changed

+2903
-1169
lines changed

pdm.lock

Lines changed: 1268 additions & 1055 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,13 @@ scripts = { unstract-tool-gen = "unstract.sdk.scripts.tool_gen:main" }
8282

8383
[tool.pdm.dev-dependencies]
8484
docs = [ "lazydocs~=0.4.8" ]
85-
test = [ "parameterized==0.9.0" ]
85+
test = [
86+
"parameterized==0.9.0",
87+
"pytest==8.3.3",
88+
"pytest-mock==3.14.0",
89+
"gcsfs==2024.10.0",
90+
"s3fs==2024.10.0"
91+
]
8692
lint = [
8793
"autopep8~=2.0.2",
8894
"black~=23.3.0",

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.53.0"
1+
__version__ = "0.54.0rc1"
22

33

44
def get_sdk_version():

src/unstract/sdk/adapters/ocr/google_document_ai/src/google_document_ai.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from unstract.sdk.adapters.exceptions import AdapterError
1313
from unstract.sdk.adapters.ocr.constants import FileType
1414
from unstract.sdk.adapters.ocr.ocr_adapter import OCRAdapter
15+
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
1516

1617
logger = logging.getLogger(__name__)
1718

@@ -95,10 +96,13 @@ def _get_request_headers(self) -> dict[str, Any]:
9596

9697
""" Detect the mime type from the file content """
9798

98-
def _get_input_file_type_mime(self, input_file_path: str) -> str:
99-
with open(input_file_path, mode="rb") as file_obj:
100-
sample_contents = file_obj.read(100)
101-
file_type = filetype.guess(sample_contents)
99+
def _get_input_file_type_mime(
100+
self,
101+
input_file_path: str,
102+
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
103+
) -> str:
104+
sample_contents = fs.read(path=input_file_path, mode="rb", length=100)
105+
file_type = filetype.guess(sample_contents)
102106

103107
file_type_mime: str = file_type.MIME if file_type else FileType.TEXT_PLAIN
104108

@@ -110,13 +114,15 @@ def _get_input_file_type_mime(self, input_file_path: str) -> str:
110114
return file_type_mime
111115

112116
def process(
113-
self, input_file_path: str, output_file_path: Optional[str] = None
117+
self,
118+
input_file_path: str,
119+
output_file_path: Optional[str] = None,
120+
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
114121
) -> str:
115122
try:
116123
file_type_mime = self._get_input_file_type_mime(input_file_path)
117-
if os.path.isfile(input_file_path):
118-
with open(input_file_path, "rb") as fop:
119-
file_content_in_bytes: bytes = fop.read()
124+
if fs.exists(input_file_path):
125+
file_content_in_bytes = fs.read(path=input_file_path, mode="rb")
120126
else:
121127
raise AdapterError(f"File not found {input_file_path}")
122128
processor_url = self.config.get(Constants.URL, "") + ":process"
@@ -131,19 +137,14 @@ def process(
131137
response_json: dict[str, Any] = response.json()
132138
result_text: str = response_json["document"]["text"]
133139
if output_file_path is not None:
134-
with open(output_file_path, "w", encoding="utf-8") as f:
135-
f.write(result_text)
136-
f.close()
140+
fs.write(path=output_file_path, mode="w", encoding="utf-8")
137141
return result_text
138142
except Exception as e:
139143
logger.error(f"Error while processing document {e}")
140144
if not isinstance(e, AdapterError):
141145
raise AdapterError(str(e))
142146
else:
143147
raise e
144-
finally:
145-
if fop is not None:
146-
fop.close()
147148

148149
def test_connection(self) -> bool:
149150
try:

src/unstract/sdk/adapters/utils.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from requests.exceptions import RequestException
77

88
from unstract.sdk.adapters.constants import Common
9+
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
910

1011

1112
class AdapterUtils:
@@ -34,8 +35,13 @@ def get_msg_from_request_exc(
3435
return err_response.text # type: ignore
3536
return default_err
3637

38+
# ToDo: get_file_mime_type() to be removed once migrated to FileStorage
39+
# FileStorage has mime_type() which could be used instead.
3740
@staticmethod
38-
def get_file_mime_type(input_file: Path) -> str:
41+
def get_file_mime_type(
42+
input_file: Path,
43+
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
44+
) -> str:
3945
"""Gets the file MIME type for an input file. Uses libmagic to perform
4046
the same.
4147
@@ -45,15 +51,15 @@ def get_file_mime_type(input_file: Path) -> str:
4551
Returns:
4652
str: MIME type of the file
4753
"""
48-
input_file_mime = ""
49-
with open(input_file, mode="rb") as input_file_obj:
50-
sample_contents = input_file_obj.read(100)
51-
input_file_mime = magic.from_buffer(sample_contents, mime=True)
52-
input_file_obj.seek(0)
54+
sample_contents = fs.read(path=input_file, mode="rb", length=100)
55+
input_file_mime = magic.from_buffer(sample_contents, mime=True)
5356
return input_file_mime
5457

5558
@staticmethod
56-
def guess_extention(input_file_path: str) -> str:
59+
def guess_extention(
60+
input_file_path: str,
61+
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
62+
) -> str:
5763
"""Returns the extension of the file passed.
5864
5965
Args:
@@ -63,8 +69,8 @@ def guess_extention(input_file_path: str) -> str:
6369
str: File extension
6470
"""
6571
input_file_extention = ""
66-
with open(input_file_path, mode="rb") as file_obj:
67-
sample_contents = file_obj.read(100)
72+
sample_contents = fs.read(path=input_file_path, mode="rb", length=100)
73+
if sample_contents:
6874
file_type = filetype.guess(sample_contents)
6975
input_file_extention = file_type.EXTENSION
7076
return input_file_extention

src/unstract/sdk/adapters/x2text/helper.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from unstract.sdk.adapters.exceptions import AdapterError
99
from unstract.sdk.adapters.utils import AdapterUtils
1010
from unstract.sdk.adapters.x2text.constants import X2TextConstants
11+
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
1112

1213
logger = logging.getLogger(__name__)
1314

@@ -17,7 +18,9 @@ class X2TextHelper:
1718

1819
@staticmethod
1920
def parse_response(
20-
response: Response, out_file_path: Optional[str] = None
21+
response: Response,
22+
out_file_path: Optional[str] = None,
23+
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
2124
) -> tuple[str, bool]:
2225
"""Parses the response from a request.
2326
@@ -27,6 +30,8 @@ def parse_response(
2730
response (Response): Response to parse
2831
out_file_path (Optional[str], optional): Output file path to write
2932
to, skipped if None or empty. Defaults to None.
33+
fs (FileStorage): file storage object to perform file operations
34+
3035
Returns:
3136
tuple[str, bool]: Response's content and status of parsing
3237
"""
@@ -35,8 +40,7 @@ def parse_response(
3540
if isinstance(response.content, bytes):
3641
output = response.content.decode("utf-8")
3742
if out_file_path:
38-
with open(out_file_path, "w", encoding="utf-8") as f:
39-
f.write(output)
43+
fs.write(path=out_file_path, mode="w", encoding="utf-8", data=output)
4044
return output, True
4145

4246

@@ -49,9 +53,7 @@ class UnstructuredHelper:
4953
PROCESS = "process"
5054

5155
@staticmethod
52-
def test_server_connection(
53-
unstructured_adapter_config: dict[str, Any]
54-
) -> bool:
56+
def test_server_connection(unstructured_adapter_config: dict[str, Any]) -> bool:
5557
UnstructuredHelper.make_request(
5658
unstructured_adapter_config, UnstructuredHelper.TEST_CONNECTION
5759
)
@@ -62,21 +64,23 @@ def process_document(
6264
unstructured_adapter_config: dict[str, Any],
6365
input_file_path: str,
6466
output_file_path: Optional[str] = None,
67+
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
6568
) -> str:
6669
try:
6770
response: Response
71+
local_storage = FileStorage(FileStorageProvider.LOCAL)
72+
if not local_storage.exists(input_file_path):
73+
fs.download(from_path=input_file_path, to_path=input_file_path)
6874
with open(input_file_path, "rb") as input_f:
69-
mime_type = AdapterUtils.get_file_mime_type(
70-
input_file=input_file_path
71-
)
75+
mime_type = AdapterUtils.get_file_mime_type(input_file=input_file_path)
7276
files = {"file": (input_file_path, input_f, mime_type)}
7377
response = UnstructuredHelper.make_request(
7478
unstructured_adapter_config=unstructured_adapter_config,
7579
request_type=UnstructuredHelper.PROCESS,
7680
files=files,
7781
)
7882
output, is_success = X2TextHelper.parse_response(
79-
response=response, out_file_path=output_file_path
83+
response=response, out_file_path=output_file_path, fs=fs
8084
)
8185
if not is_success:
8286
raise AdapterError("Couldn't extract text from file")
@@ -95,9 +99,7 @@ def make_request(
9599
request_type: str,
96100
**kwargs: dict[Any, Any],
97101
) -> Response:
98-
unstructured_url = unstructured_adapter_config.get(
99-
UnstructuredHelper.URL
100-
)
102+
unstructured_url = unstructured_adapter_config.get(UnstructuredHelper.URL)
101103

102104
x2text_service_url = unstructured_adapter_config.get(
103105
X2TextConstants.X2TEXT_HOST

src/unstract/sdk/adapters/x2text/llama_parse/src/llama_parse.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from unstract.sdk.adapters.x2text.dto import TextExtractionResult
1212
from unstract.sdk.adapters.x2text.llama_parse.src.constants import LlamaParseConfig
1313
from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
14+
from unstract.sdk.file_storage import FileStorage, FileStorageProvider
1415

1516
logger = logging.getLogger(__name__)
1617

@@ -46,8 +47,8 @@ def get_json_schema() -> str:
4647
def _call_parser(
4748
self,
4849
input_file_path: str,
50+
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
4951
) -> str:
50-
5152
parser = LlamaParse(
5253
api_key=self.config.get(LlamaParseConfig.API_KEY),
5354
base_url=self.config.get(LlamaParseConfig.BASE_URL),
@@ -61,7 +62,9 @@ def _call_parser(
6162
file_extension = pathlib.Path(input_file_path).suffix
6263
if not file_extension:
6364
try:
64-
input_file_extension = AdapterUtils.guess_extention(input_file_path)
65+
input_file_extension = AdapterUtils.guess_extention(
66+
input_file_path, fs
67+
)
6568
input_file_path_copy = input_file_path
6669
input_file_path = ".".join(
6770
(input_file_path_copy, input_file_extension)
@@ -70,7 +73,8 @@ def _call_parser(
7073
logger.error("Exception raised while handling input file.")
7174
raise AdapterError(str(os_err))
7275

73-
documents = parser.load_data(input_file_path)
76+
file_bytes = fs.read(path=input_file_path, mode="rb")
77+
documents = parser.load_data(file_bytes)
7478

7579
except ConnectError as connec_err:
7680
logger.error(f"Invalid Base URL given. : {connec_err}")
@@ -91,13 +95,17 @@ def process(
9195
self,
9296
input_file_path: str,
9397
output_file_path: Optional[str] = None,
98+
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
9499
**kwargs: dict[Any, Any],
95100
) -> TextExtractionResult:
96-
97-
response_text = self._call_parser(input_file_path=input_file_path)
101+
response_text = self._call_parser(input_file_path=input_file_path, fs=fs)
98102
if output_file_path:
99-
with open(output_file_path, "w", encoding="utf-8") as f:
100-
f.write(response_text)
103+
fs.write(
104+
path=output_file_path,
105+
mode="w",
106+
encoding="utf-8",
107+
data=response_text,
108+
)
101109

102110
return TextExtractionResult(extracted_text=response_text)
103111

0 commit comments

Comments
 (0)