Skip to content

Commit c1a6ed0

Browse files
Support length to be passed while identifying mime type (#134)
* Refactoring changed file names * Roll version * Update src/unstract/sdk/utils/tool_utils.py Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Gayathri <[email protected]> * Update tests/test_fs_permanent.py Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Gayathri <[email protected]> * Update tests/test_fs_permanent.py Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Gayathri <[email protected]> * Update src/unstract/sdk/utils/tool_utils.py Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Gayathri <[email protected]> * Address review comments * Add support for passing length to mime_type * Add recursive and fix mypy issue * CHange test case with new behavior to return FileNotFound in read() * Remove typing kwargs. * Resolve mypy issues * Resolve mypy issues * Remove unwanted conditionals/vars * Remove pandoc and tessaract. * Details of provider added to error message --------- Signed-off-by: Gayathri <[email protected]> Co-authored-by: Chandrasekharan M <[email protected]>
1 parent e2cf928 commit c1a6ed0

File tree

9 files changed

+101
-38
lines changed

9 files changed

+101
-38
lines changed

README.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ They also contain helper methods/classes to aid with other tasks such as indexin
1919
- Linux
2020

2121
```
22-
sudo apt install build-essential pkg-config libmagic-dev tesseract-ocr pandoc
22+
sudo apt install build-essential pkg-config libmagic-dev
2323
```
2424
2525
- Mac
@@ -101,6 +101,16 @@ unstract-sdk @ git+https://github.com/Zipstack/unstract-sdk@feature-branch
101101

102102
- Or try installing a [local PyPI server](https://pypi.org/project/pypiserver/) and upload / download your package from this server
103103

104+
#### Additional dependencies for tool
105+
Tools may need to be backed up by a file storage. unstract.sdk.file_storage contains the required interfaces for the
106+
same. fsspec is used underneath to implement these interfaces. Hence, one can choose to use a file_system
107+
supported by fsspec for this. However, the required dependencies need to be added in the tool dependency manager.
108+
E.g., if the tool is using Minio as the underlying file storage, then s3fs can be added to support it.
109+
Similarly, for Google Cloud Storage, gcsfs is to be added.
110+
The following versions are tested in the SDK using unit test cases for the above package.
111+
gcsfs==2024.10.0
112+
s3fs==2024.10.0
113+
104114

105115
### Documentation generation
106116

src/unstract/sdk/file_storage/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
class FileOperationParams:
55
READ_ENTIRE_LENGTH = -1
6+
MIME_TYPE_DEFAULT_READ_LENGTH = 100
67
DEFAULT_ENCODING = "utf-8"
78

89

src/unstract/sdk/file_storage/env_helper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ def get_storage(storage_type: StorageType, env_name: str) -> FileStorage:
3232
and credentials configured in the env
3333
"""
3434
try:
35-
file_storage_creds = json.loads(os.environ.get(env_name))
35+
file_storage_creds = json.loads(os.environ.get(env_name, ""))
3636
provider = FileStorageProvider(
3737
file_storage_creds[CredentialKeyword.PROVIDER]
3838
)
39-
credentials = file_storage_creds.get(CredentialKeyword.CREDENTIALS, {})
39+
credentials = file_storage_creds.get(CredentialKeyword.CREDENTIALS, "{}")
4040
if storage_type == StorageType.PERMANENT.value:
4141
file_storage = PermanentFileStorage(provider=provider, **credentials)
4242
elif storage_type == StorageType.TEMPORARY.value:

src/unstract/sdk/file_storage/helper.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ def file_storage_init(
2929
try:
3030
protocol = provider.value
3131
if provider == FileStorageProvider.LOCAL:
32-
storage_config.update({"auto_mkdir": True})
32+
# Hard set auto_mkdir to True as default
33+
storage_config.update({"auto_mkdir": True}) # type: ignore
3334
elif provider in [FileStorageProvider.MINIO]:
3435
# Initialise using s3 for Minio
3536
protocol = FileStorageProvider.S3.value

src/unstract/sdk/file_storage/impl.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -161,17 +161,25 @@ def rm(self, path: str, recursive: bool = True):
161161
return self.fs.rm(path=path, recursive=recursive)
162162

163163
@skip_local_cache
164-
def cp(self, src: str, dest: str, overwrite: bool = True):
164+
def cp(
165+
self,
166+
src: str,
167+
dest: str,
168+
recursive: bool = False,
169+
overwrite: bool = True,
170+
):
165171
"""Copies files from source(lpath) path to the destination(rpath) path.
166172
167173
Args:
168174
src (str): Path to the source
169175
dest (str): Path to the destination
176+
recursive (bool): Copy recursively when set to True
177+
overwrite (bool): Overwrite existing path with same name
170178
171179
Returns:
172180
NA
173181
"""
174-
return self.fs.cp(src, dest, overwrite=overwrite)
182+
return self.fs.cp(src, dest, recursive=recursive, overwrite=overwrite)
175183

176184
@skip_local_cache
177185
def size(self, path: str) -> int:
@@ -203,17 +211,23 @@ def modification_time(self, path: str) -> datetime:
203211
return file_mtime
204212

205213
@skip_local_cache
206-
def mime_type(self, path: str) -> str:
214+
def mime_type(
215+
self,
216+
path: str,
217+
read_length: int = FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
218+
) -> str:
207219
"""Gets the file MIME type for an input file. Uses libmagic to perform
208220
the same.
209221
210222
Args:
211223
path (str): Path of the input file
224+
read_length (int): Length (bytes) to be read from the file in
225+
order to identify the mime type
212226
213227
Returns:
214228
str: MIME type of the file
215229
"""
216-
sample_contents = self.read(path=path, mode="rb", length=100)
230+
sample_contents = self.read(path=path, mode="rb", length=read_length)
217231
mime_type = magic.from_buffer(sample_contents, mime=True)
218232
return mime_type
219233

@@ -291,7 +305,7 @@ def json_dump(
291305
self,
292306
path: str,
293307
data: dict[str, Any],
294-
**kwargs: dict[Any, Any],
308+
**kwargs: dict[Any, Any], # type: ignore
295309
):
296310
"""Dumps data into the given file specified by path.
297311
@@ -302,15 +316,15 @@ def json_dump(
302316
"""
303317
try:
304318
with self.fs.open(path=path, mode="w", encoding="utf-8") as f:
305-
json.dump(data, f, **kwargs)
319+
json.dump(obj=data, fp=f, **kwargs) # type: ignore
306320
except Exception as e:
307321
raise FileOperationError(str(e)) from e
308322

309323
def yaml_dump(
310324
self,
311325
path: str,
312326
data: dict[str, Any],
313-
**kwargs: dict[Any, Any],
327+
**kwargs: dict[Any, Any], # type: ignore
314328
):
315329
"""Dumps data into the given file specified by path.
316330
@@ -321,7 +335,7 @@ def yaml_dump(
321335
"""
322336
try:
323337
with self.fs.open(path=path, mode="w", encoding="utf-8") as f:
324-
yaml.dump(data, f, **kwargs)
338+
yaml.dump(data=data, stream=f, **kwargs) # type: ignore
325339
except Exception as e:
326340
raise FileOperationError(str(e)) from e
327341

src/unstract/sdk/file_storage/interface.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,13 @@ def rm(self, path: str, recursive: bool = True):
5656
pass
5757

5858
@abstractmethod
59-
def cp(self, lpath: str, rpath: str):
59+
def cp(
60+
self,
61+
lpath: str,
62+
rpath: str,
63+
recursive: bool = False,
64+
overwrite: bool = True,
65+
):
6066
pass
6167

6268
@abstractmethod
@@ -68,7 +74,11 @@ def modification_time(self, path: str) -> datetime:
6874
pass
6975

7076
@abstractmethod
71-
def mime_type(self, path: str) -> str:
77+
def mime_type(
78+
self,
79+
path: str,
80+
read_length: int = FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
81+
) -> str:
7282
pass
7383

7484
@abstractmethod

src/unstract/sdk/file_storage/permanent.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,10 @@ def __init__(
2828
f"supported in Permanent mode. "
2929
f"Supported providers: {self.SUPPORTED_FILE_STORAGE_TYPES}"
3030
)
31-
if provider == FileStorageProvider.GCS:
31+
if provider == FileStorageProvider.GCS or provider == FileStorageProvider.LOCAL:
3232
super().__init__(provider, **storage_config)
33-
elif provider == FileStorageProvider.LOCAL:
34-
super().__init__(provider)
3533
else:
36-
raise NotImplementedError
34+
raise NotImplementedError(f"Provider {provider.value} is not implemented")
3735

3836
def _copy_on_read(self, path: str, legacy_storage_path: str):
3937
"""Copies the file to the remote storage lazily if not present already.
@@ -76,6 +74,7 @@ def read(
7674
seek_position (int): Position to start reading from
7775
length (int): Number of bytes to be read. Default is full
7876
file content.
77+
legacy_storage_path (str): Legacy path to the same file
7978
8079
Returns:
8180
Union[bytes, str] - File contents in bytes/string based on the opened mode
@@ -85,7 +84,7 @@ def read(
8584
if legacy_storage_path:
8685
self._copy_on_read(path, legacy_storage_path)
8786
return super().read(path, mode, encoding, seek_position, length)
88-
except FileNotFoundError:
89-
logger.warning(f"File {path} not found. Ignoring.")
9087
except Exception as e:
88+
if isinstance(e, FileNotFoundError) or isinstance(e, FileOperationError):
89+
raise e
9190
raise FileOperationError(str(e)) from e

tests/test_file_storage.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from unstract.sdk.constants import MimeType
1111
from unstract.sdk.exceptions import FileOperationError
12-
from unstract.sdk.file_storage.constants import StorageType
12+
from unstract.sdk.file_storage.constants import FileOperationParams, StorageType
1313
from unstract.sdk.file_storage.env_helper import EnvHelper
1414
from unstract.sdk.file_storage.impl import FileStorage
1515
from unstract.sdk.file_storage.provider import FileStorageProvider
@@ -488,29 +488,51 @@ def test_file(provider):
488488

489489

490490
@pytest.mark.parametrize(
491-
"file_storage, lpath, rpath",
491+
"file_storage, lpath, rpath, recursive, expected_result",
492492
[
493493
(
494494
file_storage(provider=FileStorageProvider.GCS),
495495
TEST_CONSTANTS.READ_TEXT_FILE,
496496
TEST_CONSTANTS.TEST_FOLDER,
497+
True,
498+
True,
497499
),
498500
(
499501
file_storage(provider=FileStorageProvider.LOCAL),
500502
TEST_CONSTANTS.READ_TEXT_FILE,
501503
TEST_CONSTANTS.TEST_FOLDER,
504+
True,
505+
True,
506+
),
507+
(
508+
file_storage(provider=FileStorageProvider.LOCAL),
509+
TEST_CONSTANTS.READ_FOLDER_PATH,
510+
TEST_CONSTANTS.TEST_FOLDER,
511+
True,
512+
True,
513+
),
514+
(
515+
file_storage(provider=FileStorageProvider.LOCAL),
516+
TEST_CONSTANTS.READ_FOLDER_PATH,
517+
TEST_CONSTANTS.TEST_FOLDER,
518+
False,
519+
False,
502520
),
503521
(
504522
file_storage(provider=FileStorageProvider.MINIO),
505523
TEST_CONSTANTS.READ_TEXT_FILE,
506524
TEST_CONSTANTS.TEST_FOLDER,
525+
True,
526+
True,
507527
),
508528
],
509529
)
510-
def test_cp(file_storage, lpath, rpath):
511-
file_storage.cp(lpath, rpath, overwrite=True)
512-
assert file_storage.exists(rpath) is True
513-
file_storage.rm(rpath, recursive=True)
530+
def test_cp(file_storage, lpath, rpath, recursive, expected_result):
531+
file_storage.cp(lpath, rpath, recursive=recursive, overwrite=True)
532+
actual_result = file_storage.exists(rpath)
533+
assert actual_result == expected_result
534+
if actual_result:
535+
file_storage.rm(rpath, recursive=True)
514536
assert file_storage.exists(rpath) is False
515537

516538

@@ -566,49 +588,59 @@ def test_file_size(file_storage, path, expected_size):
566588

567589

568590
@pytest.mark.parametrize(
569-
"file_storage, path, expected_mime_type",
591+
"file_storage, path, read_length, expected_mime_type",
570592
[
571593
(
572594
file_storage(provider=FileStorageProvider.GCS),
573595
TEST_CONSTANTS.READ_PDF_FILE,
596+
FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
574597
MimeType.PDF,
575598
),
576599
(
577600
file_storage(provider=FileStorageProvider.GCS),
578601
TEST_CONSTANTS.READ_TEXT_FILE,
602+
FileOperationParams.READ_ENTIRE_LENGTH,
579603
MimeType.TEXT,
580604
),
581605
(
582606
file_storage(provider=FileStorageProvider.GCS),
583607
TEST_CONSTANTS.READ_PDF_FILE,
608+
FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
584609
MimeType.PDF,
585610
),
586611
(
587612
file_storage(provider=FileStorageProvider.LOCAL),
588613
TEST_CONSTANTS.READ_TEXT_FILE,
614+
50,
589615
MimeType.TEXT,
590616
),
591617
(
592618
file_storage(provider=FileStorageProvider.MINIO),
593619
TEST_CONSTANTS.READ_PDF_FILE,
620+
FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
594621
MimeType.PDF,
595622
),
596623
(
597624
file_storage(provider=FileStorageProvider.MINIO),
598625
TEST_CONSTANTS.READ_TEXT_FILE,
626+
FileOperationParams.READ_ENTIRE_LENGTH,
599627
MimeType.TEXT,
600628
),
601629
(
602630
file_storage(provider=FileStorageProvider.MINIO),
603631
TEST_CONSTANTS.READ_PDF_FILE,
632+
FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
604633
MimeType.PDF,
605634
),
606635
],
607636
)
608-
def test_file_mime_type(file_storage, path, expected_mime_type):
637+
def test_file_mime_type(file_storage, path, read_length, expected_mime_type):
609638
mime_type = file_storage.mime_type(path=path)
610639
file_storage.mkdir(path=TEST_CONSTANTS.READ_FOLDER_PATH)
611640
assert mime_type == expected_mime_type
641+
mime_type_read_length = file_storage.mime_type(path=path, read_length=read_length)
642+
file_storage.mkdir(path=TEST_CONSTANTS.READ_FOLDER_PATH)
643+
assert mime_type_read_length == expected_mime_type
612644

613645

614646
@pytest.mark.parametrize(

tests/test_fs_permanent.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,7 @@ def permanent_file_storage(provider: FileStorageProvider):
3333
creds = json.loads(os.environ.get(TEST_CONSTANTS.FILE_STORAGE_LOCAL, "{}"))
3434
except JSONDecodeError:
3535
creds = {}
36-
file_storage = PermanentFileStorage(
37-
provider=provider, legacy_storage_path="./prompt_studio_data", **creds
38-
)
36+
file_storage = PermanentFileStorage(provider=provider, **creds)
3937
assert file_storage is not None
4038
return file_storage
4139

@@ -53,13 +51,11 @@ def permanent_file_storage(provider: FileStorageProvider):
5351
def test_permanent_fs_copy_on_read(file_storage, file_read_path, read_mode):
5452
if file_storage.exists(file_read_path):
5553
file_storage.rm(file_read_path)
56-
file_read_contents = file_storage.read(
57-
file_read_path,
58-
read_mode,
59-
)
60-
print(file_read_contents)
61-
# File in the path does not exist. So no contents can be read
62-
assert file_read_contents is None
54+
with pytest.raises(FileNotFoundError):
55+
file_storage.read(
56+
file_read_path,
57+
read_mode,
58+
)
6359

6460

6561
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)