Skip to content

Commit 2989a45

Browse files
authored
feat/explicit opensearch classes (#14)
* Leverage explicit config classes for opensearch connectors * Add changelog entry * prep for new release * tidy * remove unused check_connection metho * add in missing fsspec destination precheck * add changelog entry * Add requires_dependencies to prechecks in fsspec connectors
1 parent 0bb38d1 commit 2989a45

File tree

11 files changed

+103
-12
lines changed

11 files changed

+103
-12
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
## 0.0.3-dev0
1+
## 0.0.3
22

33
### Enhancements
44

55
* **Improve documentation** Update the README's.
6+
* **Explicit Opensearch classes** For the connector registry entries for opensearch, use only opensearch specific classes rather than any elasticsearch ones.
7+
* **Add missing fsspec destination precheck** check connection in precheck for all fsspec-based destination connectors
68

79
## 0.0.2
810

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.3-dev0" # pragma: no cover
1+
__version__ = "0.0.3" # pragma: no cover

unstructured_ingest/v2/interfaces/process.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,3 @@ def run(self, **kwargs: Any) -> Any:
1717

1818
async def run_async(self, **kwargs: Any) -> Any:
1919
return self.run(**kwargs)
20-
21-
def check_connection(self):
22-
# If the process requires external connections, run a quick check
23-
pass

unstructured_ingest/v2/processes/connectors/fsspec/azure.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ class AzureIndexer(FsspecIndexer):
7575
index_config: AzureIndexerConfig
7676
connector_type: str = CONNECTOR_TYPE
7777

78+
@requires_dependencies(["adlfs", "fsspec"], extras="azure")
79+
def precheck(self) -> None:
80+
super().precheck()
81+
7882
def sterilize_info(self, path) -> dict:
7983
info = self.fs.info(path=path)
8084
return sterilize_dict(data=info, default=azure_json_serial)
@@ -120,6 +124,10 @@ class AzureUploader(FsspecUploader):
120124
def __post_init__(self):
121125
super().__post_init__()
122126

127+
@requires_dependencies(["adlfs", "fsspec"], extras="azure")
128+
def precheck(self) -> None:
129+
super().precheck()
130+
123131
@requires_dependencies(["adlfs", "fsspec"], extras="azure")
124132
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
125133
return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/fsspec/box.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ class BoxIndexer(FsspecIndexer):
7070
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
7171
return super().run(**kwargs)
7272

73+
@requires_dependencies(["boxfs"], extras="box")
74+
def precheck(self) -> None:
75+
super().precheck()
76+
7377

7478
@dataclass
7579
class BoxDownloaderConfig(FsspecDownloaderConfig):
@@ -107,6 +111,10 @@ class BoxUploader(FsspecUploader):
107111
def __post_init__(self):
108112
super().__post_init__()
109113

114+
@requires_dependencies(["boxfs"], extras="box")
115+
def precheck(self) -> None:
116+
super().precheck()
117+
110118
@requires_dependencies(["boxfs"], extras="box")
111119
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
112120
return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ def __post_init__(self):
5757
if not self.index_config.path_without_protocol.startswith("/"):
5858
self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
5959

60+
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
61+
def precheck(self) -> None:
62+
super().precheck()
63+
6064
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
6165
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
6266
return super().run(**kwargs)
@@ -106,6 +110,10 @@ class DropboxUploader(FsspecUploader):
106110
def __post_init__(self):
107111
super().__post_init__()
108112

113+
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
114+
def precheck(self) -> None:
115+
super().precheck()
116+
109117
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
110118
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
111119
return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
from uuid import NAMESPACE_DNS, uuid5
1010

1111
from unstructured_ingest.enhanced_dataclass import enhanced_field
12-
from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
12+
from unstructured_ingest.error import (
13+
DestinationConnectionError,
14+
SourceConnectionError,
15+
SourceConnectionNetworkError,
16+
)
1317
from unstructured_ingest.v2.interfaces import (
1418
AccessConfig,
1519
ConnectionConfig,
@@ -300,6 +304,19 @@ def __post_init__(self):
300304
f"missing 1 required positional argument: 'upload_config'"
301305
)
302306

307+
def precheck(self) -> None:
308+
from fsspec import get_filesystem_class
309+
310+
try:
311+
fs = get_filesystem_class(self.upload_config.protocol)(
312+
**self.connection_config.get_access_config(),
313+
)
314+
root_dir = self.upload_config.path_without_protocol.split("/")[0]
315+
fs.ls(path=root_dir, detail=False)
316+
except Exception as e:
317+
logger.error(f"failed to validate connection: {e}", exc_info=True)
318+
raise DestinationConnectionError(f"failed to validate connection: {e}")
319+
303320
def get_upload_path(self, file_data: FileData) -> Path:
304321
upload_path = (
305322
Path(self.upload_config.path_without_protocol)

unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ class GcsIndexer(FsspecIndexer):
8080
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
8181
return super().run(**kwargs)
8282

83+
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
84+
def precheck(self) -> None:
85+
super().precheck()
86+
8387

8488
@dataclass
8589
class GcsDownloaderConfig(FsspecDownloaderConfig):
@@ -117,6 +121,10 @@ class GcsUploader(FsspecUploader):
117121
def __post_init__(self):
118122
super().__post_init__()
119123

124+
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
125+
def precheck(self) -> None:
126+
super().precheck()
127+
120128
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
121129
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
122130
return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/fsspec/s3.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ def get_metadata(self, path: str) -> FileDataSourceMetadata:
111111
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
112112
return super().run(**kwargs)
113113

114+
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
115+
def precheck(self) -> None:
116+
super().precheck()
117+
114118

115119
@dataclass
116120
class S3DownloaderConfig(FsspecDownloaderConfig):
@@ -144,6 +148,10 @@ class S3Uploader(FsspecUploader):
144148
connection_config: S3ConnectionConfig
145149
upload_config: S3UploaderConfig = field(default=None)
146150

151+
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
152+
def precheck(self) -> None:
153+
super().precheck()
154+
147155
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
148156
def __post_init__(self):
149157
super().__post_init__()

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
9191
file.identifier = new_identifier
9292
yield file
9393

94+
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
95+
def precheck(self) -> None:
96+
super().precheck()
97+
9498

9599
@dataclass
96100
class SftpDownloaderConfig(FsspecDownloaderConfig):
@@ -142,6 +146,10 @@ class SftpUploader(FsspecUploader):
142146
def __post_init__(self):
143147
super().__post_init__()
144148

149+
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
150+
def precheck(self) -> None:
151+
super().precheck()
152+
145153
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
146154
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
147155
return super().run(contents=contents, **kwargs)

0 commit comments

Comments
 (0)