Skip to content

Commit c91398b

Browse files
authored
feat/use uuid for s3 identifiers (#8)
* use uuid for s3 identifiers * Fix mapping between indexer and downloader
1 parent 71d226f commit c91398b

File tree

5 files changed

+19
-7
lines changed

5 files changed

+19
-7
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.0.2-dev0
2+
3+
### Enhancements
4+
5+
* **Use UUID for S3 identifiers** Update the unique ID to use a UUID derived from the file path rather than the file path itself.
6+
17
## 0.0.1
28

39
### Enhancements

scripts/version-sync.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ fi
106106
declare FAILED_CHECK=0
107107

108108
git fetch origin main
109-
MAIN_VERSION=$(git show origin/main:unstructured/__version__.py | grep -o -m 1 -E "${RE_SEMVER_FULL}")
109+
MAIN_VERSION=$(git show origin/main:unstructured_ingest/__version__.py | grep -o -m 1 -E "${RE_SEMVER_FULL}")
110110
MAIN_IS_RELEASE=false
111111
[[ $MAIN_VERSION != *"-dev"* ]] && MAIN_IS_RELEASE=true
112112
CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD)

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.1" # pragma: no cover
1+
__version__ = "0.0.2-dev0" # pragma: no cover

unstructured_ingest/v2/examples/example_s3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
if __name__ == "__main__":
2525
logger.info(f"Writing all content in: {work_dir.resolve()}")
2626
Pipeline.from_configs(
27-
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
27+
context=ProcessorConfig(work_dir=str(work_dir.resolve()), verbose=True),
2828
indexer_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"),
2929
downloader_config=S3DownloaderConfig(download_dir=download_path),
3030
source_connection_config=S3ConnectionConfig(anonymous=True),

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pathlib import Path
88
from time import time
99
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
10+
from uuid import NAMESPACE_DNS, uuid5
1011

1112
from unstructured.documents.elements import DataSourceMetadata
1213

@@ -210,16 +211,19 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
210211
# Note: we remove any remaining leading slashes (Box introduces these)
211212
# to get a valid relative path
212213
rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
214+
215+
additional_metadata = self.sterilize_info(path=file)
216+
additional_metadata["original_file_path"] = file
213217
yield FileData(
214-
identifier=file,
218+
identifier=str(uuid5(NAMESPACE_DNS, file)),
215219
connector_type=self.connector_type,
216220
source_identifiers=SourceIdentifiers(
217221
filename=Path(file).name,
218222
rel_path=rel_path or None,
219223
fullpath=file,
220224
),
221225
metadata=self.get_metadata(path=file),
222-
additional_metadata=self.sterilize_info(path=file),
226+
additional_metadata=additional_metadata,
223227
)
224228

225229

@@ -262,7 +266,8 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
262266
download_path = self.get_download_path(file_data=file_data)
263267
download_path.parent.mkdir(parents=True, exist_ok=True)
264268
try:
265-
self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
269+
rpath = file_data.additional_metadata["original_file_path"]
270+
self.fs.get(rpath=rpath, lpath=download_path.as_posix())
266271
except Exception as e:
267272
logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
268273
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -272,7 +277,8 @@ async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadRespons
272277
download_path = self.get_download_path(file_data=file_data)
273278
download_path.parent.mkdir(parents=True, exist_ok=True)
274279
try:
275-
await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
280+
rpath = file_data.additional_metadata["original_file_path"]
281+
await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
276282
except Exception as e:
277283
logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
278284
raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

0 commit comments

Comments
 (0)