diff --git a/CHANGELOG.md b/CHANGELOG.md index 29bda1d95..da0034646 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.2.2 + +* **Fix**: prevent S3 path conflicts using tempfile for directory isolation + ## 1.2.1 * **Fix**: Embeddings are properly assigned when embedding in batches diff --git a/test/integration/connectors/expected_results/s3-minio/directory_structure.json b/test/integration/connectors/expected_results/s3-minio/expected_s3_keys.json similarity index 59% rename from test/integration/connectors/expected_results/s3-minio/directory_structure.json rename to test/integration/connectors/expected_results/s3-minio/expected_s3_keys.json index 416044a4d..867581e3f 100644 --- a/test/integration/connectors/expected_results/s3-minio/directory_structure.json +++ b/test/integration/connectors/expected_results/s3-minio/expected_s3_keys.json @@ -1,5 +1,5 @@ { - "directory_structure": [ + "s3_keys": [ "wiki_movie_plots_small.csv" ] } \ No newline at end of file diff --git a/test/integration/connectors/expected_results/s3-specialchar/directory_structure.json b/test/integration/connectors/expected_results/s3-specialchar/expected_s3_keys.json similarity index 68% rename from test/integration/connectors/expected_results/s3-specialchar/directory_structure.json rename to test/integration/connectors/expected_results/s3-specialchar/expected_s3_keys.json index d9deb69e3..ebdee84c3 100644 --- a/test/integration/connectors/expected_results/s3-specialchar/directory_structure.json +++ b/test/integration/connectors/expected_results/s3-specialchar/expected_s3_keys.json @@ -1,5 +1,5 @@ { - "directory_structure": [ + "s3_keys": [ "Why_is_the_sky_blue?.txt", "[test]?*.txt" ] diff --git a/test/integration/connectors/expected_results/s3/directory_structure.json b/test/integration/connectors/expected_results/s3/expected_s3_keys.json similarity index 83% rename from test/integration/connectors/expected_results/s3/directory_structure.json rename to test/integration/connectors/expected_results/s3/expected_s3_keys.json index 50a3f2db0..ab9f86a31 100644 --- a/test/integration/connectors/expected_results/s3/directory_structure.json +++ b/test/integration/connectors/expected_results/s3/expected_s3_keys.json @@ -1,5 +1,5 @@ { - "directory_structure": [ + "s3_keys": [ "2023-Jan-economic-outlook.pdf", "Silent-Giant-(1).pdf", "page-with-formula.pdf", diff --git a/test/integration/connectors/utils/validation/source.py b/test/integration/connectors/utils/validation/source.py index 7c2e7a7e4..e9db9bb04 100644 --- a/test/integration/connectors/utils/validation/source.py +++ b/test/integration/connectors/utils/validation/source.py @@ -167,11 +167,24 @@ def run_expected_download_files_validation( def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]): - directory_record = expected_output_dir / "directory_structure.json" - with directory_record.open("r") as directory_file: - directory_file_contents = json.load(directory_file) - directory_structure = directory_file_contents["directory_structure"] - assert directory_structure == download_files + s3_keys_file = expected_output_dir / "expected_s3_keys.json" + + if s3_keys_file.exists(): + with s3_keys_file.open("r") as f: + s3_keys = json.load(f)["s3_keys"] + + expected_filenames = {Path(s3_key).name for s3_key in s3_keys} + actual_filenames = {Path(download_file).name for download_file in download_files} + + assert expected_filenames == actual_filenames, ( + f"Expected filenames: {sorted(expected_filenames)}, " + f"Got filenames: {sorted(actual_filenames)}" + ) + else: + directory_record = expected_output_dir / "directory_structure.json" + with directory_record.open("r") as f: + directory_structure = json.load(f)["directory_structure"] + assert directory_structure == download_files def update_fixtures( diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index b839bbb3c..4602f33df 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "1.2.1" # pragma: no cover +__version__ = "1.2.2" # pragma: no cover \ No newline at end of file diff --git a/unstructured_ingest/interfaces/downloader.py b/unstructured_ingest/interfaces/downloader.py index d72dfe89f..01f3c4787 100644 --- a/unstructured_ingest/interfaces/downloader.py +++ b/unstructured_ingest/interfaces/downloader.py @@ -36,9 +36,11 @@ class Downloader(BaseProcess, BaseConnector, ABC): def get_download_path(self, file_data: FileData) -> Optional[Path]: if not file_data.source_identifiers: return None + rel_path = file_data.source_identifiers.relative_path if not rel_path: return None + rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path return self.download_dir / Path(rel_path) diff --git a/unstructured_ingest/processes/connectors/fsspec/fsspec.py b/unstructured_ingest/processes/connectors/fsspec/fsspec.py index 38cfe3539..594f0ada1 100644 --- a/unstructured_ingest/processes/connectors/fsspec/fsspec.py +++ b/unstructured_ingest/processes/connectors/fsspec/fsspec.py @@ -264,12 +264,31 @@ class FsspecDownloaderConfig(DownloaderConfig): @dataclass class FsspecDownloader(Downloader): + TEMP_DIR_PREFIX = "unstructured_" + protocol: str connection_config: FsspecConnectionConfigT connector_type: str = CONNECTOR_TYPE download_config: Optional[FsspecDownloaderConfigT] = field( default_factory=lambda: FsspecDownloaderConfig() ) + + def get_download_path(self, file_data: FileData) -> Optional[Path]: + has_source_identifiers = file_data.source_identifiers is not None + has_filename = has_source_identifiers and file_data.source_identifiers.filename + + if not (has_source_identifiers and has_filename): + return None + + filename = file_data.source_identifiers.filename + + mkdir_concurrent_safe(self.download_dir) + + temp_dir = tempfile.mkdtemp( + prefix=self.TEMP_DIR_PREFIX, + dir=self.download_dir + ) + return Path(temp_dir) / filename def is_async(self) -> bool: with self.connection_config.get_client(protocol=self.protocol) as client: