Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 1.1.4

* **Fix**: prevent S3 path conflicts using hash-based directory isolation

## 1.1.3

* **Fix: Remove unnecessary deletion operation in ES connector**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"directory_structure": [
"s3_keys": [
"wiki_movie_plots_small.csv"
]
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"directory_structure": [
"s3_keys": [
"Why_is_the_sky_blue?.txt",
"[test]?*.txt"
]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"directory_structure": [
"s3_keys": [
"2023-Jan-economic-outlook.pdf",
"Silent-Giant-(1).pdf",
"page-with-formula.pdf",
Expand Down
25 changes: 20 additions & 5 deletions test/integration/connectors/utils/validation/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,26 @@ def run_expected_download_files_validation(


def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
directory_record = expected_output_dir / "directory_structure.json"
with directory_record.open("r") as directory_file:
directory_file_contents = json.load(directory_file)
directory_structure = directory_file_contents["directory_structure"]
assert directory_structure == download_files
s3_keys_file = expected_output_dir / "expected_s3_keys.json"

if s3_keys_file.exists():
with s3_keys_file.open("r") as f:
s3_keys = json.load(f)["s3_keys"]

expected_filenames = [Path(s3_key).name for s3_key in s3_keys]
actual_filenames = [Path(download_file).name for download_file in download_files]

expected_filenames.sort()
actual_filenames.sort()
assert expected_filenames == actual_filenames, (
Copy link

@PastelStorm PastelStorm Jul 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not super important here and shouldn't be a blocker but in general I would avoid this pattern in the code.

I did some math and you should get about 10x-12x speedup if you create and compare two sets because TimSort has O(n*log n) complexity. Comparing two sets or two lists is the same O(n).

For 100k files this would be a difference of 3.5kk operations (sorted lists) vs. 300k operations (sets)

expected_filenames = {Path(s3_key).name for s3_key in s3_keys}
actual_filenames = {Path(download_file).name for download_file in download_files}

assert expected_filenames == actual_filenames

f"Expected filenames: {expected_filenames}, "
f"Got filenames: {actual_filenames}"
)
else:
directory_record = expected_output_dir / "directory_structure.json"
with directory_record.open("r") as f:
directory_structure = json.load(f)["directory_structure"]
assert directory_structure == download_files


def update_fixtures(
Expand Down
2 changes: 1 addition & 1 deletion unstructured_ingest/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.1.3" # pragma: no cover
__version__ = "1.1.4" # pragma: no cover
2 changes: 2 additions & 0 deletions unstructured_ingest/interfaces/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ class Downloader(BaseProcess, BaseConnector, ABC):
def get_download_path(self, file_data: FileData) -> Optional[Path]:
if not file_data.source_identifiers:
return None

rel_path = file_data.source_identifiers.relative_path
if not rel_path:
return None

rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
return self.download_dir / Path(rel_path)

Expand Down
16 changes: 16 additions & 0 deletions unstructured_ingest/processes/connectors/fsspec/fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,22 @@ class FsspecDownloader(Downloader):
download_config: Optional[FsspecDownloaderConfigT] = field(
default_factory=lambda: FsspecDownloaderConfig()
)

def get_download_path(self, file_data: FileData) -> Optional[Path]:
if not file_data.source_identifiers:
return None

filename = file_data.source_identifiers.filename
if not filename:
return None

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

define both booleans as variables, join them with an and and return None once


mkdir_concurrent_safe(self.download_dir)

temp_dir = tempfile.mkdtemp(
prefix="unstructured_",

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd make this a class-level constant

dir=self.download_dir
)
return Path(temp_dir) / filename

def is_async(self) -> bool:
with self.connection_config.get_client(protocol=self.protocol) as client:
Expand Down
2 changes: 1 addition & 1 deletion unstructured_ingest/utils/filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ def mkdir_concurrent_safe(path: Path) -> None:
path.mkdir(parents=True, exist_ok=True)
except FileExistsError:
if not (path.exists() and path.is_dir()):
raise
raise