diff --git a/dvuploader/file.py b/dvuploader/file.py index a43edb0..dd8b1d0 100644 --- a/dvuploader/file.py +++ b/dvuploader/file.py @@ -29,6 +29,7 @@ class File(BaseModel): Private Attributes: _size (int): Size of the file in bytes. + _is_inside_zip (bool): Indicates if the file is packaged inside a zip archive. Methods: extract_file_name(): Extracts filename from filepath and initializes file handler. @@ -57,6 +58,7 @@ class File(BaseModel): tab_ingest: bool = Field(default=True, alias="tabIngest") _size: int = PrivateAttr(default=0) + _is_inside_zip: bool = PrivateAttr(default=False) def extract_file_name(self): """ diff --git a/dvuploader/nativeupload.py b/dvuploader/nativeupload.py index 7296c86..f760f96 100644 --- a/dvuploader/nativeupload.py +++ b/dvuploader/nativeupload.py @@ -345,8 +345,8 @@ async def _update_metadata( try: if _tab_extension(dv_path) in file_mapping: file_id = file_mapping[_tab_extension(dv_path)] - elif file.file_name and _is_zip(file.file_name): - # When the file is a zip it will be unpacked and thus + elif file.file_name and _is_zip(file.file_name) and not file._is_inside_zip: + # When the file is a zip package it will be unpacked and thus # the expected file name of the zip will not be in the # dataset, since it has been unpacked. continue diff --git a/dvuploader/packaging.py b/dvuploader/packaging.py index c99d4d1..9567fd0 100644 --- a/dvuploader/packaging.py +++ b/dvuploader/packaging.py @@ -98,6 +98,7 @@ def zip_files( data=file.handler.read(), # type: ignore zinfo_or_arcname=_create_arcname(file), ) + file._is_inside_zip = True return path diff --git a/tests/integration/test_native_upload.py b/tests/integration/test_native_upload.py index ae80990..fbef99f 100644 --- a/tests/integration/test_native_upload.py +++ b/tests/integration/test_native_upload.py @@ -157,6 +157,7 @@ def test_native_upload_with_proxy( assert len(files) == 3 assert sorted([file["label"] for file in files]) == sorted(expected_files) + @pytest.mark.xfail(reason="See discussion in #34") def test_native_upload_by_handler( self, credentials, @@ -464,6 +465,76 @@ def test_zipzip_file_upload( assert sorted([file["label"] for file in files]) == sorted(expected_files) + def test_metadata_with_zip_files_in_package(self, credentials): + BASE_URL, API_TOKEN = credentials + + # Create Dataset + pid = create_dataset( + parent="Root", + server_url=BASE_URL, + api_token=API_TOKEN, + ) + + # Arrange + files = [ + File(filepath="tests/fixtures/archive.zip", + dv_dir="subdir2", + description="This file should not be unzipped", + categories=["Test file"] + ), + File(filepath="tests/fixtures/add_dir_files/somefile.txt", + dv_dir="subdir", + description="A simple text file", + categories=["Test file"] + ), + ] + + # Act + uploader = DVUploader(files=files) + uploader.upload( + persistent_id=pid, + api_token=API_TOKEN, + dataverse_url=BASE_URL, + n_parallel_uploads=10, + ) + + # Assert + files = retrieve_dataset_files( + dataverse_url=BASE_URL, + persistent_id=pid, + api_token=API_TOKEN, + ) + + assert len(files) == 2, f"Expected 2 files, got {len(files)}" + + expected_files = [ + { + "label": "archive.zip", + "description": "This file should not be unzipped", + "categories": ["Test file"] + }, + { + "label": "somefile.txt", + "description": "A simple text file", + "categories": ["Test file"] + }, + ] + + files_as_expected = sorted( + [ + { + k: (f[k] if k in f else None) + for k in expected_files[0].keys() + } + for f in files + ], + key=lambda x: x["label"] + ) + assert files_as_expected == expected_files, ( + f"File metadata not as expected: {json.dumps(files, indent=2)}" + ) + + def test_too_many_zip_files( self, credentials,