diff --git a/mapillary_tools/uploader.py b/mapillary_tools/uploader.py index ede0ae713..76146673f 100644 --- a/mapillary_tools/uploader.py +++ b/mapillary_tools/uploader.py @@ -158,19 +158,23 @@ def zip_images( zip_filename = zip_dir.joinpath(filename) with wip_file_context(wip_zip_filename, zip_filename) as wip_path: with wip_path.open("wb") as wip_fp: - actual_md5sum = cls.zip_sequence_fp(sequence, wip_fp) + actual_md5sum = cls.zip_sequence_deterministically(sequence, wip_fp) assert actual_md5sum == upload_md5sum, "md5sum mismatch" @classmethod - def zip_sequence_fp( + def zip_sequence_deterministically( cls, sequence: T.Sequence[types.ImageMetadata], zip_fp: T.IO[bytes], ) -> str: """ - Write a sequence of ImageMetadata into the zipfile handle. + Write a sequence of ImageMetadata into the zipfile handle. It should guarantee + that the same sequence always produces the same zipfile, because the + sequence md5sum will be used to upload the zipfile or resume the upload. + The sequence has to be one sequence and sorted. """ + sequence_groups = types.group_and_sort_images(sequence) assert len(sequence_groups) == 1, ( f"Only one sequence is allowed but got {len(sequence_groups)}: {list(sequence_groups.keys())}" @@ -179,9 +183,11 @@ def zip_sequence_fp( upload_md5sum = types.update_sequence_md5sum(sequence) with zipfile.ZipFile(zip_fp, "w", zipfile.ZIP_DEFLATED) as zipf: - arcnames: set[str] = set() - for metadata in sequence: - cls._write_imagebytes_in_zip(zipf, metadata, arcnames=arcnames) + for idx, metadata in enumerate(sequence): + # Use {idx}.jpg (suffix does not matter) as the archive name to ensure the + # resulting zipfile is deterministic. This determinism is based on the upload_md5sum, + # which is derived from a list of image md5sums + cls._write_imagebytes_in_zip(zipf, metadata, arcname=f"{idx}.jpg") assert len(sequence) == len(set(zipf.namelist())) zipf.comment = json.dumps({"upload_md5sum": upload_md5sum}).encode("utf-8") @@ -210,28 +216,10 @@ def extract_upload_md5sum(cls, zip_fp: T.IO[bytes]) -> str: return upload_md5sum - @classmethod - def _uniq_arcname(cls, filename: Path, arcnames: set[str]): - arcname: str = filename.name - - # make sure the arcname is unique, otherwise zipfile.extractAll will eliminate duplicated ones - arcname_idx = 0 - while arcname in arcnames: - arcname_idx += 1 - arcname = f"{filename.stem}_{arcname_idx}{filename.suffix}" - - return arcname - @classmethod def _write_imagebytes_in_zip( - cls, - zipf: zipfile.ZipFile, - metadata: types.ImageMetadata, - arcnames: set[str] | None = None, + cls, zipf: zipfile.ZipFile, metadata: types.ImageMetadata, arcname: str ): - if arcnames is None: - arcnames = set() - try: edit = exif_write.ExifEdit(metadata.filename) except struct.error as ex: @@ -249,9 +237,6 @@ def _write_imagebytes_in_zip( f"Failed to dump EXIF bytes: {ex}", metadata.filename ) from ex - arcname = cls._uniq_arcname(metadata.filename, arcnames) - arcnames.add(arcname) - zipinfo = zipfile.ZipInfo(arcname, date_time=(1980, 1, 1, 0, 0, 0)) zipf.writestr(zipinfo, image_bytes) @@ -319,7 +304,7 @@ def prepare_images_and_upload( with tempfile.NamedTemporaryFile() as fp: try: - upload_md5sum = cls.zip_sequence_fp(sequence, fp) + upload_md5sum = cls.zip_sequence_deterministically(sequence, fp) except Exception as ex: yield sequence_uuid, UploadResult(error=ex) continue diff --git a/tests/integration/test_process_and_upload.py b/tests/integration/test_process_and_upload.py index c9aa56036..dd4f4c03e 100644 --- a/tests/integration/test_process_and_upload.py +++ b/tests/integration/test_process_and_upload.py @@ -150,7 +150,7 @@ def _validate_uploads(upload_dir: py.path.local, expected): actual = {} for desc in descs: - actual[os.path.basename(desc["filename"])] = { + actual[os.path.basename(desc["MAPFilename"])] = { k: v for k, v in desc.items() if k not in excludes } diff --git a/tests/unit/test_uploader.py b/tests/unit/test_uploader.py index ae860d902..84a0ec91c 100644 --- a/tests/unit/test_uploader.py +++ b/tests/unit/test_uploader.py @@ -146,7 +146,7 @@ def test_upload_zip( "MAPLongitude": 16.1840944, "MAPCaptureTime": "2021_02_13_13_24_41_140", "filename": str(test_exif), - "md5sum": None, + "md5sum": "1", "filetype": "image", "MAPSequenceUUID": "sequence_1", }, @@ -155,7 +155,7 @@ def test_upload_zip( "MAPLongitude": 16.1840944, "MAPCaptureTime": "2021_02_13_13_24_41_140", "filename": str(test_exif2), - "md5sum": None, + "md5sum": "2", "filetype": "image", "MAPSequenceUUID": "sequence_1", }, @@ -164,7 +164,7 @@ def test_upload_zip( "MAPLongitude": 16.1840944, "MAPCaptureTime": "2021_02_13_13_25_41_140", "filename": str(test_exif), - "md5sum": None, + "md5sum": "3", "filetype": "image", "MAPSequenceUUID": "sequence_2", },