From 227b5af65f3dcb4018045257af27b881ae7617c5 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Mon, 31 Mar 2025 23:50:01 -0700 Subject: [PATCH 1/3] make sure the same sequence always produces the same zipfile --- mapillary_tools/uploader.py | 39 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/mapillary_tools/uploader.py b/mapillary_tools/uploader.py index ede0ae713..8e34dee3c 100644 --- a/mapillary_tools/uploader.py +++ b/mapillary_tools/uploader.py @@ -158,19 +158,23 @@ def zip_images( zip_filename = zip_dir.joinpath(filename) with wip_file_context(wip_zip_filename, zip_filename) as wip_path: with wip_path.open("wb") as wip_fp: - actual_md5sum = cls.zip_sequence_fp(sequence, wip_fp) + actual_md5sum = cls.zip_sequence_deterministically(sequence, wip_fp) assert actual_md5sum == upload_md5sum, "md5sum mismatch" @classmethod - def zip_sequence_fp( + def zip_sequence_deterministically( cls, sequence: T.Sequence[types.ImageMetadata], zip_fp: T.IO[bytes], ) -> str: """ - Write a sequence of ImageMetadata into the zipfile handle. + Write a sequence of ImageMetadata into the zipfile handle. It should guarantee + that the same sequence always produces the same zipfile, because the + sequence md5sum will be used to upload the zipfile or resume the upload. + The sequence has to be one sequence and sorted. """ + sequence_groups = types.group_and_sort_images(sequence) assert len(sequence_groups) == 1, ( f"Only one sequence is allowed but got {len(sequence_groups)}: {list(sequence_groups.keys())}" @@ -179,9 +183,8 @@ def zip_sequence_fp( upload_md5sum = types.update_sequence_md5sum(sequence) with zipfile.ZipFile(zip_fp, "w", zipfile.ZIP_DEFLATED) as zipf: - arcnames: set[str] = set() for metadata in sequence: - cls._write_imagebytes_in_zip(zipf, metadata, arcnames=arcnames) + cls._write_imagebytes_in_zip(zipf, metadata) assert len(sequence) == len(set(zipf.namelist())) zipf.comment = json.dumps({"upload_md5sum": upload_md5sum}).encode("utf-8") @@ -210,27 +213,12 @@ def extract_upload_md5sum(cls, zip_fp: T.IO[bytes]) -> str: return upload_md5sum - @classmethod - def _uniq_arcname(cls, filename: Path, arcnames: set[str]): - arcname: str = filename.name - - # make sure the arcname is unique, otherwise zipfile.extractAll will eliminate duplicated ones - arcname_idx = 0 - while arcname in arcnames: - arcname_idx += 1 - arcname = f"{filename.stem}_{arcname_idx}{filename.suffix}" - - return arcname - @classmethod def _write_imagebytes_in_zip( - cls, - zipf: zipfile.ZipFile, - metadata: types.ImageMetadata, - arcnames: set[str] | None = None, + cls, zipf: zipfile.ZipFile, metadata: types.ImageMetadata ): - if arcnames is None: - arcnames = set() + assert metadata.md5sum, "md5sum is not set" + arcname = metadata.md5sum try: edit = exif_write.ExifEdit(metadata.filename) @@ -249,9 +237,6 @@ def _write_imagebytes_in_zip( f"Failed to dump EXIF bytes: {ex}", metadata.filename ) from ex - arcname = cls._uniq_arcname(metadata.filename, arcnames) - arcnames.add(arcname) - zipinfo = zipfile.ZipInfo(arcname, date_time=(1980, 1, 1, 0, 0, 0)) zipf.writestr(zipinfo, image_bytes) @@ -319,7 +304,7 @@ def prepare_images_and_upload( with tempfile.NamedTemporaryFile() as fp: try: - upload_md5sum = cls.zip_sequence_fp(sequence, fp) + upload_md5sum = cls.zip_sequence_deterministically(sequence, fp) except Exception as ex: yield sequence_uuid, UploadResult(error=ex) continue From 7d9ab4f32fe8dd34a6cd50d957baf2e4c9614877 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Tue, 1 Apr 2025 00:09:41 -0700 Subject: [PATCH 2/3] fix tests --- tests/integration/test_process_and_upload.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_process_and_upload.py b/tests/integration/test_process_and_upload.py index 1e8b97bd0..539be752f 100644 --- a/tests/integration/test_process_and_upload.py +++ b/tests/integration/test_process_and_upload.py @@ -22,7 +22,8 @@ EXPECTED_DESCS = { "image": { - "DSC00001.JPG": { + "1daf0afb27a5a852693006ef98040dba": { + # DSC00001.JPG "MAPAltitude": 70.3, "MAPCaptureTime": "2018_06_08_20_24_11_000", "MAPCompassHeading": {"MagneticHeading": 270.89, "TrueHeading": 270.89}, @@ -33,7 +34,8 @@ "MAPOrientation": 1, "filetype": "image", }, - "DSC00497.JPG": { + "ad0d5264fc195a0f93f275895dc88fb2": { + # DSC00497.JPG "MAPAltitude": 77.5, "MAPCaptureTime": "2018_06_08_20_32_28_000", "MAPCompassHeading": {"MagneticHeading": 271.27, "TrueHeading": 271.27}, @@ -44,7 +46,8 @@ "MAPOrientation": 1, "filetype": "image", }, - "V0370574.JPG": { + "e48c08e5df11c52270ef2568dbf493d6": { + # V0370574.JPG # convert DateTimeOriginal "2018:07:27 11:32:14" in local time to UTC "MAPCaptureTime": datetime.datetime.fromisoformat("2018-07-27T11:32:14") .astimezone(datetime.timezone.utc) @@ -218,7 +221,8 @@ def test_video_process_and_upload( assert x.returncode == 0, x.stderr assert 1 == len(setup_upload.listdir()) expected = { - "sample-5s_NA_000001.jpg": { + "11db367c55e8e8b751daef7a42632a59": { + # sample-5s_NA_000001.jpg "MAPAltitude": 94.75, "MAPCaptureTime": "2025_03_14_07_00_00_000", "MAPCompassHeading": { @@ -230,7 +234,8 @@ def test_video_process_and_upload( "MAPOrientation": 1, "filetype": "image", }, - "sample-5s_NA_000002.jpg": { + "d73aa1576a2dd75d8f17b2f6f2bba0fb": { + # sample-5s_NA_000002.jpg "MAPAltitude": 93.347, "MAPCaptureTime": "2025_03_14_07_00_02_000", "MAPCompassHeading": { @@ -242,7 +247,8 @@ def test_video_process_and_upload( "MAPOrientation": 1, "filetype": "image", }, - "sample-5s_NA_000003.jpg": { + "e89f809c1b51ef5a0a8a496c1030d875": { + # sample-5s_NA_000003.jpg "MAPAltitude": 92.492, "MAPCaptureTime": "2025_03_14_07_00_04_000", "MAPCompassHeading": { From a9ae7776cee4df2b2e069558f17dc9fed60053d0 Mon Sep 17 00:00:00 2001 From: Tao Peng Date: Tue, 1 Apr 2025 01:20:27 -0700 Subject: [PATCH 3/3] use idx.jpg as arcname --- mapillary_tools/uploader.py | 12 ++++---- tests/integration/test_process_and_upload.py | 29 ++++++++++---------- tests/unit/test_uploader.py | 6 ++-- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/mapillary_tools/uploader.py b/mapillary_tools/uploader.py index 8e34dee3c..76146673f 100644 --- a/mapillary_tools/uploader.py +++ b/mapillary_tools/uploader.py @@ -183,8 +183,11 @@ def zip_sequence_deterministically( upload_md5sum = types.update_sequence_md5sum(sequence) with zipfile.ZipFile(zip_fp, "w", zipfile.ZIP_DEFLATED) as zipf: - for metadata in sequence: - cls._write_imagebytes_in_zip(zipf, metadata) + for idx, metadata in enumerate(sequence): + # Use {idx}.jpg (suffix does not matter) as the archive name to ensure the + # resulting zipfile is deterministic. This determinism is based on the upload_md5sum, + # which is derived from a list of image md5sums + cls._write_imagebytes_in_zip(zipf, metadata, arcname=f"{idx}.jpg") assert len(sequence) == len(set(zipf.namelist())) zipf.comment = json.dumps({"upload_md5sum": upload_md5sum}).encode("utf-8") @@ -215,11 +218,8 @@ def extract_upload_md5sum(cls, zip_fp: T.IO[bytes]) -> str: @classmethod def _write_imagebytes_in_zip( - cls, zipf: zipfile.ZipFile, metadata: types.ImageMetadata + cls, zipf: zipfile.ZipFile, metadata: types.ImageMetadata, arcname: str ): - assert metadata.md5sum, "md5sum is not set" - arcname = metadata.md5sum - try: edit = exif_write.ExifEdit(metadata.filename) except struct.error as ex: diff --git a/tests/integration/test_process_and_upload.py b/tests/integration/test_process_and_upload.py index 539be752f..dd4f4c03e 100644 --- a/tests/integration/test_process_and_upload.py +++ b/tests/integration/test_process_and_upload.py @@ -22,8 +22,7 @@ EXPECTED_DESCS = { "image": { - "1daf0afb27a5a852693006ef98040dba": { - # DSC00001.JPG + "DSC00001.JPG": { "MAPAltitude": 70.3, "MAPCaptureTime": "2018_06_08_20_24_11_000", "MAPCompassHeading": {"MagneticHeading": 270.89, "TrueHeading": 270.89}, @@ -34,8 +33,7 @@ "MAPOrientation": 1, "filetype": "image", }, - "ad0d5264fc195a0f93f275895dc88fb2": { - # DSC00497.JPG + "DSC00497.JPG": { "MAPAltitude": 77.5, "MAPCaptureTime": "2018_06_08_20_32_28_000", "MAPCompassHeading": {"MagneticHeading": 271.27, "TrueHeading": 271.27}, @@ -46,8 +44,7 @@ "MAPOrientation": 1, "filetype": "image", }, - "e48c08e5df11c52270ef2568dbf493d6": { - # V0370574.JPG + "V0370574.JPG": { # convert DateTimeOriginal "2018:07:27 11:32:14" in local time to UTC "MAPCaptureTime": datetime.datetime.fromisoformat("2018-07-27T11:32:14") .astimezone(datetime.timezone.utc) @@ -142,11 +139,18 @@ def _validate_uploads(upload_dir: py.path.local, expected): else: raise Exception(f"invalid file {file}") - excludes = ["filename", "filesize", "md5sum", "MAPMetaTags", "MAPSequenceUUID"] + excludes = [ + "filename", + "filesize", + "md5sum", + "MAPMetaTags", + "MAPSequenceUUID", + "MAPFilename", + ] actual = {} for desc in descs: - actual[os.path.basename(desc["filename"])] = { + actual[os.path.basename(desc["MAPFilename"])] = { k: v for k, v in desc.items() if k not in excludes } @@ -221,8 +225,7 @@ def test_video_process_and_upload( assert x.returncode == 0, x.stderr assert 1 == len(setup_upload.listdir()) expected = { - "11db367c55e8e8b751daef7a42632a59": { - # sample-5s_NA_000001.jpg + "sample-5s_NA_000001.jpg": { "MAPAltitude": 94.75, "MAPCaptureTime": "2025_03_14_07_00_00_000", "MAPCompassHeading": { @@ -234,8 +237,7 @@ def test_video_process_and_upload( "MAPOrientation": 1, "filetype": "image", }, - "d73aa1576a2dd75d8f17b2f6f2bba0fb": { - # sample-5s_NA_000002.jpg + "sample-5s_NA_000002.jpg": { "MAPAltitude": 93.347, "MAPCaptureTime": "2025_03_14_07_00_02_000", "MAPCompassHeading": { @@ -247,8 +249,7 @@ def test_video_process_and_upload( "MAPOrientation": 1, "filetype": "image", }, - "e89f809c1b51ef5a0a8a496c1030d875": { - # sample-5s_NA_000003.jpg + "sample-5s_NA_000003.jpg": { "MAPAltitude": 92.492, "MAPCaptureTime": "2025_03_14_07_00_04_000", "MAPCompassHeading": { diff --git a/tests/unit/test_uploader.py b/tests/unit/test_uploader.py index ae860d902..84a0ec91c 100644 --- a/tests/unit/test_uploader.py +++ b/tests/unit/test_uploader.py @@ -146,7 +146,7 @@ def test_upload_zip( "MAPLongitude": 16.1840944, "MAPCaptureTime": "2021_02_13_13_24_41_140", "filename": str(test_exif), - "md5sum": None, + "md5sum": "1", "filetype": "image", "MAPSequenceUUID": "sequence_1", }, @@ -155,7 +155,7 @@ def test_upload_zip( "MAPLongitude": 16.1840944, "MAPCaptureTime": "2021_02_13_13_24_41_140", "filename": str(test_exif2), - "md5sum": None, + "md5sum": "2", "filetype": "image", "MAPSequenceUUID": "sequence_1", }, @@ -164,7 +164,7 @@ def test_upload_zip( "MAPLongitude": 16.1840944, "MAPCaptureTime": "2021_02_13_13_25_41_140", "filename": str(test_exif), - "md5sum": None, + "md5sum": "3", "filetype": "image", "MAPSequenceUUID": "sequence_2", },