Skip to content

Commit 386c8c5

Browse files
authored
fix: make sure the same sequence always produces the same zipfile (#738)
* make sure the same sequence always produces the same zipfile * fix tests * use idx.jpg as arcname
1 parent 0aa34c8 commit 386c8c5

File tree

3 files changed

+18
-33
lines changed

3 files changed

+18
-33
lines changed

mapillary_tools/uploader.py

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -158,19 +158,23 @@ def zip_images(
158158
zip_filename = zip_dir.joinpath(filename)
159159
with wip_file_context(wip_zip_filename, zip_filename) as wip_path:
160160
with wip_path.open("wb") as wip_fp:
161-
actual_md5sum = cls.zip_sequence_fp(sequence, wip_fp)
161+
actual_md5sum = cls.zip_sequence_deterministically(sequence, wip_fp)
162162
assert actual_md5sum == upload_md5sum, "md5sum mismatch"
163163

164164
@classmethod
165-
def zip_sequence_fp(
165+
def zip_sequence_deterministically(
166166
cls,
167167
sequence: T.Sequence[types.ImageMetadata],
168168
zip_fp: T.IO[bytes],
169169
) -> str:
170170
"""
171-
Write a sequence of ImageMetadata into the zipfile handle.
171+
Write a sequence of ImageMetadata into the zipfile handle. It should guarantee
172+
that the same sequence always produces the same zipfile, because the
173+
sequence md5sum will be used to upload the zipfile or resume the upload.
174+
172175
The sequence has to be one sequence and sorted.
173176
"""
177+
174178
sequence_groups = types.group_and_sort_images(sequence)
175179
assert len(sequence_groups) == 1, (
176180
f"Only one sequence is allowed but got {len(sequence_groups)}: {list(sequence_groups.keys())}"
@@ -179,9 +183,11 @@ def zip_sequence_fp(
179183
upload_md5sum = types.update_sequence_md5sum(sequence)
180184

181185
with zipfile.ZipFile(zip_fp, "w", zipfile.ZIP_DEFLATED) as zipf:
182-
arcnames: set[str] = set()
183-
for metadata in sequence:
184-
cls._write_imagebytes_in_zip(zipf, metadata, arcnames=arcnames)
186+
for idx, metadata in enumerate(sequence):
187+
# Use {idx}.jpg (suffix does not matter) as the archive name to ensure the
188+
# resulting zipfile is deterministic. This determinism is based on the upload_md5sum,
189+
# which is derived from a list of image md5sums
190+
cls._write_imagebytes_in_zip(zipf, metadata, arcname=f"{idx}.jpg")
185191
assert len(sequence) == len(set(zipf.namelist()))
186192
zipf.comment = json.dumps({"upload_md5sum": upload_md5sum}).encode("utf-8")
187193

@@ -210,28 +216,10 @@ def extract_upload_md5sum(cls, zip_fp: T.IO[bytes]) -> str:
210216

211217
return upload_md5sum
212218

213-
@classmethod
214-
def _uniq_arcname(cls, filename: Path, arcnames: set[str]):
215-
arcname: str = filename.name
216-
217-
# make sure the arcname is unique, otherwise zipfile.extractAll will eliminate duplicated ones
218-
arcname_idx = 0
219-
while arcname in arcnames:
220-
arcname_idx += 1
221-
arcname = f"{filename.stem}_{arcname_idx}{filename.suffix}"
222-
223-
return arcname
224-
225219
@classmethod
226220
def _write_imagebytes_in_zip(
227-
cls,
228-
zipf: zipfile.ZipFile,
229-
metadata: types.ImageMetadata,
230-
arcnames: set[str] | None = None,
221+
cls, zipf: zipfile.ZipFile, metadata: types.ImageMetadata, arcname: str
231222
):
232-
if arcnames is None:
233-
arcnames = set()
234-
235223
try:
236224
edit = exif_write.ExifEdit(metadata.filename)
237225
except struct.error as ex:
@@ -249,9 +237,6 @@ def _write_imagebytes_in_zip(
249237
f"Failed to dump EXIF bytes: {ex}", metadata.filename
250238
) from ex
251239

252-
arcname = cls._uniq_arcname(metadata.filename, arcnames)
253-
arcnames.add(arcname)
254-
255240
zipinfo = zipfile.ZipInfo(arcname, date_time=(1980, 1, 1, 0, 0, 0))
256241
zipf.writestr(zipinfo, image_bytes)
257242

@@ -319,7 +304,7 @@ def prepare_images_and_upload(
319304

320305
with tempfile.NamedTemporaryFile() as fp:
321306
try:
322-
upload_md5sum = cls.zip_sequence_fp(sequence, fp)
307+
upload_md5sum = cls.zip_sequence_deterministically(sequence, fp)
323308
except Exception as ex:
324309
yield sequence_uuid, UploadResult(error=ex)
325310
continue

tests/integration/test_process_and_upload.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def _validate_uploads(upload_dir: py.path.local, expected):
150150

151151
actual = {}
152152
for desc in descs:
153-
actual[os.path.basename(desc["filename"])] = {
153+
actual[os.path.basename(desc["MAPFilename"])] = {
154154
k: v for k, v in desc.items() if k not in excludes
155155
}
156156

tests/unit/test_uploader.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def test_upload_zip(
146146
"MAPLongitude": 16.1840944,
147147
"MAPCaptureTime": "2021_02_13_13_24_41_140",
148148
"filename": str(test_exif),
149-
"md5sum": None,
149+
"md5sum": "1",
150150
"filetype": "image",
151151
"MAPSequenceUUID": "sequence_1",
152152
},
@@ -155,7 +155,7 @@ def test_upload_zip(
155155
"MAPLongitude": 16.1840944,
156156
"MAPCaptureTime": "2021_02_13_13_24_41_140",
157157
"filename": str(test_exif2),
158-
"md5sum": None,
158+
"md5sum": "2",
159159
"filetype": "image",
160160
"MAPSequenceUUID": "sequence_1",
161161
},
@@ -164,7 +164,7 @@ def test_upload_zip(
164164
"MAPLongitude": 16.1840944,
165165
"MAPCaptureTime": "2021_02_13_13_25_41_140",
166166
"filename": str(test_exif),
167-
"md5sum": None,
167+
"md5sum": "3",
168168
"filetype": "image",
169169
"MAPSequenceUUID": "sequence_2",
170170
},

0 commit comments

Comments
 (0)