Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 14 additions & 29 deletions mapillary_tools/uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,19 +158,23 @@ def zip_images(
zip_filename = zip_dir.joinpath(filename)
with wip_file_context(wip_zip_filename, zip_filename) as wip_path:
with wip_path.open("wb") as wip_fp:
actual_md5sum = cls.zip_sequence_fp(sequence, wip_fp)
actual_md5sum = cls.zip_sequence_deterministically(sequence, wip_fp)
assert actual_md5sum == upload_md5sum, "md5sum mismatch"

@classmethod
def zip_sequence_fp(
def zip_sequence_deterministically(
cls,
sequence: T.Sequence[types.ImageMetadata],
zip_fp: T.IO[bytes],
) -> str:
"""
Write a sequence of ImageMetadata into the zipfile handle.
Write a sequence of ImageMetadata into the zipfile handle. It should guarantee
that the same sequence always produces the same zipfile, because the
sequence md5sum will be used to upload the zipfile or resume the upload.

The sequence has to be one sequence and sorted.
"""

sequence_groups = types.group_and_sort_images(sequence)
assert len(sequence_groups) == 1, (
f"Only one sequence is allowed but got {len(sequence_groups)}: {list(sequence_groups.keys())}"
Expand All @@ -179,9 +183,11 @@ def zip_sequence_fp(
upload_md5sum = types.update_sequence_md5sum(sequence)

with zipfile.ZipFile(zip_fp, "w", zipfile.ZIP_DEFLATED) as zipf:
arcnames: set[str] = set()
for metadata in sequence:
cls._write_imagebytes_in_zip(zipf, metadata, arcnames=arcnames)
for idx, metadata in enumerate(sequence):
# Use {idx}.jpg (suffix does not matter) as the archive name to ensure the
# resulting zipfile is deterministic. This determinism is based on the upload_md5sum,
# which is derived from a list of image md5sums
cls._write_imagebytes_in_zip(zipf, metadata, arcname=f"{idx}.jpg")
assert len(sequence) == len(set(zipf.namelist()))
zipf.comment = json.dumps({"upload_md5sum": upload_md5sum}).encode("utf-8")

Expand Down Expand Up @@ -210,28 +216,10 @@ def extract_upload_md5sum(cls, zip_fp: T.IO[bytes]) -> str:

return upload_md5sum

@classmethod
def _uniq_arcname(cls, filename: Path, arcnames: set[str]):
arcname: str = filename.name

# make sure the arcname is unique, otherwise zipfile.extractAll will eliminate duplicated ones
arcname_idx = 0
while arcname in arcnames:
arcname_idx += 1
arcname = f"{filename.stem}_{arcname_idx}{filename.suffix}"

return arcname

@classmethod
def _write_imagebytes_in_zip(
cls,
zipf: zipfile.ZipFile,
metadata: types.ImageMetadata,
arcnames: set[str] | None = None,
cls, zipf: zipfile.ZipFile, metadata: types.ImageMetadata, arcname: str
):
if arcnames is None:
arcnames = set()

try:
edit = exif_write.ExifEdit(metadata.filename)
except struct.error as ex:
Expand All @@ -249,9 +237,6 @@ def _write_imagebytes_in_zip(
f"Failed to dump EXIF bytes: {ex}", metadata.filename
) from ex

arcname = cls._uniq_arcname(metadata.filename, arcnames)
arcnames.add(arcname)

zipinfo = zipfile.ZipInfo(arcname, date_time=(1980, 1, 1, 0, 0, 0))
zipf.writestr(zipinfo, image_bytes)

Expand Down Expand Up @@ -319,7 +304,7 @@ def prepare_images_and_upload(

with tempfile.NamedTemporaryFile() as fp:
try:
upload_md5sum = cls.zip_sequence_fp(sequence, fp)
upload_md5sum = cls.zip_sequence_deterministically(sequence, fp)
except Exception as ex:
yield sequence_uuid, UploadResult(error=ex)
continue
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_process_and_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _validate_uploads(upload_dir: py.path.local, expected):

actual = {}
for desc in descs:
actual[os.path.basename(desc["filename"])] = {
actual[os.path.basename(desc["MAPFilename"])] = {
k: v for k, v in desc.items() if k not in excludes
}

Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def test_upload_zip(
"MAPLongitude": 16.1840944,
"MAPCaptureTime": "2021_02_13_13_24_41_140",
"filename": str(test_exif),
"md5sum": None,
"md5sum": "1",
"filetype": "image",
"MAPSequenceUUID": "sequence_1",
},
Expand All @@ -155,7 +155,7 @@ def test_upload_zip(
"MAPLongitude": 16.1840944,
"MAPCaptureTime": "2021_02_13_13_24_41_140",
"filename": str(test_exif2),
"md5sum": None,
"md5sum": "2",
"filetype": "image",
"MAPSequenceUUID": "sequence_1",
},
Expand All @@ -164,7 +164,7 @@ def test_upload_zip(
"MAPLongitude": 16.1840944,
"MAPCaptureTime": "2021_02_13_13_25_41_140",
"filename": str(test_exif),
"md5sum": None,
"md5sum": "3",
"filetype": "image",
"MAPSequenceUUID": "sequence_2",
},
Expand Down
Loading