@@ -158,19 +158,23 @@ def zip_images(
158158 zip_filename = zip_dir .joinpath (filename )
159159 with wip_file_context (wip_zip_filename , zip_filename ) as wip_path :
160160 with wip_path .open ("wb" ) as wip_fp :
161- actual_md5sum = cls .zip_sequence_fp (sequence , wip_fp )
161+ actual_md5sum = cls .zip_sequence_deterministically (sequence , wip_fp )
162162 assert actual_md5sum == upload_md5sum , "md5sum mismatch"
163163
164164 @classmethod
165- def zip_sequence_fp (
165+ def zip_sequence_deterministically (
166166 cls ,
167167 sequence : T .Sequence [types .ImageMetadata ],
168168 zip_fp : T .IO [bytes ],
169169 ) -> str :
170170 """
171- Write a sequence of ImageMetadata into the zipfile handle.
171+ Write a sequence of ImageMetadata into the zipfile handle. It should guarantee
172+ that the same sequence always produces the same zipfile, because the
173+ sequence md5sum will be used to upload the zipfile or resume the upload.
174+
172175 The sequence has to be one sequence and sorted.
173176 """
177+
174178 sequence_groups = types .group_and_sort_images (sequence )
175179 assert len (sequence_groups ) == 1 , (
176180 f"Only one sequence is allowed but got { len (sequence_groups )} : { list (sequence_groups .keys ())} "
@@ -179,9 +183,11 @@ def zip_sequence_fp(
179183 upload_md5sum = types .update_sequence_md5sum (sequence )
180184
181185 with zipfile .ZipFile (zip_fp , "w" , zipfile .ZIP_DEFLATED ) as zipf :
182- arcnames : set [str ] = set ()
183- for metadata in sequence :
184- cls ._write_imagebytes_in_zip (zipf , metadata , arcnames = arcnames )
186+ for idx , metadata in enumerate (sequence ):
187+ # Use {idx}.jpg (suffix does not matter) as the archive name to ensure the
188+ # resulting zipfile is deterministic. This determinism is based on the upload_md5sum,
189+ # which is derived from a list of image md5sums
190+ cls ._write_imagebytes_in_zip (zipf , metadata , arcname = f"{ idx } .jpg" )
185191 assert len (sequence ) == len (set (zipf .namelist ()))
186192 zipf .comment = json .dumps ({"upload_md5sum" : upload_md5sum }).encode ("utf-8" )
187193
@@ -210,28 +216,10 @@ def extract_upload_md5sum(cls, zip_fp: T.IO[bytes]) -> str:
210216
211217 return upload_md5sum
212218
213- @classmethod
214- def _uniq_arcname (cls , filename : Path , arcnames : set [str ]):
215- arcname : str = filename .name
216-
217- # make sure the arcname is unique, otherwise zipfile.extractAll will eliminate duplicated ones
218- arcname_idx = 0
219- while arcname in arcnames :
220- arcname_idx += 1
221- arcname = f"{ filename .stem } _{ arcname_idx } { filename .suffix } "
222-
223- return arcname
224-
225219 @classmethod
226220 def _write_imagebytes_in_zip (
227- cls ,
228- zipf : zipfile .ZipFile ,
229- metadata : types .ImageMetadata ,
230- arcnames : set [str ] | None = None ,
221+ cls , zipf : zipfile .ZipFile , metadata : types .ImageMetadata , arcname : str
231222 ):
232- if arcnames is None :
233- arcnames = set ()
234-
235223 try :
236224 edit = exif_write .ExifEdit (metadata .filename )
237225 except struct .error as ex :
@@ -249,9 +237,6 @@ def _write_imagebytes_in_zip(
249237 f"Failed to dump EXIF bytes: { ex } " , metadata .filename
250238 ) from ex
251239
252- arcname = cls ._uniq_arcname (metadata .filename , arcnames )
253- arcnames .add (arcname )
254-
255240 zipinfo = zipfile .ZipInfo (arcname , date_time = (1980 , 1 , 1 , 0 , 0 , 0 ))
256241 zipf .writestr (zipinfo , image_bytes )
257242
@@ -319,7 +304,7 @@ def prepare_images_and_upload(
319304
320305 with tempfile .NamedTemporaryFile () as fp :
321306 try :
322- upload_md5sum = cls .zip_sequence_fp (sequence , fp )
307+ upload_md5sum = cls .zip_sequence_deterministically (sequence , fp )
323308 except Exception as ex :
324309 yield sequence_uuid , UploadResult (error = ex )
325310 continue
0 commit comments