44import logging
55import types
66import zipfile
7- from contextlib import AsyncExitStack , contextmanager
7+ from collections .abc import Awaitable , Callable , Iterator
8+ from contextlib import AsyncExitStack , contextmanager , suppress
89from functools import partial
910from pathlib import Path
10- from typing import Any , Awaitable , Callable , Final , Iterator
11+ from typing import Any , Final
1112
1213import tqdm
1314from models_library .basic_types import IDStr
15+ from pydantic import NonNegativeFloat
16+ from repro_zipfile import ReproducibleZipFile # type: ignore[import-untyped]
1417from tqdm .contrib .logging import logging_redirect_tqdm , tqdm_logging_redirect
1518
1619from .file_utils import remove_directory
_MIN: Final[int] = 60  # secs — seconds per minute; presumably used for timeout math elsewhere in the module (TODO confirm against callers)
_MAX_UNARCHIVING_WORKER_COUNT: Final[int] = 2  # cap on parallel extraction workers — NOTE(review): pool usage is outside this view, confirm
_CHUNK_SIZE: Final[int] = 1024 * 8  # bytes read per iteration when streaming files out of an archive
_UNIT_MULTIPLIER: Final[NonNegativeFloat] = 1024.0  # binary (IEC) unit step used by _human_readable_size (B -> KiB -> MiB ...)

# module-level logger, named after this module per logging convention
_logger = logging.getLogger(__name__)
2630
2731
2832class ArchiveError (Exception ):
def _human_readable_size(size, decimal_places=3):
    """Format a byte count as a human-readable IEC string, e.g. ``1.500KiB``.

    :param size: number of bytes (any numeric type accepted by ``float``)
    :param decimal_places: digits after the decimal point in the result
    :return: formatted string ``"<value><unit>"`` with unit in B..TiB
    """
    human_readable_file_size = float(size)
    unit = "B"
    for t_unit in ["B", "KiB", "MiB", "GiB", "TiB"]:
        if human_readable_file_size < _UNIT_MULTIPLIER:
            unit = t_unit
            break
        human_readable_file_size /= _UNIT_MULTIPLIER
    else:
        # BUGFIX: for sizes >= 1024 TiB the loop used to exhaust without
        # breaking, leaving unit == "B" while the value had been divided
        # 5 times (e.g. 2*1024**5 bytes rendered as "2.000B").
        # Clamp to TiB and undo the extra division performed on the last pass.
        human_readable_file_size *= _UNIT_MULTIPLIER
        unit = "TiB"

    return f"{human_readable_file_size:.{decimal_places}f}{unit}"
@@ -56,19 +60,21 @@ def _iter_files_to_compress(
5660 dir_path : Path , exclude_patterns : set [str ] | None
5761) -> Iterator [Path ]:
5862 exclude_patterns = exclude_patterns if exclude_patterns else set ()
59- for path in dir_path .rglob ("*" ):
63+ # NOTE: make sure to sort paths othrwise between different runs
64+ # the zip will have a different structure and hash
65+ for path in sorted (dir_path .rglob ("*" )):
6066 if path .is_file () and not any (
6167 fnmatch .fnmatch (f"{ path } " , x ) for x in exclude_patterns
6268 ):
6369 yield path
6470
6571
6672def _strip_directory_from_path (input_path : Path , to_strip : Path ) -> Path :
67- _to_strip = f"{ str ( to_strip ) } /"
73+ _to_strip = f"{ to_strip } /"
6874 return Path (str (input_path ).replace (_to_strip , "" ))
6975
7076
71- class _FastZipFileReader (zipfile . ZipFile ):
77+ class _FastZipFileReader (ReproducibleZipFile ):
7278 """
7379 Used to gain a speed boost of several orders of magnitude.
7480
@@ -86,7 +92,7 @@ class _FastZipFileReader(zipfile.ZipFile):
8692 files contained in the archive.
8793 """
8894
89- def _RealGetContents (self ):
95+ def _RealGetContents (self ): # noqa: N802
9096 """method disabled"""
9197
9298
@@ -107,7 +113,7 @@ def _zipfile_single_file_extract_worker(
107113 zip_file_path : Path ,
108114 file_in_archive : zipfile .ZipInfo ,
109115 destination_folder : Path ,
110- is_dir : bool ,
116+ is_dir : bool , # noqa: FBT001
111117) -> Path :
112118 """Extracts file_in_archive from the archive zip_file_path -> destination_folder/file_in_archive
113119
@@ -129,7 +135,7 @@ def _zipfile_single_file_extract_worker(
129135 desc = desc ,
130136 ** (
131137 _TQDM_FILE_OPTIONS
132- | dict ( miniters = _compute_tqdm_miniters (file_in_archive .file_size ))
138+ | { " miniters" : _compute_tqdm_miniters (file_in_archive .file_size )}
133139 ),
134140 ) as pbar :
135141 while chunk := zip_fp .read (_CHUNK_SIZE ):
@@ -139,7 +145,7 @@ def _zipfile_single_file_extract_worker(
139145
140146
141147def _ensure_destination_subdirectories_exist (
142- zip_file_handler : zipfile . ZipFile , destination_folder : Path
148+ zip_file_handler : ReproducibleZipFile , destination_folder : Path
143149) -> None :
144150 # assemble full destination paths
145151 full_destination_paths = {
@@ -177,7 +183,7 @@ async def unarchive_dir(
177183 )
178184 async with AsyncExitStack () as zip_stack :
179185 zip_file_handler = zip_stack .enter_context (
180- zipfile . ZipFile ( # pylint: disable=consider-using-with
186+ ReproducibleZipFile ( # pylint: disable=consider-using-with
181187 archive_to_extract ,
182188 mode = "r" ,
183189 )
@@ -232,7 +238,7 @@ async def unarchive_dir(
232238 extracted_path = await future
233239 extracted_file_size = extracted_path .stat ().st_size
234240 if tqdm_progress .update (extracted_file_size ) and log_cb :
235- with log_catch (log , reraise = False ):
241+ with log_catch (_logger , reraise = False ):
236242 await log_cb (f"{ tqdm_progress } " )
237243 await sub_prog .update (extracted_file_size )
238244 extracted_paths .append (extracted_path )
@@ -266,34 +272,37 @@ async def unarchive_dir(
266272
@contextmanager
def _progress_enabled_zip_write_handler(
    zip_file_handler: ReproducibleZipFile, progress_bar: tqdm.tqdm
) -> Iterator[ReproducibleZipFile]:
    """Temporarily instruments the archive's underlying file object so that
    every ``write`` advances ``progress_bar`` by the number of bytes written;
    the original ``write`` is restored on exit."""

    def _counting_write(
        wrapped_write,
        _zip_fp,  # pylint: disable=unused-argument # noqa: ARG001
        chunk,
        bar,
    ):
        bar.update(len(chunk))
        return wrapped_write(chunk)

    # Swap in a bound wrapper that forwards to the real write() while
    # reporting progress
    assert zip_file_handler.fp  # nosec
    plain_write = zip_file_handler.fp.write
    zip_file_handler.fp.write = types.MethodType(
        partial(_counting_write, plain_write, bar=progress_bar),
        zip_file_handler.fp,
    )
    try:
        yield zip_file_handler
    finally:
        # always undo the monkey-patch, even if the caller raised
        zip_file_handler.fp.write = plain_write
290299
291300
292301def _add_to_archive (
293302 dir_to_compress : Path ,
294303 destination : Path ,
295- compress : bool ,
296- store_relative_path : bool ,
304+ compress : bool , # noqa: FBT001
305+ store_relative_path : bool , # noqa: FBT001
297306 update_progress ,
298307 loop ,
299308 exclude_patterns : set [str ] | None = None ,
@@ -308,11 +317,10 @@ def _add_to_archive(
308317 desc = f"{ desc } \n " ,
309318 total = folder_size_bytes ,
310319 ** (
311- _TQDM_FILE_OPTIONS
312- | dict (miniters = _compute_tqdm_miniters (folder_size_bytes ))
320+ _TQDM_FILE_OPTIONS | {"miniters" : _compute_tqdm_miniters (folder_size_bytes )}
313321 ),
314322 ) as progress_bar , _progress_enabled_zip_write_handler (
315- zipfile . ZipFile (destination , "w" , compression = compression ), progress_bar
323+ ReproducibleZipFile (destination , "w" , compression = compression ), progress_bar
316324 ) as zip_file_handler :
317325 for file_to_add in _iter_files_to_compress (dir_to_compress , exclude_patterns ):
318326 progress_bar .set_description (f"{ desc } /{ file_to_add .name } \n " )
@@ -393,10 +401,11 @@ async def archive_dir(
393401 if destination .is_file ():
394402 destination .unlink (missing_ok = True )
395403
396- raise ArchiveError (
404+ msg = (
397405 f"Failed archiving { dir_to_compress } -> { destination } due to { type (err )} ."
398406 f"Details: { err } "
399- ) from err
407+ )
408+ raise ArchiveError (msg ) from err
400409
401410 except BaseException :
402411 if destination .is_file ():
@@ -453,11 +462,9 @@ def prune(self, exclude: set[Path]) -> None:
453462 if path .is_file ():
454463 path .unlink ()
455464 elif path .is_dir ():
456- try :
465+ # prevents deleting non-empty folders
466+ with suppress (OSError ):
457467 path .rmdir ()
458- except OSError :
459- # prevents deleting non-empty folders
460- pass
461468
462469 # second pass to delete empty folders
463470 # after deleting files, some folders might have been left empty
0 commit comments