Skip to content

Commit 17b3da2

Browse files
author
Andrei Neagu
committed
zip files are now predictable
1 parent 0ed6d0a commit 17b3da2

File tree

6 files changed

+121
-22
lines changed

6 files changed

+121
-22
lines changed

packages/service-library/requirements/_base.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ pydantic
2424
pyinstrument
2525
pyyaml
2626
redis
27+
repro-zipfile
2728
tenacity
2829
toolz
2930
tqdm

packages/service-library/requirements/_base.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ referencing==0.29.3
172172
# -c requirements/./constraints.txt
173173
# jsonschema
174174
# jsonschema-specifications
175+
repro-zipfile==0.3.1
176+
# via -r requirements/_base.in
175177
requests==2.32.3
176178
# via opentelemetry-exporter-otlp-proto-http
177179
rich==13.8.1

packages/service-library/requirements/_test.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ coverage
1717
docker
1818
faker
1919
flaky
20+
numpy
2021
openapi-spec-validator
22+
pillow
2123
pytest
2224
pytest-aiohttp
2325
pytest-asyncio

packages/service-library/requirements/_test.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ mypy==1.11.2
121121
# via sqlalchemy
122122
mypy-extensions==1.0.0
123123
# via mypy
124+
numpy==2.1.1
125+
# via -r requirements/_test.in
124126
openapi-schema-validator==0.6.2
125127
# via
126128
# -c requirements/_aiohttp.txt
@@ -137,6 +139,8 @@ pathable==0.4.3
137139
# via
138140
# -c requirements/_aiohttp.txt
139141
# jsonschema-path
142+
pillow==10.4.0
143+
# via -r requirements/_test.in
140144
pluggy==1.5.0
141145
# via pytest
142146
pprintpp==0.4.0

packages/service-library/src/servicelib/archiving_utils.py

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
import logging
55
import types
66
import zipfile
7-
from contextlib import AsyncExitStack, contextmanager
7+
from collections.abc import Awaitable, Callable, Iterator
8+
from contextlib import AsyncExitStack, contextmanager, suppress
89
from functools import partial
910
from pathlib import Path
10-
from typing import Any, Awaitable, Callable, Final, Iterator
11+
from typing import Any, Final
1112

1213
import tqdm
1314
from models_library.basic_types import IDStr
15+
from repro_zipfile import ReproducibleZipFile
1416
from tqdm.contrib.logging import logging_redirect_tqdm, tqdm_logging_redirect
1517

1618
from .file_utils import remove_directory
@@ -22,7 +24,7 @@
2224
_MAX_UNARCHIVING_WORKER_COUNT: Final[int] = 2
2325
_CHUNK_SIZE: Final[int] = 1024 * 8
2426

25-
log = logging.getLogger(__name__)
27+
_logger = logging.getLogger(__name__)
2628

2729

2830
class ArchiveError(Exception):
@@ -56,19 +58,21 @@ def _iter_files_to_compress(
5658
dir_path: Path, exclude_patterns: set[str] | None
5759
) -> Iterator[Path]:
5860
exclude_patterns = exclude_patterns if exclude_patterns else set()
59-
for path in dir_path.rglob("*"):
61+
# NOTE: make sure to sort paths otherwise between different runs
62+
# the zip will have a different structure and hash
63+
for path in sorted(dir_path.rglob("*")):
6064
if path.is_file() and not any(
6165
fnmatch.fnmatch(f"{path}", x) for x in exclude_patterns
6266
):
6367
yield path
6468

6569

6670
def _strip_directory_from_path(input_path: Path, to_strip: Path) -> Path:
67-
_to_strip = f"{str(to_strip)}/"
71+
_to_strip = f"{to_strip}/"
6872
return Path(str(input_path).replace(_to_strip, ""))
6973

7074

71-
class _FastZipFileReader(zipfile.ZipFile):
75+
class _FastZipFileReader(ReproducibleZipFile):
7276
"""
7377
Used to gain a speed boost of several orders of magnitude.
7478
@@ -129,7 +133,7 @@ def _zipfile_single_file_extract_worker(
129133
desc=desc,
130134
**(
131135
_TQDM_FILE_OPTIONS
132-
| dict(miniters=_compute_tqdm_miniters(file_in_archive.file_size))
136+
| {"miniters": _compute_tqdm_miniters(file_in_archive.file_size)}
133137
),
134138
) as pbar:
135139
while chunk := zip_fp.read(_CHUNK_SIZE):
@@ -139,7 +143,7 @@ def _zipfile_single_file_extract_worker(
139143

140144

141145
def _ensure_destination_subdirectories_exist(
142-
zip_file_handler: zipfile.ZipFile, destination_folder: Path
146+
zip_file_handler: ReproducibleZipFile, destination_folder: Path
143147
) -> None:
144148
# assemble full destination paths
145149
full_destination_paths = {
@@ -177,7 +181,7 @@ async def unarchive_dir(
177181
)
178182
async with AsyncExitStack() as zip_stack:
179183
zip_file_handler = zip_stack.enter_context(
180-
zipfile.ZipFile( # pylint: disable=consider-using-with
184+
ReproducibleZipFile( # pylint: disable=consider-using-with
181185
archive_to_extract,
182186
mode="r",
183187
)
@@ -232,7 +236,7 @@ async def unarchive_dir(
232236
extracted_path = await future
233237
extracted_file_size = extracted_path.stat().st_size
234238
if tqdm_progress.update(extracted_file_size) and log_cb:
235-
with log_catch(log, reraise=False):
239+
with log_catch(_logger, reraise=False):
236240
await log_cb(f"{tqdm_progress}")
237241
await sub_prog.update(extracted_file_size)
238242
extracted_paths.append(extracted_path)
@@ -266,8 +270,8 @@ async def unarchive_dir(
266270

267271
@contextmanager
268272
def _progress_enabled_zip_write_handler(
269-
zip_file_handler: zipfile.ZipFile, progress_bar: tqdm.tqdm
270-
) -> Iterator[zipfile.ZipFile]:
273+
zip_file_handler: ReproducibleZipFile, progress_bar: tqdm.tqdm
274+
) -> Iterator[ReproducibleZipFile]:
271275
"""This function overrides the default zip write fct to allow to get progress using tqdm library"""
272276

273277
def _write_with_progress(
@@ -308,11 +312,10 @@ def _add_to_archive(
308312
desc=f"{desc}\n",
309313
total=folder_size_bytes,
310314
**(
311-
_TQDM_FILE_OPTIONS
312-
| dict(miniters=_compute_tqdm_miniters(folder_size_bytes))
315+
_TQDM_FILE_OPTIONS | {"miniters": _compute_tqdm_miniters(folder_size_bytes)}
313316
),
314317
) as progress_bar, _progress_enabled_zip_write_handler(
315-
zipfile.ZipFile(destination, "w", compression=compression), progress_bar
318+
ReproducibleZipFile(destination, "w", compression=compression), progress_bar
316319
) as zip_file_handler:
317320
for file_to_add in _iter_files_to_compress(dir_to_compress, exclude_patterns):
318321
progress_bar.set_description(f"{desc}/{file_to_add.name}\n")
@@ -393,10 +396,11 @@ async def archive_dir(
393396
if destination.is_file():
394397
destination.unlink(missing_ok=True)
395398

396-
raise ArchiveError(
399+
msg = (
397400
f"Failed archiving {dir_to_compress} -> {destination} due to {type(err)}."
398401
f"Details: {err}"
399-
) from err
402+
)
403+
raise ArchiveError(msg) from err
400404

401405
except BaseException:
402406
if destination.is_file():
@@ -453,11 +457,9 @@ def prune(self, exclude: set[Path]) -> None:
453457
if path.is_file():
454458
path.unlink()
455459
elif path.is_dir():
456-
try:
460+
# prevents deleting non-empty folders
461+
with suppress(OSError):
457462
path.rmdir()
458-
except OSError:
459-
# prevents deleting non-empty folders
460-
pass
461463

462464
# second pass to delete empty folders
463465
# after deleting files, some folders might have been left empty

packages/service-library/tests/test_archiving_utils.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,20 @@
1111
import secrets
1212
import string
1313
import tempfile
14+
from collections.abc import AsyncIterable, Callable, Iterable, Iterator
1415
from concurrent.futures import ProcessPoolExecutor
1516
from dataclasses import dataclass
1617
from pathlib import Path
17-
from typing import Callable, Iterable, Iterator
1818

19+
import numpy
1920
import pytest
2021
from faker import Faker
22+
from PIL import Image
2123
from pydantic import ByteSize, parse_obj_as
2224
from pytest_benchmark.plugin import BenchmarkFixture
2325
from servicelib import archiving_utils
2426
from servicelib.archiving_utils import ArchiveError, archive_dir, unarchive_dir
27+
from servicelib.file_utils import remove_directory
2528

2629

2730
def _print_tree(path: Path, level=0):
@@ -596,3 +599,88 @@ def run_async_test(*args, **kwargs):
596599
)
597600

598601
benchmark(run_async_test)
602+
603+
604+
def _touch_all_files_in_path(path_to_archive: Path) -> None:
605+
for path in path_to_archive.rglob("*"):
606+
print("touching", path)
607+
path.touch()
608+
609+
610+
@pytest.fixture
611+
async def mixed_file_types(tmp_path: Path, faker: Faker) -> AsyncIterable[Path]:
612+
"""Directory with well known structure"""
613+
base_dir = tmp_path / "mixed_types_dir"
614+
base_dir.mkdir()
615+
(base_dir / "empty").mkdir()
616+
(base_dir / "d1").mkdir()
617+
(base_dir / "d1" / "f1.txt").write_text(faker.text())
618+
(base_dir / "d1" / "b2.bin").write_bytes(faker.json_bytes())
619+
(base_dir / "d1" / "sd1").mkdir()
620+
(base_dir / "d1" / "sd1" / "f1.txt").write_text(faker.text())
621+
(base_dir / "d1" / "sd1" / "b2.bin").write_bytes(faker.json_bytes())
622+
(base_dir / "images").mkdir()
623+
624+
# generate some data that can make the test fail, images
625+
for i in range(2):
626+
image_dir = base_dir / f"images{i}"
627+
image_dir.mkdir()
628+
for n in range(50):
629+
a = numpy.random.rand(900, 900, 3) * 255 # noqa: NPY002
630+
im_out = Image.fromarray(a.astype("uint8")).convert("RGB")
631+
image_path = image_dir / f"out{n}.jpg"
632+
im_out.save(image_path)
633+
634+
print("mixed_types_dir ---")
635+
_print_tree(base_dir)
636+
637+
yield base_dir
638+
639+
await remove_directory(base_dir)
640+
assert not base_dir.exists()
641+
642+
643+
# strangely enough no issues here
644+
@pytest.mark.parametrize(
645+
"store_relative_path, compress",
646+
[
647+
pytest.param(True, False, id="nodeports_options"),
648+
pytest.param(False, False, id="no_relative_path_no_compress"),
649+
pytest.param(True, True, id="with_relative_path_with_compression"),
650+
pytest.param(False, True, id="no_relative_path_with_compression"),
651+
],
652+
)
653+
async def test_regression_archive_hash_does_not_change(
654+
mixed_file_types: Path,
655+
tmp_path: Path,
656+
store_relative_path: bool,
657+
compress: bool,
658+
):
659+
destination_path = tmp_path / "archives_to_compare"
660+
destination_path.mkdir(parents=True, exist_ok=True)
661+
662+
first_archive = destination_path / "first"
663+
second_archive = destination_path / "second"
664+
assert not first_archive.exists()
665+
assert not second_archive.exists()
666+
assert first_archive != second_archive
667+
668+
await archive_dir(
669+
mixed_file_types,
670+
first_archive,
671+
compress=compress,
672+
store_relative_path=store_relative_path,
673+
)
674+
675+
_touch_all_files_in_path(mixed_file_types)
676+
677+
await archive_dir(
678+
mixed_file_types,
679+
second_archive,
680+
compress=compress,
681+
store_relative_path=store_relative_path,
682+
)
683+
684+
_, first_hash = _compute_hash(first_archive)
685+
_, second_hash = _compute_hash(second_archive)
686+
assert first_hash == second_hash

0 commit comments

Comments
 (0)