Skip to content

Commit 4c5aa41

Browse files
authored
🎨Dask sidecar: use reproducible zipfile library (#6571)
1 parent 6e8867b commit 4c5aa41

File tree

4 files changed

+85
-6
lines changed

4 files changed

+85
-6
lines changed

services/dask-sidecar/requirements/_base.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ fsspec[http, s3] # sub types needed as we access http and s3 here
2626
lz4 # for compression
2727
pydantic[email,dotenv]
2828
prometheus_client
29+
repro-zipfile

services/dask-sidecar/requirements/_base.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,9 @@ referencing==0.29.3
325325
# jsonschema
326326
# jsonschema-specifications
327327
repro-zipfile==0.3.1
328-
# via -r requirements/../../../packages/service-library/requirements/_base.in
328+
# via
329+
# -r requirements/../../../packages/service-library/requirements/_base.in
330+
# -r requirements/_base.in
329331
requests==2.32.3
330332
# via opentelemetry-exporter-otlp-proto-http
331333
rich==13.7.1

services/dask-sidecar/src/simcore_service_dask_sidecar/file_utils.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
import time
66
import zipfile
77
from collections.abc import Awaitable, Callable
8-
from io import BytesIO
8+
from io import IOBase
99
from pathlib import Path
1010
from typing import Any, Final, TypedDict, cast
1111

1212
import aiofiles
1313
import aiofiles.tempfile
1414
import fsspec # type: ignore[import-untyped]
15+
import repro_zipfile # type: ignore[import-untyped]
1516
from pydantic import ByteSize, FileUrl, parse_obj_as
1617
from pydantic.networks import AnyUrl
1718
from servicelib.logging_utils import LogLevelInt, LogMessageStr
@@ -33,7 +34,7 @@ def _file_progress_cb(
3334
log_publishing_cb: LogPublishingCB,
3435
text_prefix: str,
3536
main_loop: asyncio.AbstractEventLoop,
36-
**kwargs,
37+
**kwargs, # noqa: ARG001
3738
):
3839
asyncio.run_coroutine_threadsafe(
3940
log_publishing_cb(
@@ -78,7 +79,7 @@ def _s3fs_settings_from_s3_settings(s3_settings: S3Settings) -> S3FsSettingsDict
7879
return s3fs_settings
7980

8081

81-
def _file_chunk_streamer(src: BytesIO, dst: BytesIO):
82+
def _file_chunk_streamer(src: IOBase, dst: IOBase):
8283
data = src.read(CHUNK_SIZE)
8384
segment_len = dst.write(data)
8485
return (data, segment_len)
@@ -98,6 +99,8 @@ async def _copy_file(
9899
with fsspec.open(src_url, mode="rb", **src_storage_kwargs) as src_fp, fsspec.open(
99100
dst_url, "wb", **dst_storage_kwargs
100101
) as dst_fp:
102+
assert isinstance(src_fp, IOBase) # nosec
103+
assert isinstance(dst_fp, IOBase) # nosec
101104
file_size = getattr(src_fp, "size", None)
102105
data_read = True
103106
total_data_written = 0
@@ -159,7 +162,7 @@ async def pull_file_from_remote(
159162
if src_mime_type == _ZIP_MIME_TYPE and target_mime_type != _ZIP_MIME_TYPE:
160163
await log_publishing_cb(f"Uncompressing '{dst_path.name}'...", logging.INFO)
161164
logger.debug("%s is a zip file and will be now uncompressed", dst_path)
162-
with zipfile.ZipFile(dst_path, "r") as zip_obj:
165+
with repro_zipfile.ReproducibleZipFile(dst_path, "r") as zip_obj:
163166
await asyncio.get_event_loop().run_in_executor(
164167
None, zip_obj.extractall, dst_path.parents[0]
165168
)
@@ -248,7 +251,8 @@ async def push_file_to_remote(
248251
f"Compressing '{src_path.name}' to '{archive_file_path.name}'...",
249252
logging.INFO,
250253
)
251-
with zipfile.ZipFile(
254+
255+
with repro_zipfile.ReproducibleZipFile(
252256
archive_file_path, mode="w", compression=zipfile.ZIP_STORED
253257
) as zfp:
254258
await asyncio.get_event_loop().run_in_executor(

services/dask-sidecar/tests/unit/test_file_utils.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# pylint: disable=unused-variable
44

55
import asyncio
6+
import hashlib
67
import mimetypes
78
import zipfile
89
from collections.abc import AsyncIterable
@@ -375,3 +376,74 @@ async def test_pull_compressed_zip_file_from_remote(
375376
assert file.exists()
376377
assert file.name in file_names_within_zip_file
377378
mocked_log_publishing_cb.assert_called()
379+
380+
381+
def _compute_hash(file_path: Path) -> str:
382+
with file_path.open("rb") as file_to_hash:
383+
file_hash = hashlib.sha256()
384+
chunk = file_to_hash.read(8192)
385+
while chunk:
386+
file_hash.update(chunk)
387+
chunk = file_to_hash.read(8192)
388+
389+
return file_hash.hexdigest()
390+
391+
392+
async def test_push_file_to_remote_creates_reproducible_zip_archive(
393+
remote_parameters: StorageParameters,
394+
tmp_path: Path,
395+
faker: Faker,
396+
mocked_log_publishing_cb: mock.AsyncMock,
397+
):
398+
destination_url1 = parse_obj_as(AnyUrl, f"{remote_parameters.remote_file_url}1.zip")
399+
destination_url2 = parse_obj_as(AnyUrl, f"{remote_parameters.remote_file_url}2.zip")
400+
src_path = tmp_path / faker.file_name()
401+
TEXT_IN_FILE = faker.text()
402+
src_path.write_text(TEXT_IN_FILE)
403+
assert src_path.exists()
404+
405+
# pushing 2 times should produce the same archive with the same hash
406+
await push_file_to_remote(
407+
src_path,
408+
destination_url1,
409+
mocked_log_publishing_cb,
410+
remote_parameters.s3_settings,
411+
)
412+
await asyncio.sleep(
413+
5
414+
) # NOTE: wait a bit so that the two archives are created at different times (timestamps stored in a zip archive normally change its bytes and therefore its hash)
415+
await push_file_to_remote(
416+
src_path,
417+
destination_url2,
418+
mocked_log_publishing_cb,
419+
remote_parameters.s3_settings,
420+
)
421+
422+
# now we pull both files and compare their hashes
423+
424+
# USE-CASE 1: if destination is a zip then no decompression is done
425+
download_folder = tmp_path / "download"
426+
download_folder.mkdir(parents=True, exist_ok=True)
427+
assert download_folder.exists()
428+
dst_path1 = download_folder / f"{faker.file_name()}1.zip"
429+
dst_path2 = download_folder / f"{faker.file_name()}2.zip"
430+
431+
await pull_file_from_remote(
432+
src_url=destination_url1,
433+
target_mime_type=None,
434+
dst_path=dst_path1,
435+
log_publishing_cb=mocked_log_publishing_cb,
436+
s3_settings=remote_parameters.s3_settings,
437+
)
438+
assert dst_path1.exists()
439+
440+
await pull_file_from_remote(
441+
src_url=destination_url2,
442+
target_mime_type=None,
443+
dst_path=dst_path2,
444+
log_publishing_cb=mocked_log_publishing_cb,
445+
s3_settings=remote_parameters.s3_settings,
446+
)
447+
assert dst_path2.exists()
448+
449+
assert _compute_hash(dst_path1) == _compute_hash(dst_path2)

0 commit comments

Comments
 (0)