|
3 | 3 | # pylint: disable=unused-variable |
4 | 4 |
|
5 | 5 | import asyncio |
| 6 | +import hashlib |
6 | 7 | import mimetypes |
7 | 8 | import zipfile |
8 | 9 | from collections.abc import AsyncIterable |
@@ -375,3 +376,74 @@ async def test_pull_compressed_zip_file_from_remote( |
375 | 376 | assert file.exists() |
376 | 377 | assert file.name in file_names_within_zip_file |
377 | 378 | mocked_log_publishing_cb.assert_called() |
| 379 | + |
| 380 | + |
| 381 | +def _compute_hash(file_path: Path) -> str: |
| 382 | + with file_path.open("rb") as file_to_hash: |
| 383 | + file_hash = hashlib.sha256() |
| 384 | + chunk = file_to_hash.read(8192) |
| 385 | + while chunk: |
| 386 | + file_hash.update(chunk) |
| 387 | + chunk = file_to_hash.read(8192) |
| 388 | + |
| 389 | + return file_hash.hexdigest() |
| 390 | + |
| 391 | + |
async def test_push_file_to_remote_creates_reproducible_zip_archive(
    remote_parameters: StorageParameters,
    tmp_path: Path,
    faker: Faker,
    mocked_log_publishing_cb: mock.AsyncMock,
):
    """Pushing the same source file twice must produce byte-identical archives.

    Zip entries normally embed the file creation/modification time, which
    would change the archive hash between uploads; this test guards the
    deterministic-archive behavior by comparing the hashes of two pulls.
    """
    first_destination = parse_obj_as(
        AnyUrl, f"{remote_parameters.remote_file_url}1.zip"
    )
    second_destination = parse_obj_as(
        AnyUrl, f"{remote_parameters.remote_file_url}2.zip"
    )
    source_file = tmp_path / faker.file_name()
    source_file.write_text(faker.text())
    assert source_file.exists()

    # push the same file twice; both uploads should create the same archive
    await push_file_to_remote(
        source_file,
        first_destination,
        mocked_log_publishing_cb,
        remote_parameters.s3_settings,
    )
    # NOTE: we wait a bit to ensure the created zipfile has a different
    # creation time (that is normally used for computing the hash)
    await asyncio.sleep(5)
    await push_file_to_remote(
        source_file,
        second_destination,
        mocked_log_publishing_cb,
        remote_parameters.s3_settings,
    )

    # now we pull both files back and compare their hashes

    # USE-CASE 1: a .zip destination is downloaded as-is (no decompression)
    download_dir = tmp_path / "download"
    download_dir.mkdir(parents=True, exist_ok=True)
    assert download_dir.exists()
    local_copies = (
        download_dir / f"{faker.file_name()}1.zip",
        download_dir / f"{faker.file_name()}2.zip",
    )

    for remote_url, local_copy in zip(
        (first_destination, second_destination), local_copies
    ):
        await pull_file_from_remote(
            src_url=remote_url,
            target_mime_type=None,
            dst_path=local_copy,
            log_publishing_cb=mocked_log_publishing_cb,
            s3_settings=remote_parameters.s3_settings,
        )
        assert local_copy.exists()

    # identical digests prove the archive bytes are reproducible
    assert _compute_hash(local_copies[0]) == _compute_hash(local_copies[1])
0 commit comments