# pylint:disable=redefined-outer-name,unused-argument

import os
import tempfile
import hashlib
import random
from pathlib import Path
import asyncio
from typing import Set, List, Dict, Iterator, Optional, Tuple
from concurrent.futures import ProcessPoolExecutor
import string
import secrets


import pytest

from servicelib.archiving_utils import archive_dir, unarchive_dir

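# NOTE: the tests in this module are coroutines; they are assumed to run under
# an asyncio-aware pytest plugin (e.g. pytest-asyncio in auto mode), otherwise
# each test would need an explicit @pytest.mark.asyncio marker
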
@pytest.fixture
def temp_dir_one() -> Iterator[Path]:
    with tempfile.TemporaryDirectory() as temp_dir:
        yield Path(temp_dir)


@pytest.fixture
def temp_dir_two() -> Iterator[Path]:
    with tempfile.TemporaryDirectory() as temp_dir:
        yield Path(temp_dir)

@pytest.fixture
def dir_with_random_content() -> Iterator[Path]:
    def random_string(length: int) -> str:
        return "".join(secrets.choice(string.ascii_letters) for _ in range(length))

    def make_files_in_dir(dir_path: Path, file_count: int) -> None:
        for _ in range(file_count):
            (dir_path / f"{random_string(8)}.bin").write_bytes(
                os.urandom(random.randint(1, 10))
            )

    def ensure_dir(path_to_ensure: Path) -> Path:
        path_to_ensure.mkdir(parents=True, exist_ok=True)
        return path_to_ensure

    def make_subdirectory_with_content(subdir_name: Path, max_file_count: int) -> None:
        subdir_name = ensure_dir(subdir_name)
        make_files_in_dir(
            dir_path=subdir_name,
            file_count=random.randint(1, max_file_count),
        )

    def make_subdirectories_with_content(
        subdir_name: Path, max_subdirectories_count: int, max_file_count: int
    ) -> None:
        subdirectories_count = random.randint(1, max_subdirectories_count)
        for _ in range(subdirectories_count):
            make_subdirectory_with_content(
                subdir_name=subdir_name / f"{random_string(4)}",
                max_file_count=max_file_count,
            )

    def get_dirs_and_subdirs_in_path(path_to_scan: Path) -> List[Path]:
        return [path for path in path_to_scan.rglob("*") if path.is_dir()]

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir)
        data_container = ensure_dir(temp_dir_path / "study_data")

        make_subdirectories_with_content(
            subdir_name=data_container, max_subdirectories_count=5, max_file_count=5
        )
        make_files_in_dir(dir_path=data_container, file_count=5)

        # fan out the tree a few times to create a good amount of nested files
        for _ in range(4):
            for subdirectory_path in get_dirs_and_subdirs_in_path(data_container):
                make_subdirectories_with_content(
                    subdir_name=subdirectory_path,
                    max_subdirectories_count=3,
                    max_file_count=3,
                )

        yield temp_dir_path

def strip_directory_from_path(input_path: Path, to_strip: Path) -> Path:
    # e.g. ("/tmp/data/file.bin", "/tmp/data") -> "file.bin"; input_path is
    # expected to be located below to_strip
    return input_path.relative_to(to_strip)

def get_all_files_in_dir(dir_path: Path) -> Set[Path]:
    return {
        strip_directory_from_path(x, dir_path)
        for x in dir_path.rglob("*")
        if x.is_file()
    }

def _compute_hash(file_path: Path) -> Tuple[Path, str]:
    """Reads the file in chunks and returns its (path, md5 hexdigest) pair"""
    with open(file_path, "rb") as file_to_hash:
        file_hash = hashlib.md5()
        chunk = file_to_hash.read(8192)
        while chunk:
            file_hash.update(chunk)
            chunk = file_to_hash.read(8192)

    return file_path, file_hash.hexdigest()

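# NOTE: md5 hashing is CPU bound; the hashes are therefore computed on a
# process pool rather than directly on the event loop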
async def compute_hashes(file_paths: List[Path]) -> Dict[Path, str]:
    """Given a list of files, computes their hashes on a process pool"""

    loop = asyncio.get_event_loop()

    with ProcessPoolExecutor() as process_pool_executor:
        tasks = [
            loop.run_in_executor(process_pool_executor, _compute_hash, file_path)
            for file_path in file_paths
        ]
        # _compute_hash returns (Path, str) tuples; dict() maps the gathered
        # List[Tuple[Path, str]] to Dict[Path, str]
        return dict(await asyncio.gather(*tasks))

def full_file_path_from_dir_and_subdirs(dir_path: Path) -> List[Path]:
    return [x for x in dir_path.rglob("*") if x.is_file()]

async def assert_same_directory_content(
    dir_to_compress: Path,
    output_dir: Path,
    inject_relative_path: Optional[Path] = None,
) -> None:
    def _relative_path(input_path: Path) -> Path:
        # joins the injected path with input_path and drops the leading "/"
        return Path(str(inject_relative_path / str(input_path))[1:])

    input_set = get_all_files_in_dir(dir_to_compress)
    output_set = get_all_files_in_dir(output_dir)

    if inject_relative_path is not None:
        input_set = {_relative_path(x) for x in input_set}

    assert (
        input_set == output_set
    ), f"The following files are missing: {input_set - output_set}"

    # compute the hashes for dir_to_compress and map them in a dict
    # keyed by the path relative to the root of the directory
    dir_to_compress_hashes = {
        strip_directory_from_path(k, dir_to_compress): v
        for k, v in (
            await compute_hashes(full_file_path_from_dir_and_subdirs(dir_to_compress))
        ).items()
    }

    # compute the hashes for output_dir and map them in a dict
    # keyed by the path relative to the root of the directory
    output_dir_hashes = {
        strip_directory_from_path(k, output_dir): v
        for k, v in (
            await compute_hashes(full_file_path_from_dir_and_subdirs(output_dir))
        ).items()
    }

    # finally check that the hashes match one to one, verifying that the
    # compress/decompress round trip preserved the content
    for key in dir_to_compress_hashes:
        assert (
            dir_to_compress_hashes[key]
            == output_dir_hashes[_relative_path(key) if inject_relative_path else key]
        )


# end utils

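# NOTE: both tests exercise all 4 (compress, store_relative_path) combinations;
# when relative paths are not stored, the archive keeps the absolute source
# path, so the source directory is re-injected when comparing the trees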
@pytest.mark.parametrize(
    "compress,store_relative_path",
    [[True, True], [True, False], [False, True], [False, False]],
)
async def test_archive_unarchive_same_structure_dir(
    dir_with_random_content: Path,
    temp_dir_one: Path,
    temp_dir_two: Path,
    compress: bool,
    store_relative_path: bool,
):
    archive_file = temp_dir_one / "archive.zip"

    archive_result = await archive_dir(
        dir_to_compress=dir_with_random_content,
        destination=archive_file,
        store_relative_path=store_relative_path,
        compress=compress,
    )
    assert archive_result is True

    await unarchive_dir(
        archive_to_extract=archive_file, destination_folder=temp_dir_two
    )

    await assert_same_directory_content(
        dir_with_random_content,
        temp_dir_two,
        None if store_relative_path else dir_with_random_content,
    )

@pytest.mark.parametrize(
    "compress,store_relative_path",
    [[True, True], [True, False], [False, True], [False, False]],
)
async def test_unarchive_in_same_dir_as_archive(
    dir_with_random_content: Path,
    temp_dir_one: Path,
    compress: bool,
    store_relative_path: bool,
):
    archive_file = temp_dir_one / "archive.zip"

    archive_result = await archive_dir(
        dir_to_compress=dir_with_random_content,
        destination=archive_file,
        store_relative_path=store_relative_path,
        compress=compress,
    )
    assert archive_result is True

    await unarchive_dir(
        archive_to_extract=archive_file, destination_folder=temp_dir_one
    )

    # remove the archive, otherwise it would show up in the content comparison
    archive_file.unlink()
    await assert_same_directory_content(
        dir_with_random_content,
        temp_dir_one,
        None if store_relative_path else dir_with_random_content,
    )