|
2 | 2 | import sys |
3 | 3 | import zipfile |
4 | 4 | from pathlib import Path |
5 | | -from typing import Collection, List, Tuple |
| 5 | +from typing import Collection, List, Tuple, Type, TypeVar |
6 | 6 |
|
7 | 7 | import attr |
8 | 8 | import pytest |
9 | 9 |
|
| 10 | +from unblob import handlers |
10 | 11 | from unblob.models import UnknownChunk, ValidChunk |
11 | 12 | from unblob.processing import ( |
12 | 13 | ExtractionConfig, |
13 | 14 | calculate_buffer_size, |
14 | 15 | calculate_entropy, |
15 | 16 | calculate_unknown_chunks, |
16 | | - draw_entropy_plot, |
| 17 | + format_entropy_plot, |
17 | 18 | process_file, |
18 | 19 | remove_inner_chunks, |
19 | 20 | ) |
20 | | -from unblob.report import ExtractDirectoryExistsReport, StatReport |
| 21 | +from unblob.report import ( |
| 22 | + EntropyReport, |
| 23 | + ExtractDirectoryExistsReport, |
| 24 | + StatReport, |
| 25 | + UnknownChunkReport, |
| 26 | +) |
| 27 | + |
| 28 | +T = TypeVar("T") |
21 | 29 |
|
22 | 30 |
|
23 | 31 | def assert_same_chunks(expected, actual, explanation=None): |
@@ -129,32 +137,34 @@ def test_calculate_buffer_size( |
129 | 137 | ) |
130 | 138 |
|
131 | 139 |
|
132 | | -def test_draw_entropy_plot_error(): |
| 140 | +def test_format_entropy_plot_error(): |
133 | 141 | with pytest.raises(TypeError): |
134 | | - draw_entropy_plot([]) |
| 142 | + format_entropy_plot(percentages=[], buffer_size=1024) |
135 | 143 |
|
136 | 144 |
|
137 | 145 | @pytest.mark.parametrize( |
138 | | - "percentages", |
| 146 | + "percentages, buffer_size", |
139 | 147 | [ |
140 | | - pytest.param([0.0] * 100, id="zero-array"), |
141 | | - pytest.param([99.99] * 100, id="99-array"), |
142 | | - pytest.param([100.0] * 100, id="100-array"), |
| 148 | + pytest.param([0.0] * 100, 1024, id="zero-array"), |
| 149 | + pytest.param([99.99] * 100, 1024, id="99-array"), |
| 150 | + pytest.param([100.0] * 100, 1024, id="100-array"), |
| 151 | + pytest.param([100.0] * 100, -1, id="buffer_size-can-be-anything1"), |
| 152 | + pytest.param([100.0] * 100, None, id="buffer_size-can-be-anything2"), |
| 153 | + pytest.param([100.0] * 100, "None", id="buffer_size-can-be-anything3"), |
143 | 154 | ], |
144 | 155 | ) |
145 | | -def test_draw_entropy_plot_no_exception(percentages: List[float]): |
146 | | - assert draw_entropy_plot(percentages) is None |
| 156 | +def test_format_entropy_plot_no_exception(percentages: List[float], buffer_size: int): |
| 157 | + assert str(buffer_size) in format_entropy_plot( |
| 158 | + percentages=percentages, buffer_size=buffer_size |
| 159 | + ) |
147 | 160 |
|
148 | 161 |
|
149 | | -@pytest.mark.parametrize( |
150 | | - "path, draw_plot", |
151 | | - [ |
152 | | - pytest.param(Path(sys.executable), True, id="draw-plot"), |
153 | | - pytest.param(Path(sys.executable), False, id="no-plot"), |
154 | | - ], |
155 | | -) |
156 | | -def test_calculate_entropy_no_exception(path: Path, draw_plot: bool): |
157 | | - assert calculate_entropy(path, draw_plot=draw_plot) is None |
| 162 | +def test_calculate_entropy_no_exception(): |
| 163 | + report = calculate_entropy(Path(sys.executable)) |
| 164 | + format_entropy_plot( |
| 165 | + percentages=report.percentages, |
| 166 | + buffer_size=report.buffer_size, |
| 167 | + ) |
158 | 168 |
|
159 | 169 |
|
160 | 170 | @pytest.mark.parametrize( |
@@ -311,3 +321,71 @@ def test_processing_with_non_posix_paths(tmp_path: Path): |
311 | 321 | is_link=False, |
312 | 322 | link_target=None, |
313 | 323 | ) |
| 324 | + |
| 325 | + |
| 326 | +def test_entropy_calculation(tmp_path: Path): |
| 327 | + """Process a file made of a zip chunk followed by an unknown chunk, with entropy calculation enabled.
| 328 | +
|
| 329 | + The input file structure is |
| 330 | + - zip-chunk |
| 331 | + - empty.txt |
| 332 | + - 0-255.bin |
| 333 | + - unknown_chunk |
| 334 | + """ |
| 335 | + # |
| 336 | + # ** input |
| 337 | + |
| 338 | + input_file = tmp_path / "input-file" |
| 339 | + with zipfile.ZipFile(input_file, "w") as zf: |
| 340 | + zf.writestr("empty.txt", data=b"") |
| 341 | + zf.writestr("0-255.bin", data=bytes(range(256))) |
| 342 | + |
| 343 | + # entropy is calculated in 1 KiB blocks for files smaller than 80 KiB
| 344 | + # so let's have 1 block with 0 bits of entropy, 1 with 6 bits, and the rest with 8 bits
| 345 | + unknown_chunk_content = ( |
| 346 | + bytes(1024) + bytes(range(64)) * 4 * 4 + bytes(range(256)) * 4 * 62 |
| 347 | + ) |
| 348 | + with input_file.open("ab") as f: |
| 349 | + f.write(unknown_chunk_content) |
| 350 | + |
| 351 | + config = ExtractionConfig( |
| 352 | + extract_root=tmp_path / "extract_root", |
| 353 | + entropy_depth=100, |
| 354 | + entropy_plot=True, |
| 355 | + handlers=(handlers.archive.zip.ZIPHandler,), |
| 356 | + ) |
| 357 | + |
| 358 | + # ** action |
| 359 | + |
| 360 | + process_result = process_file(config, input_file) |
| 361 | + |
| 362 | + task_result_by_name = {r.task.path.name: r for r in process_result.results} |
| 363 | + |
| 364 | + def get_all(file_name, report_type: Type[T]) -> List[T]: |
| 365 | + return [ |
| 366 | + r |
| 367 | + for r in task_result_by_name[file_name].reports |
| 368 | + if isinstance(r, report_type) |
| 369 | + ] |
| 370 | + |
| 371 | + # ** verification |
| 372 | + |
| 373 | + # the unknown chunk report for the second chunk of the input file should carry an entropy report
| 374 | + # whose percentages list (entropy bits scaled to 0-100) has 64 items, for blocks of 0, 6, 8, 8, ... bits of entropy
| 375 | + [unknown_chunk_report] = get_all("input-file", UnknownChunkReport) |
| 376 | + unknown_entropy = unknown_chunk_report.entropy |
| 377 | + assert unknown_entropy == EntropyReport( |
| 378 | + percentages=[0.0, 75.0] + [100.0] * 62, buffer_size=1024 |
| 379 | + ) |
| 380 | + assert ( |
| 381 | + unknown_entropy is not None |
| 382 | + ) # removes pyright complaints for the below 3 lines :( |
| 383 | + assert round(unknown_entropy.mean, 2) == 98.05 # noqa: PLR2004 |
| 384 | + assert unknown_entropy.highest == 100.0 # noqa: PLR2004 |
| 385 | + assert unknown_entropy.lowest == 0.0 # noqa: PLR2004 |
| 386 | + |
| 387 | + # we should have entropy calculated for files without extractions, except for empty files |
| 388 | + assert [] == get_all("empty.txt", EntropyReport) |
| 389 | + assert [EntropyReport(percentages=[100.0], buffer_size=1024)] == get_all( |
| 390 | + "0-255.bin", EntropyReport |
| 391 | + ) |
0 commit comments