|
2 | 2 | from io import BytesIO |
3 | 3 | import os |
4 | 4 |
|
| 5 | +import pytest |
| 6 | + |
5 | 7 | from . import cf |
6 | 8 | from ...chunkers import ChunkerBuzHash64 |
7 | 9 | from ...constants import * # NOQA |
@@ -67,3 +69,42 @@ def test_buzhash64_chunksize_distribution(): |
67 | 69 | # most chunks should be cut due to buzhash triggering, not due to clipping at min/max size: |
68 | 70 | assert min_count < 10 |
69 | 71 | assert max_count < 10 |
| 72 | + |
| 73 | + |
@pytest.mark.parametrize("do_encrypt", (False, True))
def test_buzhash64_dedup_shifted(do_encrypt):
    """Verify that ChunkerBuzHash64 output deduplicates well across similar inputs.

    Two scenarios are checked with the same chunker instance and 4000000
    bytes of random data:

    1. The same data chunked twice must yield the identical chunk list.
    2. The data and a copy with ``b"inserted"`` prepended must each yield
       more than 100 chunks, of which at most 2 are not shared with the other.

    In both scenarios the summed chunk lengths must equal the input length.
    """
    # chunk size target 16kiB, clip at 1kiB and 64kiB
    min_exp, max_exp, mask = 10, 16, 14
    chunker = ChunkerBuzHash64(b"0123456789ABCDEF", min_exp, max_exp, mask, 4095, do_encrypt=do_encrypt)
    rdata = os.urandom(4000000)

    def chunkit(data):
        """Chunk *data* and return (list of SHA-256 digests per chunk, total chunked bytes)."""
        digests = []
        total = 0
        with BytesIO(data) as stream:
            for piece in chunker.chunkify(stream):
                digests.append(sha256(piece.data).digest())
                total += len(piece.data)
        return digests, total

    # Scenario 1: two identical files.
    original, other = rdata, rdata
    digests_a, total_a = chunkit(original)
    digests_b, total_b = chunkit(other)
    # Nothing lost or added by chunking, and the chunking is exactly the same.
    assert total_a == len(original)
    assert total_b == len(other)
    assert digests_a == digests_b

    # Scenario 2: two almost identical files (a few bytes inserted at the front).
    original, other = rdata, b"inserted" + rdata
    digests_a, total_a = chunkit(original)
    digests_b, total_b = chunkit(other)
    assert total_a == len(original)
    assert total_b == len(other)
    # Almost the same chunking: there are many chunks overall ...
    assert len(digests_a) > 100
    assert len(digests_b) > 100
    # ... but only a few of them are unique to either file; the rest are duplicates.
    unique_a = set(digests_a)
    unique_b = set(digests_b)
    assert len(unique_a - unique_b) <= 2
    assert len(unique_b - unique_a) <= 2
0 commit comments