Skip to content

Commit 8f120dd

Browse files
tests: add deduplication tests for buzhash64(e)
This will detect whether anything goes wrong with deduplication in the encrypted buzhash mode.
1 parent 8c5545c commit 8f120dd

File tree

1 file changed

+41
-0
lines changed

1 file changed

+41
-0
lines changed

src/borg/testsuite/chunkers/buzhash64_test.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from io import BytesIO
33
import os
44

5+
import pytest
6+
57
from . import cf
68
from ...chunkers import ChunkerBuzHash64
79
from ...constants import * # NOQA
@@ -67,3 +69,42 @@ def test_buzhash64_chunksize_distribution():
6769
# most chunks should be cut due to buzhash triggering, not due to clipping at min/max size:
6870
assert min_count < 10
6971
assert max_count < 10
72+
73+
74+
@pytest.mark.parametrize("do_encrypt", (False, True))
def test_buzhash64_dedup_shifted(do_encrypt):
    """Deduplication sanity check for buzhash64, plain and encrypted mode.

    Chunks two identical streams (must chunk exactly the same) and two
    streams that differ only by a small insertion at the front (the chunker
    must resynchronize so almost all chunks are shared).
    """
    min_exp, max_exp, mask = 10, 16, 14  # chunk size target 16kiB, clip at 1kiB and 64kiB
    chunker = ChunkerBuzHash64(b"0123456789ABCDEF", min_exp, max_exp, mask, 4095, do_encrypt=do_encrypt)
    rdata = os.urandom(4000000)

    def chunkit(payload):
        # Return (list of per-chunk sha256 digests, total bytes chunked) for payload.
        digests, total = [], 0
        with BytesIO(payload) as stream:
            for piece in chunker.chunkify(stream):
                digests.append(sha256(piece.data).digest())
                total += len(piece.data)
        return digests, total

    # 2 identical files: chunking must be exactly reproducible.
    chunks1, size1 = chunkit(rdata)
    chunks2, size2 = chunkit(rdata)
    assert size1 == len(rdata)
    assert size2 == len(rdata)
    assert chunks1 == chunks2

    # 2 almost identical files: a few bytes inserted at the front of one.
    shifted = b"inserted" + rdata
    chunks1, size1 = chunkit(rdata)
    chunks2, size2 = chunkit(shifted)
    assert size1 == len(rdata)
    assert size2 == len(shifted)
    # almost same chunking: many chunks overall ...
    assert len(chunks1) > 100
    assert len(chunks2) > 100
    # ... but only a few unique chunks per file — most chunks are duplicates.
    assert len(set(chunks1) - set(chunks2)) <= 2
    assert len(set(chunks2) - set(chunks1)) <= 2

0 commit comments

Comments
 (0)