Skip to content

Commit 2971550

Browse files
committed
refactor (entropy) rename buffer_size to block_size
1 parent fa1a913 commit 2971550

File tree

3 files changed

+38
-30
lines changed

3 files changed

+38
-30
lines changed

tests/test_processing.py

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
from unblob.models import UnknownChunk, ValidChunk
1212
from unblob.processing import (
1313
ExtractionConfig,
14-
calculate_buffer_size,
14+
calculate_block_size,
1515
calculate_entropy,
1616
calculate_unknown_chunks,
1717
format_entropy_plot,
@@ -129,41 +129,45 @@ def test_calculate_unknown_chunks(
129129
(1000, 100, 10, 100, 10),
130130
],
131131
)
132-
def test_calculate_buffer_size(
132+
def test_calculate_block_size(
133133
file_size: int, chunk_count: int, min_limit: int, max_limit: int, expected: int
134134
):
135-
assert expected == calculate_buffer_size(
136-
file_size, chunk_count=chunk_count, min_limit=min_limit, max_limit=max_limit
135+
assert expected == calculate_block_size(
136+
file_size,
137+
chunk_count=chunk_count,
138+
min_limit=min_limit,
139+
max_limit=max_limit,
137140
)
138141

139142

140143
def test_format_entropy_plot_error():
141144
with pytest.raises(TypeError):
142-
format_entropy_plot(percentages=[], buffer_size=1024)
145+
format_entropy_plot(percentages=[], block_size=1024)
143146

144147

145148
@pytest.mark.parametrize(
146-
"percentages, buffer_size",
149+
"percentages, block_size",
147150
[
148151
pytest.param([0.0] * 100, 1024, id="zero-array"),
149152
pytest.param([99.99] * 100, 1024, id="99-array"),
150153
pytest.param([100.0] * 100, 1024, id="100-array"),
151-
pytest.param([100.0] * 100, -1, id="buffer_size-can-be-anything1"),
152-
pytest.param([100.0] * 100, None, id="buffer_size-can-be-anything2"),
153-
pytest.param([100.0] * 100, "None", id="buffer_size-can-be-anything3"),
154+
pytest.param([100.0] * 100, -1, id="block_size-can-be-anything1"),
155+
pytest.param([100.0] * 100, None, id="block_size-can-be-anything2"),
156+
pytest.param([100.0] * 100, "None", id="block_size-can-be-anything3"),
154157
],
155158
)
156-
def test_format_entropy_plot_no_exception(percentages: List[float], buffer_size: int):
157-
assert str(buffer_size) in format_entropy_plot(
158-
percentages=percentages, buffer_size=buffer_size
159+
def test_format_entropy_plot_no_exception(percentages: List[float], block_size: int):
160+
assert str(block_size) in format_entropy_plot(
161+
percentages=percentages,
162+
block_size=block_size,
159163
)
160164

161165

162166
def test_calculate_entropy_no_exception():
163167
report = calculate_entropy(Path(sys.executable))
164168
format_entropy_plot(
165169
percentages=report.percentages,
166-
buffer_size=report.buffer_size,
170+
block_size=report.block_size,
167171
)
168172

169173

@@ -375,7 +379,8 @@ def get_all(file_name, report_type: Type[T]) -> List[T]:
375379
[unknown_chunk_report] = get_all("input-file", UnknownChunkReport)
376380
unknown_entropy = unknown_chunk_report.entropy
377381
assert unknown_entropy == EntropyReport(
378-
percentages=[0.0, 75.0] + [100.0] * 62, buffer_size=1024
382+
percentages=[0.0, 75.0] + [100.0] * 62,
383+
block_size=1024,
379384
)
380385
assert (
381386
unknown_entropy is not None
@@ -386,6 +391,6 @@ def get_all(file_name, report_type: Type[T]) -> List[T]:
386391

387392
# we should have entropy calculated for files without extractions, except for empty files
388393
assert [] == get_all("empty.txt", EntropyReport)
389-
assert [EntropyReport(percentages=[100.0], buffer_size=1024)] == get_all(
394+
assert [EntropyReport(percentages=[100.0], block_size=1024)] == get_all(
390395
"0-255.bin", EntropyReport
391396
)

unblob/processing.py

Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -361,7 +361,7 @@ def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
361361
"Entropy chart",
362362
# New line so that chart title will be aligned correctly in the next line
363363
chart="\n"
364-
+ format_entropy_plot(report.percentages, report.buffer_size),
364+
+ format_entropy_plot(report.percentages, report.block_size),
365365
path=path,
366366
_verbosity=3,
367367
)
@@ -514,23 +514,26 @@ def calculate_entropy(path: Path) -> EntropyReport:
514514

515515
# Smaller chunk size would be very slow to calculate.
516516
# 1Mb chunk size takes ~ 3sec for a 4,5 GB file.
517-
buffer_size = calculate_buffer_size(
518-
file_size, chunk_count=80, min_limit=1024, max_limit=1024 * 1024
517+
block_size = calculate_block_size(
518+
file_size,
519+
chunk_count=80,
520+
min_limit=1024,
521+
max_limit=1024 * 1024,
519522
)
520523

521524
with File.from_path(path) as file:
522-
for chunk in iterate_file(file, 0, file_size, buffer_size=buffer_size):
525+
for chunk in iterate_file(file, 0, file_size, buffer_size=block_size):
523526
entropy = shannon_entropy(chunk)
524527
entropy_percentage = round(entropy / 8 * 100, 2)
525528
percentages.append(entropy_percentage)
526529

527-
report = EntropyReport(percentages=percentages, buffer_size=buffer_size)
530+
report = EntropyReport(percentages=percentages, block_size=block_size)
528531

529532
logger.debug(
530533
"Entropy calculated",
531534
path=path,
532535
size=file_size,
533-
buffer_size=report.buffer_size,
536+
block_size=report.block_size,
534537
mean=round(report.mean, 2),
535538
highest=round(report.highest, 2),
536539
lowest=round(report.lowest, 2),
@@ -539,25 +542,25 @@ def calculate_entropy(path: Path) -> EntropyReport:
539542
return report
540543

541544

542-
def calculate_buffer_size(
545+
def calculate_block_size(
543546
file_size, *, chunk_count: int, min_limit: int, max_limit: int
544547
) -> int:
545548
"""Split the file into even sized chunks, limited by lower and upper values."""
546549
# We don't care about floating point precision here
547-
buffer_size = file_size // chunk_count
548-
buffer_size = max(min_limit, buffer_size)
549-
buffer_size = min(buffer_size, max_limit)
550-
return buffer_size
550+
block_size = file_size // chunk_count
551+
block_size = max(min_limit, block_size)
552+
block_size = min(block_size, max_limit)
553+
return block_size
551554

552555

553-
def format_entropy_plot(percentages: List[float], buffer_size: int):
556+
def format_entropy_plot(percentages: List[float], block_size: int):
554557
# start from scratch
555558
plt.clear_figure()
556559
# go colorless
557560
plt.clear_color()
558561
plt.title("Entropy distribution")
559-
# plt.xlabel(humanize.naturalsize(buffer_size))
560-
plt.xlabel(f"{buffer_size} bytes")
562+
# plt.xlabel(humanize.naturalsize(block_size))
563+
plt.xlabel(f"{block_size} bytes")
561564
plt.ylabel("entropy %")
562565

563566
plt.scatter(percentages, marker="dot")

unblob/report.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,7 @@ class FileMagicReport(Report):
176176
@attr.define(kw_only=True)
177177
class EntropyReport(Report):
178178
percentages: List[float]
179-
buffer_size: int
179+
block_size: int
180180

181181
@property
182182
def mean(self):

0 commit comments

Comments
 (0)