
Commit f15032d

Merge pull request #569 from onekey-sec/entropy
Entropy fixes
2 parents fa1a913 + d86f9e4 commit f15032d

3 files changed: +46 −38 lines


tests/test_processing.py  (21 additions, 18 deletions)

@@ -11,7 +11,7 @@
 from unblob.models import UnknownChunk, ValidChunk
 from unblob.processing import (
     ExtractionConfig,
-    calculate_buffer_size,
+    calculate_block_size,
     calculate_entropy,
     calculate_unknown_chunks,
     format_entropy_plot,
@@ -129,41 +129,45 @@ def test_calculate_unknown_chunks(
         (1000, 100, 10, 100, 10),
     ],
 )
-def test_calculate_buffer_size(
+def test_calculate_block_size(
     file_size: int, chunk_count: int, min_limit: int, max_limit: int, expected: int
 ):
-    assert expected == calculate_buffer_size(
-        file_size, chunk_count=chunk_count, min_limit=min_limit, max_limit=max_limit
+    assert expected == calculate_block_size(
+        file_size,
+        chunk_count=chunk_count,
+        min_limit=min_limit,
+        max_limit=max_limit,
     )


 def test_format_entropy_plot_error():
     with pytest.raises(TypeError):
-        format_entropy_plot(percentages=[], buffer_size=1024)
+        format_entropy_plot(percentages=[], block_size=1024)


 @pytest.mark.parametrize(
-    "percentages, buffer_size",
+    "percentages, block_size",
     [
         pytest.param([0.0] * 100, 1024, id="zero-array"),
         pytest.param([99.99] * 100, 1024, id="99-array"),
         pytest.param([100.0] * 100, 1024, id="100-array"),
-        pytest.param([100.0] * 100, -1, id="buffer_size-can-be-anything1"),
-        pytest.param([100.0] * 100, None, id="buffer_size-can-be-anything2"),
-        pytest.param([100.0] * 100, "None", id="buffer_size-can-be-anything3"),
+        pytest.param([100.0] * 100, -1, id="block_size-can-be-anything1"),
+        pytest.param([100.0] * 100, None, id="block_size-can-be-anything2"),
+        pytest.param([100.0] * 100, "None", id="block_size-can-be-anything3"),
     ],
 )
-def test_format_entropy_plot_no_exception(percentages: List[float], buffer_size: int):
-    assert str(buffer_size) in format_entropy_plot(
-        percentages=percentages, buffer_size=buffer_size
+def test_format_entropy_plot_no_exception(percentages: List[float], block_size: int):
+    assert str(block_size) in format_entropy_plot(
+        percentages=percentages,
+        block_size=block_size,
     )


 def test_calculate_entropy_no_exception():
     report = calculate_entropy(Path(sys.executable))
     format_entropy_plot(
         percentages=report.percentages,
-        buffer_size=report.buffer_size,
+        block_size=report.block_size,
     )
@@ -374,18 +378,17 @@ def get_all(file_name, report_type: Type[T]) -> List[T]:
     # with a percentages (scaled up bits) of 64 items, for 0, 6, 8, 8, ... bits of entropies
     [unknown_chunk_report] = get_all("input-file", UnknownChunkReport)
     unknown_entropy = unknown_chunk_report.entropy
-    assert unknown_entropy == EntropyReport(
-        percentages=[0.0, 75.0] + [100.0] * 62, buffer_size=1024
-    )
     assert (
         unknown_entropy is not None
-    )  # removes pyright complaints for the below 3 lines :(
+    )  # removes pyright complaints for the below lines :(
+    assert unknown_entropy.percentages == [0.0, 75.0] + [100.0] * 62
+    assert unknown_entropy.block_size == 1024
     assert round(unknown_entropy.mean, 2) == 98.05  # noqa: PLR2004
     assert unknown_entropy.highest == 100.0  # noqa: PLR2004
     assert unknown_entropy.lowest == 0.0  # noqa: PLR2004

     # we should have entropy calculated for files without extractions, except for empty files
     assert [] == get_all("empty.txt", EntropyReport)
-    assert [EntropyReport(percentages=[100.0], buffer_size=1024)] == get_all(
+    assert [EntropyReport(percentages=[100.0], block_size=1024, mean=100.0)] == get_all(
         "0-255.bin", EntropyReport
     )
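
The parametrized rows above pin down the clamping behaviour of the renamed helper: the block size is the file size divided by the desired chunk count, clamped to [min_limit, max_limit]. A minimal usage sketch, assuming unblob is importable and exposes calculate_block_size exactly as shown in the diff:

    # Sketch only: re-checks two corner cases of the clamping logic.
    from unblob.processing import calculate_block_size

    # The "(1000, 100, 10, 100, 10)" row: 1000 // 100 = 10, already inside [10, 100].
    assert calculate_block_size(1000, chunk_count=100, min_limit=10, max_limit=100) == 10

    # A tiny file is clamped up to min_limit (1 // 80 = 0, raised to 1024).
    assert calculate_block_size(1, chunk_count=80, min_limit=1024, max_limit=1024 * 1024) == 1024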

unblob/processing.py  (23 additions, 14 deletions)

@@ -361,7 +361,7 @@ def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
             "Entropy chart",
             # New line so that chart title will be aligned correctly in the next line
             chart="\n"
-            + format_entropy_plot(report.percentages, report.buffer_size),
+            + format_entropy_plot(report.percentages, report.block_size),
             path=path,
             _verbosity=3,
         )
@@ -514,23 +514,32 @@ def calculate_entropy(path: Path) -> EntropyReport:

     # Smaller chunk size would be very slow to calculate.
     # 1Mb chunk size takes ~ 3sec for a 4,5 GB file.
-    buffer_size = calculate_buffer_size(
-        file_size, chunk_count=80, min_limit=1024, max_limit=1024 * 1024
+    block_size = calculate_block_size(
+        file_size,
+        chunk_count=80,
+        min_limit=1024,
+        max_limit=1024 * 1024,
     )

+    entropy_sum = 0.0
     with File.from_path(path) as file:
-        for chunk in iterate_file(file, 0, file_size, buffer_size=buffer_size):
+        for chunk in iterate_file(file, 0, file_size, buffer_size=block_size):
             entropy = shannon_entropy(chunk)
             entropy_percentage = round(entropy / 8 * 100, 2)
             percentages.append(entropy_percentage)
+            entropy_sum += entropy * len(chunk)

-    report = EntropyReport(percentages=percentages, buffer_size=buffer_size)
+    report = EntropyReport(
+        percentages=percentages,
+        block_size=block_size,
+        mean=entropy_sum / file_size / 8 * 100,
+    )

     logger.debug(
         "Entropy calculated",
         path=path,
         size=file_size,
-        buffer_size=report.buffer_size,
+        block_size=report.block_size,
         mean=round(report.mean, 2),
         highest=round(report.highest, 2),
         lowest=round(report.lowest, 2),
@@ -539,25 +548,25 @@ def calculate_entropy(path: Path) -> EntropyReport:
     return report


-def calculate_buffer_size(
+def calculate_block_size(
     file_size, *, chunk_count: int, min_limit: int, max_limit: int
 ) -> int:
     """Split the file into even sized chunks, limited by lower and upper values."""
     # We don't care about floating point precision here
-    buffer_size = file_size // chunk_count
-    buffer_size = max(min_limit, buffer_size)
-    buffer_size = min(buffer_size, max_limit)
-    return buffer_size
+    block_size = file_size // chunk_count
+    block_size = max(min_limit, block_size)
+    block_size = min(block_size, max_limit)
+    return block_size


-def format_entropy_plot(percentages: List[float], buffer_size: int):
+def format_entropy_plot(percentages: List[float], block_size: int):
     # start from scratch
     plt.clear_figure()
     # go colorless
     plt.clear_color()
     plt.title("Entropy distribution")
-    # plt.xlabel(humanize.naturalsize(buffer_size))
-    plt.xlabel(f"{buffer_size} bytes")
+    # plt.xlabel(humanize.naturalsize(block_size))
+    plt.xlabel(f"{block_size} bytes")
     plt.ylabel("entropy %")

     plt.scatter(percentages, marker="dot")
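
Besides the buffer_size to block_size rename, the behavioural change here is the mean: the loop now accumulates per-block entropy weighted by block length and divides by the total file size, instead of averaging the rounded per-block percentages afterwards. A small illustration of that arithmetic, using a stand-in shannon_entropy (bits per byte, 0 to 8) rather than unblob's own implementation:

    # Sketch only: shows why the weighted mean differs from a plain average of
    # the per-block percentages when the last block is shorter than the rest.
    import math
    from collections import Counter

    def shannon_entropy(data: bytes) -> float:
        counts = Counter(data)
        return -sum((c / len(data)) * math.log2(c / len(data)) for c in counts.values())

    blocks = [bytes(range(256)) * 4, b"\x00" * 100]  # 1024 max-entropy bytes + 100 zero bytes
    file_size = sum(len(b) for b in blocks)

    entropy_sum = sum(shannon_entropy(b) * len(b) for b in blocks)
    mean = entropy_sum / file_size / 8 * 100  # same formula as in calculate_entropy above
    print(round(mean, 2))  # ~91.1; a plain average of the two block percentages would give 50.0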

unblob/report.py  (2 additions, 6 deletions)

@@ -1,7 +1,6 @@
 import hashlib
 import os
 import stat
-import statistics
 import traceback
 from enum import Enum
 from pathlib import Path
@@ -176,11 +175,8 @@ class FileMagicReport(Report):
 @attr.define(kw_only=True)
 class EntropyReport(Report):
     percentages: List[float]
-    buffer_size: int
-
-    @property
-    def mean(self):
-        return statistics.mean(self.percentages)
+    block_size: int
+    mean: float

     @property
     def highest(self):
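
EntropyReport now carries the mean as a stored field set by calculate_entropy, instead of recomputing it from the rounded percentages with statistics.mean; highest and lowest stay derived properties. A small construction sketch mirroring the "0-255.bin" expectation in the tests, assuming unblob.report exposes the class as shown:

    # Sketch only: builds the updated, keyword-only report.
    from unblob.report import EntropyReport

    report = EntropyReport(percentages=[100.0], block_size=1024, mean=100.0)
    print(report.mean)     # stored at construction, no statistics.mean() over percentages
    print(report.highest)  # still a derived property, per the trailing context above
    print(report.lowest)   # likewise; both are read in unblob/processing.py's debug log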
