
Commit f15032d

Merge pull request #569 from onekey-sec/entropy
Entropy fixes
2 parents fa1a913 + d86f9e4 commit f15032d

3 files changed: +46 −38 lines


tests/test_processing.py  (21 additions, 18 deletions)

@@ -11,7 +11,7 @@
 from unblob.models import UnknownChunk, ValidChunk
 from unblob.processing import (
     ExtractionConfig,
-    calculate_buffer_size,
+    calculate_block_size,
     calculate_entropy,
     calculate_unknown_chunks,
     format_entropy_plot,
@@ -129,41 +129,45 @@ def test_calculate_unknown_chunks(
         (1000, 100, 10, 100, 10),
     ],
 )
-def test_calculate_buffer_size(
+def test_calculate_block_size(
     file_size: int, chunk_count: int, min_limit: int, max_limit: int, expected: int
 ):
-    assert expected == calculate_buffer_size(
-        file_size, chunk_count=chunk_count, min_limit=min_limit, max_limit=max_limit
+    assert expected == calculate_block_size(
+        file_size,
+        chunk_count=chunk_count,
+        min_limit=min_limit,
+        max_limit=max_limit,
     )


 def test_format_entropy_plot_error():
     with pytest.raises(TypeError):
-        format_entropy_plot(percentages=[], buffer_size=1024)
+        format_entropy_plot(percentages=[], block_size=1024)


 @pytest.mark.parametrize(
-    "percentages, buffer_size",
+    "percentages, block_size",
     [
         pytest.param([0.0] * 100, 1024, id="zero-array"),
         pytest.param([99.99] * 100, 1024, id="99-array"),
         pytest.param([100.0] * 100, 1024, id="100-array"),
-        pytest.param([100.0] * 100, -1, id="buffer_size-can-be-anything1"),
-        pytest.param([100.0] * 100, None, id="buffer_size-can-be-anything2"),
-        pytest.param([100.0] * 100, "None", id="buffer_size-can-be-anything3"),
+        pytest.param([100.0] * 100, -1, id="block_size-can-be-anything1"),
+        pytest.param([100.0] * 100, None, id="block_size-can-be-anything2"),
+        pytest.param([100.0] * 100, "None", id="block_size-can-be-anything3"),
     ],
 )
-def test_format_entropy_plot_no_exception(percentages: List[float], buffer_size: int):
-    assert str(buffer_size) in format_entropy_plot(
-        percentages=percentages, buffer_size=buffer_size
+def test_format_entropy_plot_no_exception(percentages: List[float], block_size: int):
+    assert str(block_size) in format_entropy_plot(
+        percentages=percentages,
+        block_size=block_size,
     )


 def test_calculate_entropy_no_exception():
     report = calculate_entropy(Path(sys.executable))
     format_entropy_plot(
         percentages=report.percentages,
-        buffer_size=report.buffer_size,
+        block_size=report.block_size,
     )
@@ -374,18 +378,17 @@ def get_all(file_name, report_type: Type[T]) -> List[T]:
     # with a percentages (scaled up bits) of 64 items, for 0, 6, 8, 8, ... bits of entropies
     [unknown_chunk_report] = get_all("input-file", UnknownChunkReport)
     unknown_entropy = unknown_chunk_report.entropy
-    assert unknown_entropy == EntropyReport(
-        percentages=[0.0, 75.0] + [100.0] * 62, buffer_size=1024
-    )
     assert (
         unknown_entropy is not None
-    )  # removes pyright complaints for the below 3 lines :(
+    )  # removes pyright complaints for the below lines :(
+    assert unknown_entropy.percentages == [0.0, 75.0] + [100.0] * 62
+    assert unknown_entropy.block_size == 1024
     assert round(unknown_entropy.mean, 2) == 98.05  # noqa: PLR2004
     assert unknown_entropy.highest == 100.0  # noqa: PLR2004
     assert unknown_entropy.lowest == 0.0  # noqa: PLR2004

     # we should have entropy calculated for files without extractions, except for empty files
     assert [] == get_all("empty.txt", EntropyReport)
-    assert [EntropyReport(percentages=[100.0], buffer_size=1024)] == get_all(
+    assert [EntropyReport(percentages=[100.0], block_size=1024, mean=100.0)] == get_all(
         "0-255.bin", EntropyReport
     )
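
The parametrized rows above pin down the clamping behaviour of the renamed helper: the block size is the file size divided by the desired chunk count, clamped to [min_limit, max_limit]. A minimal usage sketch, assuming unblob is importable and exposes calculate_block_size exactly as shown in the diff:

    # Sketch only: re-checks two corner cases of the clamping logic.
    from unblob.processing import calculate_block_size

    # The "(1000, 100, 10, 100, 10)" row: 1000 // 100 = 10, already inside [10, 100].
    assert calculate_block_size(1000, chunk_count=100, min_limit=10, max_limit=100) == 10

    # A tiny file is clamped up to min_limit (1 // 80 = 0, raised to 1024).
    assert calculate_block_size(1, chunk_count=80, min_limit=1024, max_limit=1024 * 1024) == 1024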

unblob/processing.py  (23 additions, 14 deletions)

@@ -361,7 +361,7 @@ def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
             "Entropy chart",
             # New line so that chart title will be aligned correctly in the next line
             chart="\n"
-            + format_entropy_plot(report.percentages, report.buffer_size),
+            + format_entropy_plot(report.percentages, report.block_size),
             path=path,
             _verbosity=3,
         )
@@ -514,23 +514,32 @@ def calculate_entropy(path: Path) -> EntropyReport:

     # Smaller chunk size would be very slow to calculate.
     # 1Mb chunk size takes ~ 3sec for a 4,5 GB file.
-    buffer_size = calculate_buffer_size(
-        file_size, chunk_count=80, min_limit=1024, max_limit=1024 * 1024
+    block_size = calculate_block_size(
+        file_size,
+        chunk_count=80,
+        min_limit=1024,
+        max_limit=1024 * 1024,
     )

+    entropy_sum = 0.0
     with File.from_path(path) as file:
-        for chunk in iterate_file(file, 0, file_size, buffer_size=buffer_size):
+        for chunk in iterate_file(file, 0, file_size, buffer_size=block_size):
             entropy = shannon_entropy(chunk)
             entropy_percentage = round(entropy / 8 * 100, 2)
             percentages.append(entropy_percentage)
+            entropy_sum += entropy * len(chunk)

-    report = EntropyReport(percentages=percentages, buffer_size=buffer_size)
+    report = EntropyReport(
+        percentages=percentages,
+        block_size=block_size,
+        mean=entropy_sum / file_size / 8 * 100,
+    )

     logger.debug(
         "Entropy calculated",
         path=path,
         size=file_size,
-        buffer_size=report.buffer_size,
+        block_size=report.block_size,
         mean=round(report.mean, 2),
         highest=round(report.highest, 2),
         lowest=round(report.lowest, 2),
@@ -539,25 +548,25 @@ def calculate_entropy(path: Path) -> EntropyReport:
     return report


-def calculate_buffer_size(
+def calculate_block_size(
     file_size, *, chunk_count: int, min_limit: int, max_limit: int
 ) -> int:
     """Split the file into even sized chunks, limited by lower and upper values."""
     # We don't care about floating point precision here
-    buffer_size = file_size // chunk_count
-    buffer_size = max(min_limit, buffer_size)
-    buffer_size = min(buffer_size, max_limit)
-    return buffer_size
+    block_size = file_size // chunk_count
+    block_size = max(min_limit, block_size)
+    block_size = min(block_size, max_limit)
+    return block_size


-def format_entropy_plot(percentages: List[float], buffer_size: int):
+def format_entropy_plot(percentages: List[float], block_size: int):
     # start from scratch
     plt.clear_figure()
     # go colorless
     plt.clear_color()
     plt.title("Entropy distribution")
-    # plt.xlabel(humanize.naturalsize(buffer_size))
-    plt.xlabel(f"{buffer_size} bytes")
+    # plt.xlabel(humanize.naturalsize(block_size))
+    plt.xlabel(f"{block_size} bytes")
     plt.ylabel("entropy %")

     plt.scatter(percentages, marker="dot")
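
Besides the buffer_size to block_size rename, the behavioural change here is the mean: the loop now accumulates per-block entropy weighted by block length and divides by the total file size, instead of averaging the rounded per-block percentages afterwards. A small illustration of that arithmetic, using a stand-in shannon_entropy (bits per byte, 0 to 8) rather than unblob's own implementation:

    # Sketch only: shows why the weighted mean differs from a plain average of
    # the per-block percentages when the last block is shorter than the rest.
    import math
    from collections import Counter

    def shannon_entropy(data: bytes) -> float:
        counts = Counter(data)
        return -sum((c / len(data)) * math.log2(c / len(data)) for c in counts.values())

    blocks = [bytes(range(256)) * 4, b"\x00" * 100]  # 1024 max-entropy bytes + 100 zero bytes
    file_size = sum(len(b) for b in blocks)

    entropy_sum = sum(shannon_entropy(b) * len(b) for b in blocks)
    mean = entropy_sum / file_size / 8 * 100  # same formula as in calculate_entropy above
    print(round(mean, 2))  # ~91.1; a plain average of the two block percentages would give 50.0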

unblob/report.py  (2 additions, 6 deletions)

@@ -1,7 +1,6 @@
 import hashlib
 import os
 import stat
-import statistics
 import traceback
 from enum import Enum
 from pathlib import Path
@@ -176,11 +175,8 @@ class FileMagicReport(Report):
 @attr.define(kw_only=True)
 class EntropyReport(Report):
     percentages: List[float]
-    buffer_size: int
-
-    @property
-    def mean(self):
-        return statistics.mean(self.percentages)
+    block_size: int
+    mean: float

     @property
     def highest(self):
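
EntropyReport now carries the mean as a stored field set by calculate_entropy, instead of recomputing it from the rounded percentages with statistics.mean; highest and lowest stay derived properties. A small construction sketch mirroring the "0-255.bin" expectation in the tests, assuming unblob.report exposes the class as shown:

    # Sketch only: builds the updated, keyword-only report.
    from unblob.report import EntropyReport

    report = EntropyReport(percentages=[100.0], block_size=1024, mean=100.0)
    print(report.mean)     # stored at construction, no statistics.mean() over percentages
    print(report.highest)  # still a derived property, per the trailing context above
    print(report.lowest)   # likewise; both are read in unblob/processing.py's debug log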
