Merge pull request #558 from onekey-sec/entropy

e3krisztian · web-flow · commit 373085b2b23f · 2023-04-19T10:11:28.000+02:00
Report calculated entropies
diff --git a/tests/test_processing.py b/tests/test_processing.py
@@ -2,22 +2,30 @@
 import sys
 import zipfile
 from pathlib import Path
-from typing import Collection, List, Tuple
+from typing import Collection, List, Tuple, Type, TypeVar
 
 import attr
 import pytest
 
+from unblob import handlers
 from unblob.models import UnknownChunk, ValidChunk
 from unblob.processing import (
     ExtractionConfig,
     calculate_buffer_size,
     calculate_entropy,
     calculate_unknown_chunks,
-    draw_entropy_plot,
+    format_entropy_plot,
     process_file,
     remove_inner_chunks,
 )
-from unblob.report import ExtractDirectoryExistsReport, StatReport
+from unblob.report import (
+    EntropyReport,
+    ExtractDirectoryExistsReport,
+    StatReport,
+    UnknownChunkReport,
+)
+
+T = TypeVar("T")
 
 
 def assert_same_chunks(expected, actual, explanation=None):
@@ -129,32 +137,34 @@ def test_calculate_buffer_size(
     )
 
 
-def test_draw_entropy_plot_error():
+def test_format_entropy_plot_error():
     with pytest.raises(TypeError):
-        draw_entropy_plot([])
+        format_entropy_plot(percentages=[], buffer_size=1024)
 
 
 @pytest.mark.parametrize(
-    "percentages",
+    "percentages, buffer_size",
     [
-        pytest.param([0.0] * 100, id="zero-array"),
-        pytest.param([99.99] * 100, id="99-array"),
-        pytest.param([100.0] * 100, id="100-array"),
+        pytest.param([0.0] * 100, 1024, id="zero-array"),
+        pytest.param([99.99] * 100, 1024, id="99-array"),
+        pytest.param([100.0] * 100, 1024, id="100-array"),
+        pytest.param([100.0] * 100, -1, id="buffer_size-can-be-anything1"),
+        pytest.param([100.0] * 100, None, id="buffer_size-can-be-anything2"),
+        pytest.param([100.0] * 100, "None", id="buffer_size-can-be-anything3"),
     ],
 )
-def test_draw_entropy_plot_no_exception(percentages: List[float]):
-    assert draw_entropy_plot(percentages) is None
+def test_format_entropy_plot_no_exception(percentages: List[float], buffer_size: int):
+    assert str(buffer_size) in format_entropy_plot(
+        percentages=percentages, buffer_size=buffer_size
+    )
 
 
-@pytest.mark.parametrize(
-    "path, draw_plot",
-    [
-        pytest.param(Path(sys.executable), True, id="draw-plot"),
-        pytest.param(Path(sys.executable), False, id="no-plot"),
-    ],
-)
-def test_calculate_entropy_no_exception(path: Path, draw_plot: bool):
-    assert calculate_entropy(path, draw_plot=draw_plot) is None
+def test_calculate_entropy_no_exception():
+    report = calculate_entropy(Path(sys.executable))
+    format_entropy_plot(
+        percentages=report.percentages,
+        buffer_size=report.buffer_size,
+    )
 
 
 @pytest.mark.parametrize(
@@ -311,3 +321,71 @@ def test_processing_with_non_posix_paths(tmp_path: Path):
             is_link=False,
             link_target=None,
         )
+
+
+def test_entropy_calculation(tmp_path: Path):
+    """Process a zip chunk followed by an unknown chunk with entropy calculation enabled.
+
+    The input file structure is
+    - zip-chunk
+        - empty.txt
+        - 0-255.bin
+    - unknown_chunk
+    """
+    #
+    # ** input
+
+    input_file = tmp_path / "input-file"
+    with zipfile.ZipFile(input_file, "w") as zf:
+        zf.writestr("empty.txt", data=b"")
+        zf.writestr("0-255.bin", data=bytes(range(256)))
+
+    # entropy is calculated in 1 KiB blocks for files smaller than 80 KiB
+    # so let's have 1 block with 0 entropy, 1 with 6-bit entropy, the rest with 8-bit entropy
+    unknown_chunk_content = (
+        bytes(1024) + bytes(range(64)) * 4 * 4 + bytes(range(256)) * 4 * 62
+    )
+    with input_file.open("ab") as f:
+        f.write(unknown_chunk_content)
+
+    config = ExtractionConfig(
+        extract_root=tmp_path / "extract_root",
+        entropy_depth=100,
+        entropy_plot=True,
+        handlers=(handlers.archive.zip.ZIPHandler,),
+    )
+
+    # ** action
+
+    process_result = process_file(config, input_file)
+
+    task_result_by_name = {r.task.path.name: r for r in process_result.results}
+
+    def get_all(file_name, report_type: Type[T]) -> List[T]:
+        return [
+            r
+            for r in task_result_by_name[file_name].reports
+            if isinstance(r, report_type)
+        ]
+
+    # ** verification
+
+    # the unknown chunk report for the second chunk of the input file should have an entropy report
+    # with 64 percentages (bits of entropy scaled to 0-100), for 0, 6, 8, 8, ... bits of entropy
+    [unknown_chunk_report] = get_all("input-file", UnknownChunkReport)
+    unknown_entropy = unknown_chunk_report.entropy
+    assert unknown_entropy == EntropyReport(
+        percentages=[0.0, 75.0] + [100.0] * 62, buffer_size=1024
+    )
+    assert (
+        unknown_entropy is not None
+    )  # removes pyright complaints for the below 3 lines :(
+    assert round(unknown_entropy.mean, 2) == 98.05  # noqa: PLR2004
+    assert unknown_entropy.highest == 100.0  # noqa: PLR2004
+    assert unknown_entropy.lowest == 0.0  # noqa: PLR2004
+
+    # we should have entropy calculated for files without extractions, except for empty files
+    assert [] == get_all("empty.txt", EntropyReport)
+    assert [EntropyReport(percentages=[100.0], buffer_size=1024)] == get_all(
+        "0-255.bin", EntropyReport
+    )
diff --git a/tests/test_report.py b/tests/test_report.py
@@ -312,12 +312,26 @@ def hello_kitty_task_results(
                     sha1="febca6ed75dc02e0def065e7b08f1cca87b57c74",
                     sha256="144d8b2c949cb4943128aa0081153bcba4f38eb0ba26119cc06ca1563c4999e1",
                 ),
-                UnknownChunkReport(chunk_id=ANY, start_offset=0, end_offset=6, size=6),
                 UnknownChunkReport(
-                    chunk_id=ANY, start_offset=131, end_offset=138, size=7
+                    chunk_id=ANY,
+                    start_offset=0,
+                    end_offset=6,
+                    size=6,
+                    entropy=None,
+                ),
+                UnknownChunkReport(
+                    chunk_id=ANY,
+                    start_offset=131,
+                    end_offset=138,
+                    size=7,
+                    entropy=None,
                 ),
                 UnknownChunkReport(
-                    chunk_id=ANY, start_offset=263, end_offset=264, size=1
+                    chunk_id=ANY,
+                    start_offset=263,
+                    end_offset=264,
+                    size=1,
+                    entropy=None,
                 ),
                 ChunkReport(
                     chunk_id=hello_id,
diff --git a/unblob/math.py b/unblob/math.py
@@ -1,3 +1,7 @@
+from typing import Callable
+
+shannon_entropy: Callable[[bytes], float]
+
 try:
     from ._rust import shannon_entropy  # type: ignore
 except ImportError:
diff --git a/unblob/models.py b/unblob/models.py
@@ -11,7 +11,7 @@
 from .file_utils import Endian, File, InvalidInputFormat, StructParser
 from .identifiers import new_id
 from .parser import hexstring2regex
-from .report import ChunkReport, ErrorReport, Report, UnknownChunkReport
+from .report import ChunkReport, EntropyReport, ErrorReport, Report, UnknownChunkReport
 
 logger = get_logger()
 
@@ -123,12 +123,13 @@ class UnknownChunk(Chunk):
     like most common bytes (like \x00 and \xFF), ASCII strings, high entropy, etc.
     """
 
-    def as_report(self) -> UnknownChunkReport:
+    def as_report(self, entropy: Optional[EntropyReport]) -> UnknownChunkReport:
         return UnknownChunkReport(
             chunk_id=self.chunk_id,
             start_offset=self.start_offset,
             end_offset=self.end_offset,
             size=self.size,
+            entropy=entropy,
         )
 
 
diff --git a/unblob/processing.py b/unblob/processing.py
@@ -1,6 +1,5 @@
 import multiprocessing
 import shutil
-import statistics
 from operator import attrgetter
 from pathlib import Path
 from typing import Iterable, List, Optional, Sequence
@@ -30,6 +29,7 @@
 )
 from .pool import make_pool
 from .report import (
+    EntropyReport,
     ExtractDirectoryExistsReport,
     FileMagicReport,
     HashReport,
@@ -332,7 +332,9 @@ def process(self):
             else:
                 # we don't consider whole files as unknown chunks, but we still want to
                 # calculate entropy for whole files which produced no valid chunks
-                self._calculate_entropy(self.task.path)
+                entropy = self._calculate_entropy(self.task.path)
+                if entropy:
+                    self.result.add_report(entropy)
 
     def _process_chunks(
         self,
@@ -345,15 +347,26 @@ def _process_chunks(
 
         for chunk in unknown_chunks:
             carved_unknown_path = carve_unknown_chunk(self.carve_dir, file, chunk)
-            self._calculate_entropy(carved_unknown_path)
-            self.result.add_report(chunk.as_report())
+            entropy = self._calculate_entropy(carved_unknown_path)
+            self.result.add_report(chunk.as_report(entropy=entropy))
 
         for chunk in outer_chunks:
             self._extract_chunk(file, chunk)
 
-    def _calculate_entropy(self, path: Path):
+    def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
         if self.task.depth < self.config.entropy_depth:
-            calculate_entropy(path, draw_plot=self.config.entropy_plot)
+            report = calculate_entropy(path)
+            if self.config.entropy_plot:
+                logger.debug(
+                    "Entropy chart",
+                    # New line so that chart title will be aligned correctly in the next line
+                    chart="\n"
+                    + format_entropy_plot(report.percentages, report.buffer_size),
+                    path=path,
+                    _verbosity=3,
+                )
+            return report
+        return None
 
     def _extract_chunk(self, file, chunk: ValidChunk):
         skip_carving = chunk.is_whole_file
@@ -484,8 +497,8 @@ def calculate_unknown_chunks(
     return unknown_chunks
 
 
-def calculate_entropy(path: Path, *, draw_plot: bool):
-    """Calculate and log shannon entropy divided by 8 for the file in 1mB chunks.
+def calculate_entropy(path: Path) -> EntropyReport:
+    """Calculate, log and return Shannon entropy divided by 8 for the file in chunks.
 
     Shannon entropy returns the amount of information (in bits) of some numeric
     sequence. We calculate the average entropy of byte chunks, which in theory
@@ -499,7 +512,7 @@ def calculate_entropy(path: Path, *, draw_plot: bool):
     file_size = path.stat().st_size
     logger.debug("Calculating entropy for file", path=path, size=file_size)
 
-    # Smaller chuk size would be very slow to calculate.
+    # Smaller chunk size would be very slow to calculate.
     # 1Mb chunk size takes ~ 3sec for a 4,5 GB file.
     buffer_size = calculate_buffer_size(
         file_size, chunk_count=80, min_limit=1024, max_limit=1024 * 1024
@@ -511,15 +524,19 @@ def calculate_entropy(path: Path, *, draw_plot: bool):
             entropy_percentage = round(entropy / 8 * 100, 2)
             percentages.append(entropy_percentage)
 
+    report = EntropyReport(percentages=percentages, buffer_size=buffer_size)
+
     logger.debug(
         "Entropy calculated",
-        mean=round(statistics.mean(percentages), 2),
-        highest=max(percentages),
-        lowest=min(percentages),
+        path=path,
+        size=file_size,
+        buffer_size=report.buffer_size,
+        mean=round(report.mean, 2),
+        highest=round(report.highest, 2),
+        lowest=round(report.lowest, 2),
     )
 
-    if draw_plot:
-        draw_entropy_plot(percentages)
+    return report
 
 
 def calculate_buffer_size(
@@ -533,11 +550,14 @@ def calculate_buffer_size(
     return buffer_size
 
 
-def draw_entropy_plot(percentages: List[float]):
-    plt.clear_data()
+def format_entropy_plot(percentages: List[float], buffer_size: int):
+    # start from scratch
+    plt.clear_figure()
+    # go colorless
     plt.clear_color()
     plt.title("Entropy distribution")
-    plt.xlabel("mB")
+    # TODO: consider a human-readable label via humanize.naturalsize(buffer_size)
+    plt.xlabel(f"{buffer_size} bytes")
     plt.ylabel("entropy %")
 
     plt.scatter(percentages, marker="dot")
@@ -549,5 +569,4 @@ def draw_entropy_plot(percentages: List[float]):
     # Always show 0% and 100%
     plt.yticks(range(0, 101, 10))
 
-    # New line so that chart title will be aligned correctly in the next line
-    logger.debug("Entropy chart", chart="\n" + plt.build(), _verbosity=3)
+    return plt.build()
diff --git a/unblob/report.py b/unblob/report.py
@@ -1,6 +1,7 @@
 import hashlib
 import os
 import stat
+import statistics
 import traceback
 from enum import Enum
 from pathlib import Path
@@ -172,6 +173,24 @@ class FileMagicReport(Report):
     mime_type: str
 
 
+@attr.define(kw_only=True)
+class EntropyReport(Report):
+    percentages: List[float]
+    buffer_size: int
+
+    @property
+    def mean(self):
+        return statistics.mean(self.percentages)
+
+    @property
+    def highest(self):
+        return max(self.percentages)
+
+    @property
+    def lowest(self):
+        return min(self.percentages)
+
+
 @final
 @attr.define(kw_only=True)
 class ChunkReport(Report):
@@ -191,3 +210,4 @@ class UnknownChunkReport(Report):
     start_offset: int
     end_offset: int
     size: int
+    entropy: Optional[EntropyReport]