refactor(entropy): rename buffer_size to block_size

e3krisztian · e3krisztian · commit 2971550d3ee1 · 2023-05-02T15:00:52.000+02:00
diff --git a/tests/test_processing.py b/tests/test_processing.py
@@ -11,7 +11,7 @@
 from unblob.models import UnknownChunk, ValidChunk
 from unblob.processing import (
     ExtractionConfig,
-    calculate_buffer_size,
+    calculate_block_size,
     calculate_entropy,
     calculate_unknown_chunks,
     format_entropy_plot,
@@ -129,41 +129,45 @@ def test_calculate_unknown_chunks(
         (1000, 100, 10, 100, 10),
     ],
 )
-def test_calculate_buffer_size(
+def test_calculate_block_size(
     file_size: int, chunk_count: int, min_limit: int, max_limit: int, expected: int
 ):
-    assert expected == calculate_buffer_size(
-        file_size, chunk_count=chunk_count, min_limit=min_limit, max_limit=max_limit
+    assert expected == calculate_block_size(
+        file_size,
+        chunk_count=chunk_count,
+        min_limit=min_limit,
+        max_limit=max_limit,
     )
 
 
 def test_format_entropy_plot_error():
     with pytest.raises(TypeError):
-        format_entropy_plot(percentages=[], buffer_size=1024)
+        format_entropy_plot(percentages=[], block_size=1024)
 
 
 @pytest.mark.parametrize(
-    "percentages, buffer_size",
+    "percentages, block_size",
     [
         pytest.param([0.0] * 100, 1024, id="zero-array"),
         pytest.param([99.99] * 100, 1024, id="99-array"),
         pytest.param([100.0] * 100, 1024, id="100-array"),
-        pytest.param([100.0] * 100, -1, id="buffer_size-can-be-anything1"),
-        pytest.param([100.0] * 100, None, id="buffer_size-can-be-anything2"),
-        pytest.param([100.0] * 100, "None", id="buffer_size-can-be-anything3"),
+        pytest.param([100.0] * 100, -1, id="block_size-can-be-anything1"),
+        pytest.param([100.0] * 100, None, id="block_size-can-be-anything2"),
+        pytest.param([100.0] * 100, "None", id="block_size-can-be-anything3"),
     ],
 )
-def test_format_entropy_plot_no_exception(percentages: List[float], buffer_size: int):
-    assert str(buffer_size) in format_entropy_plot(
-        percentages=percentages, buffer_size=buffer_size
+def test_format_entropy_plot_no_exception(percentages: List[float], block_size: int):
+    assert str(block_size) in format_entropy_plot(
+        percentages=percentages,
+        block_size=block_size,
     )
 
 
 def test_calculate_entropy_no_exception():
     report = calculate_entropy(Path(sys.executable))
     format_entropy_plot(
         percentages=report.percentages,
-        buffer_size=report.buffer_size,
+        block_size=report.block_size,
     )
 
 
@@ -375,7 +379,8 @@ def get_all(file_name, report_type: Type[T]) -> List[T]:
     [unknown_chunk_report] = get_all("input-file", UnknownChunkReport)
     unknown_entropy = unknown_chunk_report.entropy
     assert unknown_entropy == EntropyReport(
-        percentages=[0.0, 75.0] + [100.0] * 62, buffer_size=1024
+        percentages=[0.0, 75.0] + [100.0] * 62,
+        block_size=1024,
     )
     assert (
         unknown_entropy is not None
@@ -386,6 +391,6 @@ def get_all(file_name, report_type: Type[T]) -> List[T]:
 
     # we should have entropy calculated for files without extractions, except for empty files
     assert [] == get_all("empty.txt", EntropyReport)
-    assert [EntropyReport(percentages=[100.0], buffer_size=1024)] == get_all(
+    assert [EntropyReport(percentages=[100.0], block_size=1024)] == get_all(
         "0-255.bin", EntropyReport
     )
diff --git a/unblob/processing.py b/unblob/processing.py
@@ -361,7 +361,7 @@ def _calculate_entropy(self, path: Path) -> Optional[EntropyReport]:
                     "Entropy chart",
                     # New line so that chart title will be aligned correctly in the next line
                     chart="\n"
-                    + format_entropy_plot(report.percentages, report.buffer_size),
+                    + format_entropy_plot(report.percentages, report.block_size),
                     path=path,
                     _verbosity=3,
                 )
@@ -514,23 +514,26 @@ def calculate_entropy(path: Path) -> EntropyReport:
 
     # Smaller chunk size would be very slow to calculate.
     # 1Mb chunk size takes ~ 3sec for a 4,5 GB file.
-    buffer_size = calculate_buffer_size(
-        file_size, chunk_count=80, min_limit=1024, max_limit=1024 * 1024
+    block_size = calculate_block_size(
+        file_size,
+        chunk_count=80,
+        min_limit=1024,
+        max_limit=1024 * 1024,
     )
 
     with File.from_path(path) as file:
-        for chunk in iterate_file(file, 0, file_size, buffer_size=buffer_size):
+        for chunk in iterate_file(file, 0, file_size, buffer_size=block_size):
             entropy = shannon_entropy(chunk)
             entropy_percentage = round(entropy / 8 * 100, 2)
             percentages.append(entropy_percentage)
 
-    report = EntropyReport(percentages=percentages, buffer_size=buffer_size)
+    report = EntropyReport(percentages=percentages, block_size=block_size)
 
     logger.debug(
         "Entropy calculated",
         path=path,
         size=file_size,
-        buffer_size=report.buffer_size,
+        block_size=report.block_size,
         mean=round(report.mean, 2),
         highest=round(report.highest, 2),
         lowest=round(report.lowest, 2),
@@ -539,25 +542,25 @@ def calculate_entropy(path: Path) -> EntropyReport:
     return report
 
 
-def calculate_buffer_size(
+def calculate_block_size(
     file_size, *, chunk_count: int, min_limit: int, max_limit: int
 ) -> int:
     """Split the file into even sized chunks, limited by lower and upper values."""
     # We don't care about floating point precision here
-    buffer_size = file_size // chunk_count
-    buffer_size = max(min_limit, buffer_size)
-    buffer_size = min(buffer_size, max_limit)
-    return buffer_size
+    block_size = file_size // chunk_count
+    block_size = max(min_limit, block_size)
+    block_size = min(block_size, max_limit)
+    return block_size
 
 
-def format_entropy_plot(percentages: List[float], buffer_size: int):
+def format_entropy_plot(percentages: List[float], block_size: int):
     # start from scratch
     plt.clear_figure()
     # go colorless
     plt.clear_color()
     plt.title("Entropy distribution")
-    # plt.xlabel(humanize.naturalsize(buffer_size))
-    plt.xlabel(f"{buffer_size} bytes")
+    # plt.xlabel(humanize.naturalsize(block_size))
+    plt.xlabel(f"{block_size} bytes")
     plt.ylabel("entropy %")
 
     plt.scatter(percentages, marker="dot")
diff --git a/unblob/report.py b/unblob/report.py
@@ -176,7 +176,7 @@ class FileMagicReport(Report):
 @attr.define(kw_only=True)
 class EntropyReport(Report):
     percentages: List[float]
-    buffer_size: int
+    block_size: int
 
     @property
     def mean(self):