Applied various improvements

dhondta · dhondta · commit 775c871c843b · 2025-08-24T14:35:50.000+02:00
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-<p align="center"><img src="https://github.com/packing-box/python-exeplot/raw/main/docs/pages/img/logo.png"></p>
+<p align="center" id="top"><img src="https://github.com/packing-box/python-exeplot/raw/main/docs/pages/img/logo.png"></p>
 <h1 align="center">ExePlot <a href="https://twitter.com/intent/tweet?text=ExePlot%20-%20Plot%20executable%20samples%20easy.%0D%0ALibrary%20for%20plotting%20executable%20samples%20supporting%20multiple%20formats.%0D%0Ahttps%3a%2f%2fgithub%2ecom%2fpacking-box%2fpython-exeplot%0D%0A&hashtags=python,programming,executable-samples,plot"><img src="https://img.shields.io/badge/Tweet--lightgrey?logo=twitter&style=social" alt="Tweet" height="20"/></a></h1>
 <h3 align="center">Search for samples from various malware databases.</h3>
 
@@ -27,4 +27,4 @@ TODO
 
 [![Forkers repo roster for @packing-box/python-exeplot](https://reporoster.com/forks/dark/packing-box/python-exeplot)](https://github.com/packing-box/python-exeplot/network/members)
 
-<p align="center"><a href="#"><img src="https://img.shields.io/badge/Back%20to%20top--lightgrey?style=social" alt="Back to top" height="20"/></a></p>
+<p align="center"><a href="#top"><img src="https://img.shields.io/badge/Back%20to%20top--lightgrey?style=social" alt="Back to top" height="20"/></a></p>
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,7 +21,7 @@ authors = [
 ]
 description = "Library for plotting executable samples supporting multiple formats"
 license = {file = "LICENSE"}
-keywords = ["python", "development", "programming", "executable-samples", "plot"]
+keywords = ["python", "development", "programming", "executable-samples", "plot", "entropy", "cfg"]
 requires-python = ">=3.9,<4"
 classifiers = [
   "Development Status :: 5 - Production/Stable",
@@ -37,15 +37,23 @@ dependencies = [
 ]
 dynamic = ["version"]
 
+[project.optional-dependencies]
+graph = [
+  "angr>=9.2",
+  "networkx>=3.4.2",
+  "numpy<2",  # required until angr gets compatible with numpy>=2
+  "pygraphviz>=1.14",
+]
+
 [project.readme]
 file = "README.md"
 content-type = "text/markdown"
 
 [project.urls]
 documentation = "https://python-exeplot.readthedocs.io/en/latest/?badge=latest"
-homepage = "https://github.com/dhondta/python-exeplot"
-issues = "https://github.com/dhondta/python-exeplot/issues"
-repository = "https://github.com/dhondta/python-exeplot"
+homepage = "https://github.com/packing-box/python-exeplot"
+issues = "https://github.com/packing-box/python-exeplot/issues"
+repository = "https://github.com/packing-box/python-exeplot"
 
 [project.scripts]
 exeplot = "exeplot.__main__:main"
diff --git a/src/exeplot/VERSION.txt b/src/exeplot/VERSION.txt
@@ -1 +1 @@
-0.2.1
+0.3.1
diff --git a/src/exeplot/__conf__.py b/src/exeplot/__conf__.py
@@ -1,6 +1,7 @@
 # -*- coding: UTF-8 -*-
 import logging
 import matplotlib.pyplot as plt
+import numpy
 from functools import wraps
 
 
@@ -18,6 +19,8 @@
     'transparent':    False,
 }
 
+numpy.int = numpy.int_  # dirty fix to "AttributeError: module 'numpy' has no attribute 'int'."
+
 
 def configure():  # pragma: no cover
     from configparser import ConfigParser
diff --git a/src/exeplot/__info__.py b/src/exeplot/__info__.py
@@ -3,12 +3,17 @@
 
 """
 import os
+from datetime import datetime
+
+__y = str(datetime.now().year)
+__s = "2025"
 
 __author__    = "Alexandre D'Hondt"
-__copyright__ = "© 2025 A. D'Hondt"
+__copyright__ = "© {} A. D'Hondt".format(__y if __y == __s else __s + "-" + __y)  # single year or "start-current"
 __email__     = "alexandre.dhondt@gmail.com"
 __license__   = "GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html)"
 __source__    = "https://github.com/packing-box/python-exeplot"
 
 with open(os.path.join(os.path.dirname(__file__), "VERSION.txt")) as f:
     __version__ = f.read().strip()
+
diff --git a/src/exeplot/plots/__common__.py b/src/exeplot/plots/__common__.py
@@ -3,6 +3,8 @@
 from functools import cached_property
 from statistics import mean
 
+from ..utils import *
+
 
 CACHE_DIR = os.path.expanduser("~/.exeplot")
 # https://matplotlib.org/2.0.2/examples/color/named_colors.html
@@ -48,36 +50,17 @@
 SHADOW = {'shade': .3, 'ox': .005, 'oy': -.005, 'linewidth': 0.}
 SUBLABELS = {
     'ep':          lambda d: "EP at 0x%.8x in %s" % d['ep'][1:],
-    'size':        lambda d: "Size = %s" % _human_readable_size(d['size'], 1),
+    'size':        lambda d: "Size = %s" % human_readable_size(d['size'], 1),
     'size-ep':     lambda d: "Size = %s\nEP at 0x%.8x in %s" % \
-                             (_human_readable_size(d['size'], 1), d['ep'][1], d['ep'][2]),
+                             (human_readable_size(d['size'], 1), d['ep'][1], d['ep'][2]),
     'size-ent':    lambda d: "Size = %s\nAverage entropy: %.2f\nOverall entropy: %.2f" % \
-                             (_human_readable_size(d['size'], 1), mean(d['entropy']) * 8, d['entropy*']),
+                             (human_readable_size(d['size'], 1), mean(d['entropy']) * 8, d['entropy*']),
     'size-ep-ent': lambda d: "Size = %s\nEP at 0x%.8x in %s\nAverage entropy: %.2f\nOverall entropy: %.2f" % \
-                             (_human_readable_size(d['size'], 1), d['ep'][1], d['ep'][2], mean(d['entropy']) * 8,
+                             (human_readable_size(d['size'], 1), d['ep'][1], d['ep'][2], mean(d['entropy']) * 8,
                               d['entropy*']),
 }
 
 
-def _ensure_str(s, encoding='utf-8', errors='strict'):
-    if isinstance(s, bytes):
-        try:
-            return s.decode(encoding, errors)
-        except:
-            return s.decode("latin-1")
-    elif not isinstance(s, (str, bytes)):
-        raise TypeError("not expecting type '%s'" % type(s))
-    return s
-
-
-def _human_readable_size(size, precision=0):
-    i, units = 0, ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
-    while size >= 1024 and i < len(units)-1:
-        i += 1
-        size /= 1024.0
-    return "%.*f%s" % (precision, size, units[i])
-
-
 class Binary:
     def __init__(self, path, **kwargs):
         from lief import logging, parse
@@ -132,7 +115,7 @@ def __sections_data(self):
             h_len = b.header.header_size + b.header.program_header_size * b.header.numberof_segments
         elif self.type == "MachO":
             h_len = [28, 32][str(b.header.magic)[-3:] == "_64"] + b.header.sizeof_cmds
-        yield 0, f"[0] Header ({_human_readable_size(h_len)})", 0, h_len, "black"
+        yield 0, f"[0] Header ({human_readable_size(h_len)})", 0, h_len, "black"
         # then handle binary's sections
         color_cursor, i = 0, 1
         for section in sorted(b.sections, key=lambda s: s.offset):
@@ -145,30 +128,30 @@ def __sections_data(self):
                 c = co[color_cursor % len(co)]
                 color_cursor += 1
             start, end = section.offset, section.offset + section.size
-            yield i, f"[{i}] {self.section_names[section.name]} ({_human_readable_size(end - start)})", start, end, c
+            yield i, f"[{i}] {self.section_names[section.name]} ({human_readable_size(end - start)})", start, end, c
             i += 1
         # sections header at the end for ELF files
         if self.type == "ELF":
             start, end = end, end + b.header.section_header_size * b.header.numberof_sections
-            yield i, f"[{i}] Section Header ({_human_readable_size(end - start)})", start, end, "black"
+            yield i, f"[{i}] Section Header ({human_readable_size(end - start)})", start, end, "black"
             i += 1
         # finally, handle the overlay
         start, end = self.size - b.overlay.nbytes, self.size
-        yield i, f"[{i}] Overlay ({_human_readable_size(end - start)})", start, self.size, "lightgray"
+        yield i, f"[{i}] Overlay ({human_readable_size(end - start)})", start, self.size, "lightgray"
         i += 1
-        yield i, f"TOTAL: {_human_readable_size(self.size)}", None, None, "white"
+        yield i, f"TOTAL: {human_readable_size(self.size)}", None, None, "white"
     
     def __segments_data(self):
         b = self.__binary
         if self.type == "PE":
             return  # segments only apply to ELF and MachO
         elif self.type == "ELF":
             for i, s in enumerate(sorted(b.segments, key=lambda x: (x.file_offset, x.physical_size))):
-                yield i, f"[{i}] {str(s.type).split('.')[1]} ({_human_readable_size(s.physical_size)})", \
+                yield i, f"[{i}] {str(s.type).split('.')[1]} ({human_readable_size(s.physical_size)})", \
                       s.file_offset, s.file_offset+s.physical_size, "lightgray"
         elif self.type == "MachO":
             for i, s in enumerate(sorted(b.segments, key=lambda x: (x.file_offset, x.file_size))):
-                yield i, f"[{i}] {s.name} ({_human_readable_size(s.file_size)})", \
+                yield i, f"[{i}] {s.name} ({human_readable_size(s.file_size)})", \
                       s.file_offset, s.file_offset+s.file_size, "lightgray"
     
     def _data(self, segments=False, overlap=False):
@@ -255,7 +238,7 @@ def rawbytes(self):
     
     @cached_property        
     def section_names(self):
-        names = {s.name: _ensure_str(s.name).strip("\x00") or "<empty>" for s in self.__binary.sections}
+        names = {s.name: ensure_str(s.name).strip("\x00") or "<empty>" for s in self.__binary.sections}
         # names from string table only applies to PE
         if self.type != "PE":
             return names
diff --git a/src/exeplot/plots/__init__.py b/src/exeplot/plots/__init__.py
@@ -6,12 +6,12 @@
 __all__ = []
 
 
-for f in os.listdir(os.path.dirname(os.path.abspath(__file__))):
+for f in sorted(os.listdir(os.path.dirname(os.path.abspath(__file__)))):
     if not f.endswith(".py") or f.startswith("_"):
         continue
     name = f[:-3]
     module = importlib.import_module(f".{name}", package=__name__)
-    if hasattr(module, "plot") and callable(getattr(module, "plot")):
+    if getattr(module, "_IMP", True) and hasattr(module, "plot") and callable(getattr(module, "plot")):
         globals()[f"{name}"] = f = getattr(module, "plot")
         f.__args__ = getattr(module, "arguments")
         f.__name__ = name
diff --git a/src/exeplot/plots/byte.py b/src/exeplot/plots/byte.py
@@ -1,6 +1,7 @@
 # -*- coding: UTF-8 -*-
-from .__common__ import _human_readable_size, Binary, COLORS
+from .__common__ import Binary, COLORS
 from ..__conf__ import save_figure
+from ..utils import human_readable_size
 
 
 def arguments(parser):
diff --git a/src/exeplot/plots/entropy.py b/src/exeplot/plots/entropy.py
@@ -1,8 +1,7 @@
 # -*- coding: UTF-8 -*-
-from math import log2
-
 from .__common__ import mean, Binary, COLORS, MIN_ZONE_WIDTH, N_SAMPLES, SUBLABELS
 from ..__conf__ import save_figure
+from ..utils import shannon_entropy
 
 
 def arguments(parser):
@@ -26,12 +25,11 @@ def data(executable, n_samples=N_SAMPLES, window_size=lambda s: 2*s, **kwargs):
     :param n_samples:   number of samples of entropy required
     :param window_size: window size for computing the entropy
     """
-    _entropy = lambda b: -sum([p*log2(p) for p in [float(ctr)/len(b) for ctr in [b.count(c) for c in set(b)]]]) or 0.
     binary = Binary(executable)
     data = {'hash': binary.hash, 'name': binary.basename, 'size': binary.size, 'type': binary.type,
             'entropy': [], 'sections': []}
     # compute window-based entropy
-    data['entropy*'] = _entropy(binary.rawbytes)
+    data['entropy*'] = shannon_entropy(binary.rawbytes)
     step, cs = abs(binary.size // n_samples), binary.size / n_samples  # chunk size
     if isinstance(window_size, type(lambda: 0)):
         window_size = window_size(step)
@@ -47,7 +45,7 @@ def data(executable, n_samples=N_SAMPLES, window_size=lambda s: 2*s, **kwargs):
             window += f.read(new_pos - cur_pos if i > 0 else winter)
             window = window[max(0, len(window)-window_size) if cur_pos + winter < binary.size else step:]
             # compute entropy
-            data['entropy'].append(_entropy(window)/8.)
+            data['entropy'].append(shannon_entropy(window)/8.)
     # compute other characteristics using the Binary instance parsed with LIEF
     # convert to 3-tuple (EP offset on plot, EP file offset, section name containing EP)
     ep, ep_sec = binary.entrypoint, binary.entrypoint_section
diff --git a/src/exeplot/plots/nested_pie.py b/src/exeplot/plots/nested_pie.py
@@ -1,6 +1,7 @@
 # -*- coding: UTF-8 -*-
-from .__common__ import _human_readable_size, Binary, COLORS, SHADOW
+from .__common__ import Binary, COLORS, SHADOW
 from ..__conf__ import save_figure
+from ..utils import human_readable_size
 
 
 def arguments(parser):
diff --git a/src/exeplot/plots/pie.py b/src/exeplot/plots/pie.py
@@ -1,6 +1,7 @@
 # -*- coding: UTF-8 -*-
-from .__common__ import _human_readable_size, Binary, COLORS, SHADOW
+from .__common__ import Binary, COLORS, SHADOW
 from ..__conf__ import save_figure
+from ..utils import human_readable_size
 
 
 def arguments(parser):
diff --git a/src/exeplot/utils.py b/src/exeplot/utils.py
@@ -0,0 +1,63 @@
+# -*- coding: UTF-8 -*-
+from math import log2
+
+
+__all__ = ["ensure_str", "human_readable_size", "ngrams_counts", "ngrams_distribution", "shannon_entropy"]
+
+shannon_entropy = lambda b: -sum(p * log2(p) for p in (b.count(c) / len(b) for c in set(b))) or 0.
+
+
+def ensure_str(s, encoding='utf-8', errors='strict'):
+    """ Return s as a str ; bytes get decoded with the given codec, falling back to latin-1 on failure. """
+    if isinstance(s, str):
+        return s
+    if not isinstance(s, bytes):
+        raise TypeError("not expecting type '%s'" % type(s))
+    try:
+        return s.decode(encoding, errors)
+    except (UnicodeError, LookupError):  # undecodable data or unknown codec ; latin-1 accepts any byte
+        return s.decode("latin-1")
+
+
+def human_readable_size(size, precision=0):
+    """ Format a size in bytes using the largest fitting 1024-based unit and the given number of decimals. """
+    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
+    rank = 0
+    while rank < len(units) - 1 and size >= 1024:
+        size, rank = size / 1024.0, rank + 1
+    return "%.*f%s" % (precision, size, units[rank])
+
+
+def ngrams_counts(byte_obj, n=1):
+    """ Count the n-grams of a str/bytes sequence or of an object exposing "bytes" and "size" attributes.
+         For such an object, the Counter is stored on it per value of n and reused by subsequent calls.
+    
+    :param n: size of the n-grams to be counted, defaults to 1
+    """
+    from collections import Counter
+    if isinstance(byte_obj, (str, bytes)):
+        return Counter(byte_obj[k:k+n] for k in range(len(byte_obj) - n + 1))
+    if not (hasattr(byte_obj, "bytes") and hasattr(byte_obj, "size")):
+        raise TypeError("Bad input type ; should be a byte sequence or object")
+    if not hasattr(byte_obj, "_ngram_counts_cache"):
+        byte_obj._ngram_counts_cache = {}
+    if n not in byte_obj._ngram_counts_cache:
+        byte_obj._ngram_counts_cache[n] = Counter(byte_obj.bytes[k:k+n] for k in range(byte_obj.size - n + 1))
+    return byte_obj._ngram_counts_cache[n]
+
+
+def ngrams_distribution(byte_obj, n=1, n_most_common=None, n_exclude_top=0, exclude=None):
+    """ Compute the n-grams distribution of an input byte sequence or byte object given exclusions.
+    
+    :param n:             n determining the size of n-grams, defaults to 1
+    :param n_most_common: number of n-grams to be kept in the result (0 gives an empty list), keep all by default
+    :param n_exclude_top: number of n-grams to be excluded from the top of the histogram, no exclusion by default
+    :param exclude:       list of specific n-grams to be excluded, no exclusion by default
+    :return:              list of at most n_most_common (n-gram, count) pairs, sorted by decreasing count
+    """
+    c, exclude = ngrams_counts(byte_obj, n), exclude or []
+    # only None means "keep all" ; testing truthiness would wrongly turn an explicit 0 into "keep all" as well
+    keep = len(c) if n_most_common is None else n_most_common
+    r = [pair for pair in c.most_common(keep + n_exclude_top + len(exclude)) if pair[0] not in exclude]
+    return r[n_exclude_top:n_exclude_top+keep]
+
diff --git a/tests/test_others.py b/tests/test_others.py
@@ -2,16 +2,33 @@
 # -*- coding: UTF-8 -*-
 import matplotlib.pyplot as plt
 import os
-from exeplot.plots.__common__ import _ensure_str, Binary
+from collections import Counter
+from exeplot.plots.__common__ import Binary
+from exeplot.utils import *
 from unittest import TestCase
 
 
 class TestOthers(TestCase):
     def test_miscellaneous(self):
-        self.assertRaises(TypeError, _ensure_str, 1)
+        self.assertRaises(TypeError, ensure_str, 1)
         for i in range(256):
-            self.assertIsNotNone(_ensure_str(bytes([i])))
+            self.assertIsNotNone(ensure_str(bytes([i])))
         self.assertRaises(TypeError, Binary, "BAD")
         binary = Binary(os.path.join(os.path.dirname(__file__), "hello.exe"))
         self.assertIsNotNone(str(binary))
 
+
+class TestUtils(TestCase):
+    def test_ngrams_functions(self):
+        # random payload surrounded by padding, so that both b"\x00" and b"\xff" are guaranteed to occur
+        self.assertIsInstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), Counter)
+        t = type("Test", (), {'bytes': seq, 'size': len(seq)})()  # minimal object with "bytes" and "size"
+        histogram = ngrams_distribution(t, exclude=(b"\x00", b"\xff"))
+        self.assertIsInstance(histogram, list)
+        self.assertNotIn(b"\x00", [ngram for ngram, _ in histogram])
+        self.assertNotIn(b"\xff", [ngram for ngram, _ in histogram])
+        # 300 exceeds the 256 possible 1-grams, hence nothing is cut off and the padding bytes remain
+        histogram2 = ngrams_distribution(t, n_most_common=300)
+        self.assertIn(b"\x00", [ngram for ngram, _ in histogram2])
+        self.assertIn(b"\xff", [ngram for ngram, _ in histogram2])
+
diff --git a/tests/test_plots.py b/tests/test_plots.py
@@ -56,12 +56,18 @@ def test_entropy_plot_function(self):
         for path in iter_files():
             print(f"plotting entropy of {path} (sublabel='size-ep-ent',scale=True,target='test.exe')...")
             entropy(path, sublabel="size-ep-ent", scale=True, target="test")
-        print(f"plotting entropy of {path}.exe and {path}.elf (labels=['PE', lambda x:'ELF'],sublabel='size-ep-ent',scale=True)...")
+        print(f"plotting entropy of {path}.exe and {path}.elf (labels=['PE', lambda x:'ELF'],sublabel='size-ep-ent',"
+              "scale=True)...")
         path = os.path.join(os.path.dirname(__file__), "hello")
-        entropy(f"{path}.exe", f"{path}.elf", labels=["PE", lambda x: "ELF"], sublabel="size-ep-ent", scale=True)
+        for img in entropy(f"{path}.exe", f"{path}.elf", labels=["PE", lambda x: "ELF"], sublabel="size-ep-ent",
+                           scale=True):
+            os.remove(img)
+            plt.clf()
     
     def test_pie_plot_function(self):
         for path in iter_files():
             print(f"plotting pie of {path} (donut=True)...")
-            pie(path, donut=True)
+            for img in pie(path, donut=True):
+                os.remove(img)
+                plt.clf()