Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Release notes
=============


Version 32.1.0 - (2024-12-06)
-----------------------------

- Compute file checksums by streaming the file content in chunks to avoid running out of memory


Version 32.0.0 - (2024-09-05)
-----------------------------

Expand Down
225 changes: 168 additions & 57 deletions src/commoncode/hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import binascii
import hashlib
import os
import sys
from functools import partial

Expand All @@ -26,41 +27,82 @@
"""


def _hash_mod(bitsize, hmodule):
class Hashable:
"""
Return a hashing class returning hashes with a `bitsize` bit length. The
interface of this class is similar to the hash module API.
A mixin for hashers that provides the base methods.
"""

class hasher(object):
def __init__(self, msg=None):
self.digest_size = bitsize // 8
self.h = msg and hmodule(msg).digest()[: self.digest_size] or None
# digest_size = length of binary digest for this hash
# binh = binary hasher module
# msg_len = length in bytes of the messages hashed
# total_length = total length in bytes of the messages hashed

def digest(self):
"""
Return a bytes string digest for this hash.
"""
if not self.msg_len:
return
return self.binh.digest()[: self.digest_size]

def digest(self):
return bytes(self.h)
def hexdigest(self):
"""
Return a string hex digest for this hash.
"""
return self.msg_len and binascii.hexlify(self.digest()).decode("utf-8")

def hexdigest(self):
return self.h and binascii.hexlify(self.h).decode("utf-8")
def b64digest(self):
"""
Return a string base64 digest for this hash.
"""
return self.msg_len and urlsafe_b64encode(self.digest()).decode("utf-8")

def intdigest(self):
"""
Return a int digest for this hash.
"""
return self.msg_len and int(bin_to_num(self.digest()))


def _hash_mod(bitsize, hmodule):
"""
Return a hasher class that returns hashes with a ``bitsize`` bit length. The interface of this
class is similar to the hash module API.
"""

def b64digest(self):
return self.h and urlsafe_b64encode(self.h).decode("utf-8")
class hasher(Hashable):
"""A hasher class that behaves like a hashlib module."""

def intdigest(self):
return self.h and int(bin_to_num(self.h))
def __init__(self, msg=None, **kwargs):
"""
Return a hasher, populated with an initial ``msg`` bytes string.
Close on the bitsize and hmodule
"""
self.digest_size = bitsize // 8
self.binh = hmodule()
self.msg_len = 0
if msg:
self.update(msg)

def update(self, msg=None):
"""
Update this hash with a ``msg`` bytes string.
"""
if msg:
self.binh.update(msg)
self.msg_len += len(msg)

return hasher


# For FIPS-enabled Python builds, hashlib.md5 may refuse to run unless we
# declare that it is not used for security purposes. The "usedforsecurity"
# keyword argument only exists on Python 3.9+.
sys_v0 = sys.version_info[0]
sys_v1 = sys.version_info[1]
# Tuple comparison is the robust check: the previous "major == 3 and
# minor >= 9" test would silently skip the flag on any future major version.
if sys.version_info >= (3, 9):
    md5_hasher = partial(hashlib.md5, usedforsecurity=False)
else:
    md5_hasher = hashlib.md5


# Base hashers for each bit size
_hashmodules_by_bitsize = {
# md5-based
Expand All @@ -82,31 +124,62 @@ def get_hasher(bitsize):
return _hashmodules_by_bitsize[bitsize]


class sha1_git_hasher(object):
class sha1_git_hasher(Hashable):
"""
Hash content using the git blob SHA1 convention.
See https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_object_storage
"""

def __init__(self, msg=None):
def __init__(self, msg=None, total_length=0, **kwargs):
"""
Initialize a sha1_git_hasher with an optional ``msg`` byte string. The ``total_length`` of
all content that will be hashed, combining the ``msg`` length plus any later call to
update() with additional messages.
"""
self.digest_size = 160 // 8
self.h = msg and self._compute(msg) or None
self.msg_len = 0

def _compute(self, msg):
# note: bytes interpolation is new in Python 3.5
git_blob_msg = b"blob %d\0%s" % (len(msg), msg)
return hashlib.sha1(git_blob_msg).digest()
if msg:
self.msg_len = msg_len = len(msg)

def digest(self):
return bytes(self.h)
if not total_length:
total_length = msg_len
else:
if total_length < msg_len:
raise ValueError(
f"Initial msg length: {msg_len} "
f"cannot be larger than the the total_length: {self.total_length}"
)

def hexdigest(self):
return self.h and binascii.hexlify(self.h).decode("utf-8")
if not total_length:
raise ValueError("total_length cannot be zero")

def b64digest(self):
return self.h and urlsafe_b64encode(self.h).decode("utf-8")
self.total_length = total_length
self.binh = get_hasher(bitsize=160)(total_length=total_length)

def intdigest(self):
return self.h and int(bin_to_num(self.h))
self._hash_header()
if msg:
self.update(msg)

def _hash_header(self):
# note: bytes interpolation is new in Python 3.5
git_blob_header = b"blob %d\0" % (self.total_length)
self.binh.update(msg=git_blob_header)

def update(self, msg=None):
"""
Update this hash with a ``msg`` bytes string.
"""
if msg:
msg_len = len(msg)
if (msg_len + self.msg_len) > self.total_length:
raise ValueError(
f"Actual combined msg lengths: initial: {self.msg_len} plus added: {msg_len} "
f"cannot be larger than the the total_length: {self.total_length}"
)

self.binh.update(msg)
self.msg_len += msg_len


_hashmodules_by_name = {
Expand All @@ -119,25 +192,47 @@ def intdigest(self):
}


def get_hasher_instance_by_name(name, total_length=0):
    """
    Return a hasher instance for a checksum algorithm ``name`` with a planned
    ``total_length`` of bytes to hash.

    Raise a ValueError if ``name`` is not a known checksum algorithm name.
    """
    try:
        # Keep the try body to the lookup only: a KeyError raised while
        # constructing the hasher below must not be misreported as an
        # unknown algorithm name.
        hm = _hashmodules_by_name[name]
    except KeyError:
        # Suppress the KeyError context: the ValueError message is the
        # whole story for callers.
        raise ValueError(f"Unknown checksum algorithm: {name!r}") from None
    return hm(total_length=total_length)


def get_file_size(location):
    """
    Return the size in bytes of the file at ``location``.
    """
    return os.stat(location).st_size


def checksum(location, name, base64=False):
"""
Return a checksum of `bitsize` length from the content of the file at
`location`. The checksum is a hexdigest or base64-encoded is `base64` is
True.
Return a checksum from the content of the file at ``location`` using the ``name`` checksum
algorithm. The checksum is a string as a hexdigest or is base64-encoded is ``base64`` is True.
"""
if not filetype.is_file(location):
return
hasher = _hashmodules_by_name[name]

# fixme: we should read in chunks?
with open(location, "rb") as f:
hashable = f.read()
total_length = get_file_size(location)
chunks = binary_chunks(location)
return checksum_from_chunks(chunks=chunks, total_length=total_length, name=name, base64=base64)

hashed = hasher(hashable)
if base64:
return hashed.b64digest()

return hashed.hexdigest()
def checksum_from_chunks(chunks, name, total_length=0, base64=False):
    """
    Return a checksum string computed by feeding each byte string from the
    ``chunks`` iterator to a ``name`` algorithm hasher created for a planned
    ``total_length`` of bytes. The result is a hexdigest string, or a
    base64-encoded string if ``base64`` is True.
    """
    hshr = get_hasher_instance_by_name(name=name, total_length=total_length)
    for data in chunks:
        hshr.update(data)
    return hshr.b64digest() if base64 else hshr.hexdigest()


def md5(location):
Expand All @@ -164,21 +259,37 @@ def sha1_git(location):
return checksum(location, name="sha1_git", base64=False)


def multi_checksums(location, checksum_names=("md5", "sha1", "sha256", "sha512", "sha1_git")):
def binary_chunks(location, size=2**24):
"""
Return a mapping of hexdigest checksums keyed by checksum name from the content
of the file at `location`. Use the `checksum_names` list of checksum names.
The mapping is guaranted to contains all the requested names as keys.
If the location is not a file, the values are None.
Read file at ``location`` as binary and yield bytes of up to ``size`` length in bytes,
defaulting to 2**24 bytes, e.g., about 16 MB.
"""
results = dict([(name, None) for name in checksum_names])
if not filetype.is_file(location):
return results

# fixme: we should read in chunks?
with open(location, "rb") as f:
hashable = f.read()
while True:
chunk = f.read(size)
if not chunk:
break
yield chunk

for name in checksum_names:
results[name] = _hashmodules_by_name[name](hashable).hexdigest()
return results

def multi_checksums(location, checksum_names=("md5", "sha1", "sha256", "sha512", "sha1_git")):
    """
    Return a mapping of hexdigest checksum strings keyed by checksum algorithm
    name from hashing the content of the file at ``location``. Use the
    ``checksum_names`` list of checksum names. The mapping is guaranteed to
    contain all the requested names as keys. If the location is not a file, or
    if the file is empty, the values are None.

    The file content is read once in chunks and fed to every hasher, so all
    requested checksums are computed in a single pass over the file regardless
    of its size.
    """
    if not filetype.is_file(location):
        return {name: None for name in checksum_names}
    file_size = get_file_size(location)
    if not file_size:
        # An empty file has no content to hash; in particular sha1_git_hasher
        # raises ValueError on a zero total_length. Return None values as
        # documented (matching the previous implementation's behavior).
        return {name: None for name in checksum_names}
    # One hasher per requested algorithm, each told the planned total length
    # up front (sha1_git needs it to hash the git blob header first).
    hashers = {
        name: get_hasher_instance_by_name(name=name, total_length=file_size)
        for name in checksum_names
    }

    for chunk in binary_chunks(location):
        for hasher in hashers.values():
            hasher.update(msg=chunk)

    return {name: hasher.hexdigest() for name, hasher in hashers.items()}
20 changes: 20 additions & 0 deletions tests/test_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

import hashlib
import os

from commoncode.hash import b64sha1
from commoncode.hash import checksum
from commoncode.hash import checksum_from_chunks
from commoncode.hash import get_hasher
from commoncode.hash import md5
from commoncode.hash import multi_checksums
Expand Down Expand Up @@ -174,3 +176,21 @@ def test_sha1_git_checksum(self):
test_file = self.get_test_loc(test_file)
# test that we match the git hash-object
assert sha1_git(test_file) == expected_sha1_git

def test_checksum_from_chunks_can_stream_gigabytes(self):
    # Stream 200 chunks of 16 MB (3.2 GB total) through a generator so the
    # content is never materialized in memory at once.
    chunk = b"0" * 16000000
    streamed = (chunk for _ in range(200))
    result = checksum_from_chunks(
        chunks=streamed, total_length=16000000 * 200, name="sha1_git"
    )
    assert result == "494caf26c43c4473f6e930b0f5c2ecf8121bcf24"

def test_checksum_from_chunks_from_stream_is_same_as_plain(self):
    # The streamed checksum must match hashlib's digest of the same bytes.
    piece = b"0" * 16000
    streamed = (piece for _ in range(100))
    from_chunks = checksum_from_chunks(chunks=streamed, name="sha256")

    reference = hashlib.sha256()
    for _ in range(100):
        reference.update(piece)
    assert from_chunks == reference.hexdigest()