Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Release notes
=============


Version 32.1.0 - (2024-12-06)
-----------------------------

- Compute file checksums by streaming the file content in chunks to avoid running out of memory


Version 32.0.0 - (2024-09-05)
-----------------------------

Expand Down
225 changes: 168 additions & 57 deletions src/commoncode/hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import binascii
import hashlib
import os
import sys
from functools import partial

Expand All @@ -26,41 +27,82 @@
"""


def _hash_mod(bitsize, hmodule):
class Hashable:
"""
Return a hashing class returning hashes with a `bitsize` bit length. The
interface of this class is similar to the hash module API.
A mixin for hashers that provides the base methods.
"""

class hasher(object):
def __init__(self, msg=None):
self.digest_size = bitsize // 8
self.h = msg and hmodule(msg).digest()[: self.digest_size] or None
# digest_size = length of binary digest for this hash
# binh = binary hasher module
# msg_len = length in bytes of the messages hashed
# total_length = total length in bytes of the messages hashed

def digest(self):
"""
Return a bytes string digest for this hash.
"""
if not self.msg_len:
return
return self.binh.digest()[: self.digest_size]

def digest(self):
return bytes(self.h)
def hexdigest(self):
"""
Return a string hex digest for this hash.
"""
return self.msg_len and binascii.hexlify(self.digest()).decode("utf-8")

def hexdigest(self):
return self.h and binascii.hexlify(self.h).decode("utf-8")
def b64digest(self):
"""
Return a string base64 digest for this hash.
"""
return self.msg_len and urlsafe_b64encode(self.digest()).decode("utf-8")

def intdigest(self):
"""
Return a int digest for this hash.
"""
return self.msg_len and int(bin_to_num(self.digest()))


def _hash_mod(bitsize, hmodule):
"""
Return a hasher class that returns hashes with a ``bitsize`` bit length. The interface of this
class is similar to the hash module API.
"""

def b64digest(self):
return self.h and urlsafe_b64encode(self.h).decode("utf-8")
class hasher(Hashable):
"""A hasher class that behaves like a hashlib module."""

def intdigest(self):
return self.h and int(bin_to_num(self.h))
def __init__(self, msg=None, **kwargs):
"""
Return a hasher, populated with an initial ``msg`` bytes string.
Close on the bitsize and hmodule
"""
self.digest_size = bitsize // 8
self.binh = hmodule()
self.msg_len = 0
if msg:
self.update(msg)

def update(self, msg=None):
"""
Update this hash with a ``msg`` bytes string.
"""
if msg:
self.binh.update(msg)
self.msg_len += len(msg)

return hasher


# For FIPS-enabled Python builds, hashlib.md5 may refuse to run unless we
# declare that it is not used for security purposes. The "usedforsecurity"
# keyword argument only exists on Python 3.9+.
sys_v0 = sys.version_info[0]
sys_v1 = sys.version_info[1]
# Tuple comparison is the robust check: the previous "major == 3 and
# minor >= 9" test would silently skip the flag on any future major version.
if sys.version_info >= (3, 9):
    md5_hasher = partial(hashlib.md5, usedforsecurity=False)
else:
    md5_hasher = hashlib.md5


# Base hashers for each bit size
_hashmodules_by_bitsize = {
# md5-based
Expand All @@ -82,31 +124,62 @@ def get_hasher(bitsize):
return _hashmodules_by_bitsize[bitsize]


class sha1_git_hasher(object):
class sha1_git_hasher(Hashable):
"""
Hash content using the git blob SHA1 convention.
See https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_object_storage
"""

def __init__(self, msg=None):
def __init__(self, msg=None, total_length=0, **kwargs):
"""
Initialize a sha1_git_hasher with an optional ``msg`` byte string. The ``total_length`` of
all content that will be hashed, combining the ``msg`` length plus any later call to
update() with additional messages.
"""
self.digest_size = 160 // 8
self.h = msg and self._compute(msg) or None
self.msg_len = 0

def _compute(self, msg):
# note: bytes interpolation is new in Python 3.5
git_blob_msg = b"blob %d\0%s" % (len(msg), msg)
return hashlib.sha1(git_blob_msg).digest()
if msg:
self.msg_len = msg_len = len(msg)

def digest(self):
return bytes(self.h)
if not total_length:
total_length = msg_len
else:
if total_length < msg_len:
raise ValueError(
f"Initial msg length: {msg_len} "
f"cannot be larger than the the total_length: {self.total_length}"
)

def hexdigest(self):
return self.h and binascii.hexlify(self.h).decode("utf-8")
if not total_length:
raise ValueError("total_length cannot be zero")

def b64digest(self):
return self.h and urlsafe_b64encode(self.h).decode("utf-8")
self.total_length = total_length
self.binh = get_hasher(bitsize=160)(total_length=total_length)

def intdigest(self):
return self.h and int(bin_to_num(self.h))
self._hash_header()
if msg:
self.update(msg)

def _hash_header(self):
# note: bytes interpolation is new in Python 3.5
git_blob_header = b"blob %d\0" % (self.total_length)
self.binh.update(msg=git_blob_header)

def update(self, msg=None):
"""
Update this hash with a ``msg`` bytes string.
"""
if msg:
msg_len = len(msg)
if (msg_len + self.msg_len) > self.total_length:
raise ValueError(
f"Actual combined msg lengths: initial: {self.msg_len} plus added: {msg_len} "
f"cannot be larger than the the total_length: {self.total_length}"
)

self.binh.update(msg)
self.msg_len += msg_len


_hashmodules_by_name = {
Expand All @@ -119,25 +192,47 @@ def intdigest(self):
}


def get_hasher_instance_by_name(name, total_length=0):
    """
    Return a hasher instance for a checksum algorithm ``name`` with a planned
    ``total_length`` of bytes to hash.

    Raise a ValueError if ``name`` is not a known checksum algorithm name.
    """
    try:
        # Keep the try body to the lookup only: a KeyError raised while
        # constructing the hasher below must not be misreported as an
        # unknown algorithm name.
        hm = _hashmodules_by_name[name]
    except KeyError:
        # Suppress the KeyError context: the ValueError message is the
        # whole story for callers.
        raise ValueError(f"Unknown checksum algorithm: {name!r}") from None
    return hm(total_length=total_length)


def get_file_size(location):
    """
    Return the size in bytes of the file at ``location``.
    """
    return os.stat(location).st_size


def checksum(location, name, base64=False):
"""
Return a checksum of `bitsize` length from the content of the file at
`location`. The checksum is a hexdigest or base64-encoded is `base64` is
True.
Return a checksum from the content of the file at ``location`` using the ``name`` checksum
algorithm. The checksum is a string as a hexdigest or is base64-encoded is ``base64`` is True.
"""
if not filetype.is_file(location):
return
hasher = _hashmodules_by_name[name]

# fixme: we should read in chunks?
with open(location, "rb") as f:
hashable = f.read()
total_length = get_file_size(location)
chunks = binary_chunks(location)
return checksum_from_chunks(chunks=chunks, total_length=total_length, name=name, base64=base64)

hashed = hasher(hashable)
if base64:
return hashed.b64digest()

return hashed.hexdigest()
def checksum_from_chunks(chunks, name, total_length=0, base64=False):
    """
    Return a checksum string computed by feeding each byte string from the
    ``chunks`` iterator to a ``name`` algorithm hasher created for a planned
    ``total_length`` of bytes. The result is a hexdigest string, or a
    base64-encoded string if ``base64`` is True.
    """
    hshr = get_hasher_instance_by_name(name=name, total_length=total_length)
    for data in chunks:
        hshr.update(data)
    return hshr.b64digest() if base64 else hshr.hexdigest()


def md5(location):
Expand All @@ -164,21 +259,37 @@ def sha1_git(location):
return checksum(location, name="sha1_git", base64=False)


def multi_checksums(location, checksum_names=("md5", "sha1", "sha256", "sha512", "sha1_git")):
def binary_chunks(location, size=2**24):
"""
Return a mapping of hexdigest checksums keyed by checksum name from the content
of the file at `location`. Use the `checksum_names` list of checksum names.
The mapping is guaranted to contains all the requested names as keys.
If the location is not a file, the values are None.
Read file at ``location`` as binary and yield bytes of up to ``size`` length in bytes,
defaulting to 2**24 bytes, e.g., about 16 MB.
"""
results = dict([(name, None) for name in checksum_names])
if not filetype.is_file(location):
return results

# fixme: we should read in chunks?
with open(location, "rb") as f:
hashable = f.read()
while True:
chunk = f.read(size)
if not chunk:
break
yield chunk

for name in checksum_names:
results[name] = _hashmodules_by_name[name](hashable).hexdigest()
return results

def multi_checksums(location, checksum_names=("md5", "sha1", "sha256", "sha512", "sha1_git")):
    """
    Return a mapping of hexdigest checksum strings keyed by checksum algorithm
    name from hashing the content of the file at ``location``. Use the
    ``checksum_names`` list of checksum names. The mapping is guaranteed to
    contain all the requested names as keys. If the location is not a file, or
    if the file is empty, the values are None.

    The file content is read once in chunks and fed to every hasher, so all
    requested checksums are computed in a single pass over the file regardless
    of its size.
    """
    if not filetype.is_file(location):
        return {name: None for name in checksum_names}
    file_size = get_file_size(location)
    if not file_size:
        # An empty file has no content to hash; in particular sha1_git_hasher
        # raises ValueError on a zero total_length. Return None values as
        # documented (matching the previous implementation's behavior).
        return {name: None for name in checksum_names}
    # One hasher per requested algorithm, each told the planned total length
    # up front (sha1_git needs it to hash the git blob header first).
    hashers = {
        name: get_hasher_instance_by_name(name=name, total_length=file_size)
        for name in checksum_names
    }

    for chunk in binary_chunks(location):
        for hasher in hashers.values():
            hasher.update(msg=chunk)

    return {name: hasher.hexdigest() for name, hasher in hashers.items()}
20 changes: 20 additions & 0 deletions tests/test_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

import hashlib
import os

from commoncode.hash import b64sha1
from commoncode.hash import checksum
from commoncode.hash import checksum_from_chunks
from commoncode.hash import get_hasher
from commoncode.hash import md5
from commoncode.hash import multi_checksums
Expand Down Expand Up @@ -174,3 +176,21 @@ def test_sha1_git_checksum(self):
test_file = self.get_test_loc(test_file)
# test that we match the git hash-object
assert sha1_git(test_file) == expected_sha1_git

def test_checksum_from_chunks_can_stream_gigabytes(self):
    # Stream 200 chunks of 16 MB (3.2 GB total) through a generator so the
    # content is never materialized in memory at once.
    chunk = b"0" * 16000000
    streamed = (chunk for _ in range(200))
    result = checksum_from_chunks(
        chunks=streamed, total_length=16000000 * 200, name="sha1_git"
    )
    assert result == "494caf26c43c4473f6e930b0f5c2ecf8121bcf24"

def test_checksum_from_chunks_from_stream_is_same_as_plain(self):
    # The streamed checksum must match hashlib's digest of the same bytes.
    piece = b"0" * 16000
    streamed = (piece for _ in range(100))
    from_chunks = checksum_from_chunks(chunks=streamed, name="sha256")

    reference = hashlib.sha256()
    for _ in range(100):
        reference.update(piece)
    assert from_chunks == reference.hexdigest()