Skip to content

Commit b6ace69

Browse files
committed
Compute checksums from binary stream
Scanning a large file may exhaust memory and fail because commoncode.hash loads the whole file in memory. With this commit, the files are loaded in chunks so we do not run out of memory on very large files, such as VM images. Reference: #72 Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 878be61 commit b6ace69

File tree

2 files changed

+186
-57
lines changed

2 files changed

+186
-57
lines changed

src/commoncode/hash.py

Lines changed: 168 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88

99
import binascii
1010
import hashlib
11+
import os
1112
import sys
13+
1214
from functools import partial
1315

1416
from commoncode import filetype
@@ -26,41 +28,81 @@
2628
"""
2729

2830

29-
def _hash_mod(bitsize, hmodule):
31+
class Hashable:
3032
"""
31-
Return a hashing class returning hashes with a `bitsize` bit length. The
32-
interface of this class is similar to the hash module API.
33+
A mixin for hashers that provides the base methods.
3334
"""
35+
# digest_size = length of binary digest for this hash
36+
# binh = binary hasher module
37+
# msg_len = length in bytes of the messages hashed
38+
# total_length = total length in bytes of the messages hashed
3439

35-
class hasher(object):
36-
def __init__(self, msg=None):
37-
self.digest_size = bitsize // 8
38-
self.h = msg and hmodule(msg).digest()[: self.digest_size] or None
40+
def digest(self):
41+
"""
42+
Return a bytes string digest for this hash.
43+
"""
44+
if not self.msg_len:
45+
return
46+
return self.binh.digest()[: self.digest_size]
3947

40-
def digest(self):
41-
return bytes(self.h)
48+
def hexdigest(self):
49+
"""
50+
Return a string hex digest for this hash.
51+
"""
52+
return self.msg_len and binascii.hexlify(self.digest()).decode("utf-8")
4253

43-
def hexdigest(self):
44-
return self.h and binascii.hexlify(self.h).decode("utf-8")
54+
def b64digest(self):
55+
"""
56+
Return a string base64 digest for this hash.
57+
"""
58+
return self.msg_len and urlsafe_b64encode(self.digest()).decode("utf-8")
59+
60+
def intdigest(self):
61+
"""
62+
Return a int digest for this hash.
63+
"""
64+
return self.msg_len and int(bin_to_num(self.digest()))
65+
66+
67+
def _hash_mod(bitsize, hmodule):
68+
"""
69+
Return a hasher class that returns hashes with a ``bitsize`` bit length. The interface of this
70+
class is similar to the hash module API.
71+
"""
4572

46-
def b64digest(self):
47-
return self.h and urlsafe_b64encode(self.h).decode("utf-8")
73+
class hasher(Hashable):
74+
"""A hasher class that behaves like a hashlib module."""
4875

49-
def intdigest(self):
50-
return self.h and int(bin_to_num(self.h))
76+
def __init__(self, msg=None, **kwargs):
77+
"""
78+
Return a hasher, populated with an initial ``msg`` bytes string.
79+
Closes over the enclosing ``bitsize`` and ``hmodule`` arguments.
80+
"""
81+
self.digest_size = bitsize // 8
82+
self.binh = hmodule()
83+
self.msg_len = 0
84+
if msg:
85+
self.update(msg)
86+
87+
def update(self, msg=None):
88+
"""
89+
Update this hash with a ``msg`` bytes string.
90+
"""
91+
if msg:
92+
self.binh.update(msg)
93+
self.msg_len += len(msg)
5194

5295
return hasher
5396

5497

55-
# for FIPS support
98+
# for FIPS support, we declare that "usedforsecurity" is False
5699
sys_v0 = sys.version_info[0]
57100
sys_v1 = sys.version_info[1]
58101
if sys_v0 == 3 and sys_v1 >= 9:
59102
md5_hasher = partial(hashlib.md5, usedforsecurity=False)
60103
else:
61104
md5_hasher = hashlib.md5
62105

63-
64106
# Base hashers for each bit size
65107
_hashmodules_by_bitsize = {
66108
# md5-based
@@ -82,31 +124,62 @@ def get_hasher(bitsize):
82124
return _hashmodules_by_bitsize[bitsize]
83125

84126

85-
class sha1_git_hasher(object):
127+
class sha1_git_hasher(Hashable):
86128
"""
87129
Hash content using the git blob SHA1 convention.
130+
See https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_object_storage
88131
"""
89132

90-
def __init__(self, msg=None):
133+
def __init__(self, msg=None, total_length=0, **kwargs):
134+
"""
135+
Initialize a sha1_git_hasher with an optional ``msg`` byte string. ``total_length`` is the
136+
length in bytes of all content that will be hashed: the ``msg`` length plus any later calls to
137+
update() with additional messages.
138+
"""
91139
self.digest_size = 160 // 8
92-
self.h = msg and self._compute(msg) or None
140+
self.msg_len = 0
93141

94-
def _compute(self, msg):
95-
# note: bytes interpolation is new in Python 3.5
96-
git_blob_msg = b"blob %d\0%s" % (len(msg), msg)
97-
return hashlib.sha1(git_blob_msg).digest()
142+
if msg:
143+
self.msg_len = msg_len = len(msg)
98144

99-
def digest(self):
100-
return bytes(self.h)
145+
if not total_length:
146+
total_length = msg_len
147+
else:
148+
if total_length < msg_len:
149+
raise ValueError(
150+
f"Initial msg length: {msg_len} "
151+
f"cannot be larger than the total_length: {total_length}"
152+
)
101153

102-
def hexdigest(self):
103-
return self.h and binascii.hexlify(self.h).decode("utf-8")
154+
if not total_length:
155+
raise ValueError("total_length cannot be zero")
104156

105-
def b64digest(self):
106-
return self.h and urlsafe_b64encode(self.h).decode("utf-8")
157+
self.total_length = total_length
158+
self.binh = get_hasher(bitsize=160)(total_length=total_length)
107159

108-
def intdigest(self):
109-
return self.h and int(bin_to_num(self.h))
160+
self._hash_header()
161+
if msg:
162+
self.update(msg)
163+
164+
def _hash_header(self):
165+
# note: bytes interpolation is new in Python 3.5
166+
git_blob_header = b"blob %d\0" % (self.total_length)
167+
self.binh.update(msg=git_blob_header)
168+
169+
def update(self, msg=None):
170+
"""
171+
Update this hash with a ``msg`` bytes string.
172+
"""
173+
if msg:
174+
msg_len = len(msg)
175+
if (msg_len + self.msg_len) > self.total_length:
176+
raise ValueError(
177+
f"Actual combined msg lengths: initial: {self.msg_len} plus added: {msg_len} "
178+
f"cannot be larger than the total_length: {self.total_length}"
179+
)
180+
181+
self.binh.update(msg)
182+
self.msg_len += msg_len
110183

111184

112185
_hashmodules_by_name = {
@@ -119,25 +192,47 @@ def intdigest(self):
119192
}
120193

121194

195+
def get_hasher_instance_by_name(name, total_length=0):
196+
"""
197+
Return a hasher instance for a checksum algorithm ``name`` with a planned ``total_length`` of
198+
bytes to hash.
199+
"""
200+
try:
201+
hm = _hashmodules_by_name[name]
202+
return hm(total_length=total_length)
203+
except KeyError:
204+
raise ValueError(f"Unknown checksum algorithm: {name!r}")
205+
206+
207+
def get_file_size(location):
208+
return os.path.getsize(location)
209+
210+
122211
def checksum(location, name, base64=False):
123212
"""
124-
Return a checksum of `bitsize` length from the content of the file at
125-
`location`. The checksum is a hexdigest or base64-encoded is `base64` is
126-
True.
213+
Return a checksum from the content of the file at ``location`` using the ``name`` checksum
214+
algorithm. The checksum is a string as a hexdigest or is base64-encoded if ``base64`` is True.
127215
"""
128216
if not filetype.is_file(location):
129217
return
130-
hasher = _hashmodules_by_name[name]
131218

132-
# fixme: we should read in chunks?
133-
with open(location, "rb") as f:
134-
hashable = f.read()
219+
total_length = get_file_size(location)
220+
chunks = binary_chunks(location)
221+
return checksum_from_chunks(chunks=chunks, total_length=total_length, name=name, base64=base64)
135222

136-
hashed = hasher(hashable)
137-
if base64:
138-
return hashed.b64digest()
139223

140-
return hashed.hexdigest()
224+
def checksum_from_chunks(chunks, name, total_length=0, base64=False):
225+
"""
226+
Return a checksum from the content of the iterator of byte strings ``chunks`` with a
227+
``total_length`` combined length using the ``name`` checksum algorithm. The returned checksum is
228+
a string as a hexdigest or is base64-encoded if ``base64`` is True.
229+
"""
230+
hasher = get_hasher_instance_by_name(name=name, total_length=total_length)
231+
for chunk in chunks:
232+
hasher.update(chunk)
233+
if base64:
234+
return hasher.b64digest()
235+
return hasher.hexdigest()
141236

142237

143238
def md5(location):
@@ -164,21 +259,37 @@ def sha1_git(location):
164259
return checksum(location, name="sha1_git", base64=False)
165260

166261

167-
def multi_checksums(location, checksum_names=("md5", "sha1", "sha256", "sha512", "sha1_git")):
262+
def binary_chunks(location, size=2 ** 24):
168263
"""
169-
Return a mapping of hexdigest checksums keyed by checksum name from the content
170-
of the file at `location`. Use the `checksum_names` list of checksum names.
171-
The mapping is guaranted to contains all the requested names as keys.
172-
If the location is not a file, the values are None.
264+
Read file at ``location`` as binary and yield bytes of up to ``size`` length in bytes,
265+
defaulting to 2**24 bytes, i.e., about 16 MB.
173266
"""
174-
results = dict([(name, None) for name in checksum_names])
175-
if not filetype.is_file(location):
176-
return results
177-
178-
# fixme: we should read in chunks?
179267
with open(location, "rb") as f:
180-
hashable = f.read()
268+
while True:
269+
chunk = f.read(size)
270+
if not chunk:
271+
break
272+
yield chunk
181273

182-
for name in checksum_names:
183-
results[name] = _hashmodules_by_name[name](hashable).hexdigest()
184-
return results
274+
275+
def multi_checksums(location, checksum_names=("md5", "sha1", "sha256", "sha512", "sha1_git")):
276+
"""
277+
Return a mapping of hexdigest checksum strings keyed by checksum algorithm name from hashing the
278+
content of the file at ``location``. Use the ``checksum_names`` list of checksum names. The
279+
mapping is guaranteed to contain all the requested names as keys. If the location is not a file,
280+
or if the file is empty, the values are None.
281+
The purpose of this function is to read the file content only once while computing all the requested checksums.
282+
"""
283+
if not filetype.is_file(location):
284+
return {name: None for name in checksum_names}
285+
file_size = get_file_size(location)
286+
hashers = {
287+
name: get_hasher_instance_by_name(name=name, total_length=file_size)
288+
for name in checksum_names
289+
}
290+
291+
for chunk in binary_chunks(location):
292+
for hasher in hashers.values():
293+
hasher.update(msg=chunk)
294+
295+
return {name: hasher.hexdigest() for name, hasher in hashers.items()}

tests/test_hash.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77
#
88

99
import os
10+
import hashlib
1011

1112
from commoncode.hash import b64sha1
1213
from commoncode.hash import checksum
14+
from commoncode.hash import checksum_from_chunks
1315
from commoncode.hash import get_hasher
1416
from commoncode.hash import md5
1517
from commoncode.hash import multi_checksums
@@ -174,3 +176,19 @@ def test_sha1_git_checksum(self):
174176
test_file = self.get_test_loc(test_file)
175177
# test that we match the git hash-object
176178
assert sha1_git(test_file) == expected_sha1_git
179+
180+
def test_checksum_from_chunks_can_stream_gigabytes(self):
181+
chunk_16mb = b"0" * 16000000
182+
chunks_3dot2gb = (chunk_16mb for _ in range(200))
183+
result = checksum_from_chunks(chunks=chunks_3dot2gb, total_length=16000000 * 200, name="sha1_git")
184+
assert result == "494caf26c43c4473f6e930b0f5c2ecf8121bcf24"
185+
186+
def test_checksum_from_chunks_from_stream_is_same_as_plain(self):
187+
chunk = b"0" * 16000
188+
chunks = (chunk for _ in range(100))
189+
result1 = checksum_from_chunks(chunks=chunks, name="sha256")
190+
191+
result2 = hashlib.sha256()
192+
for _ in range(100):
193+
result2.update(chunk)
194+
assert result1 == result2.hexdigest()

0 commit comments

Comments
 (0)