88
99import binascii
1010import hashlib
11+ import os
1112import sys
13+
1214from functools import partial
1315
1416from commoncode import filetype
2628"""
2729
2830
29- def _hash_mod ( bitsize , hmodule ) :
class Hashable:
    """
    A mixin for hashers that provides the base digest methods.

    All the digest methods return None when no message bytes have been hashed yet.
    """
    # Attributes expected to be set by the classes using this mixin:
    # digest_size = length of binary digest for this hash
    # binh = binary hasher module
    # msg_len = length in bytes of the messages hashed
    # total_length = total length in bytes of the messages hashed

    def digest(self):
        """
        Return a bytes string digest for this hash, truncated to ``digest_size`` bytes.
        Return None if nothing was hashed.
        """
        if not self.msg_len:
            return None
        return self.binh.digest()[: self.digest_size]

    def hexdigest(self):
        """
        Return a string hex digest for this hash or None if nothing was hashed.
        """
        # Explicit guard: ``self.msg_len and ...`` would leak the integer 0 rather than None.
        if not self.msg_len:
            return None
        return binascii.hexlify(self.digest()).decode("utf-8")

    def b64digest(self):
        """
        Return a string base64 digest for this hash or None if nothing was hashed.
        """
        if not self.msg_len:
            return None
        return urlsafe_b64encode(self.digest()).decode("utf-8")

    def intdigest(self):
        """
        Return a int digest for this hash or None if nothing was hashed.
        """
        if not self.msg_len:
            return None
        return int(bin_to_num(self.digest()))
65+
66+
def _hash_mod(bitsize, hmodule):
    """
    Build and return a hasher class whose hashes have a ``bitsize`` bit length, computed with the
    ``hmodule`` hash constructor. The interface of the returned class is similar to the hash
    module API.
    """

    class hasher(Hashable):
        """A hasher class that behaves like a hashlib module."""

        def __init__(self, msg=None, **kwargs):
            """
            Create a hasher, optionally fed with an initial ``msg`` bytes string.
            ``bitsize`` and ``hmodule`` are captured from the enclosing function. Extra keyword
            arguments are accepted and ignored.
            """
            self.digest_size = bitsize // 8
            self.binh = hmodule()
            self.msg_len = 0
            if not msg:
                return
            self.update(msg)

        def update(self, msg=None):
            """
            Feed a ``msg`` bytes string to this hash. An empty or missing ``msg`` is a no-op.
            """
            if not msg:
                return
            self.binh.update(msg)
            self.msg_len += len(msg)

    return hasher
5396
5497
55- # for FIPS support
# For FIPS support: on Python 3.9 and up, md5 is created with "usedforsecurity" set to False.
# Older versions fall back to the plain hashlib.md5 constructor.
sys_v0, sys_v1 = sys.version_info[:2]
md5_hasher = (
    partial(hashlib.md5, usedforsecurity=False)
    if (sys_v0 == 3 and sys_v1 >= 9)
    else hashlib.md5
)
62105
63-
64106# Base hashers for each bit size
65107_hashmodules_by_bitsize = {
66108 # md5-based
@@ -82,31 +124,62 @@ def get_hasher(bitsize):
82124 return _hashmodules_by_bitsize [bitsize ]
83125
84126
85- class sha1_git_hasher (object ):
class sha1_git_hasher(Hashable):
    """
    Hash content using the git blob SHA1 convention: a ``blob <total_length>`` header terminated
    by a NUL byte is hashed first, followed by the content itself.
    See https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_object_storage

    Because the header embeds the content length, the total length of the content must be known
    when the hasher is created.
    """

    def __init__(self, msg=None, total_length=0, **kwargs):
        """
        Initialize a sha1_git_hasher with an optional ``msg`` byte string. The ``total_length`` is
        the length of all content that will be hashed, combining the ``msg`` length plus any later
        call to update() with additional messages. When ``total_length`` is not provided, it
        defaults to the length of ``msg``.

        Raise a ValueError if the ``total_length`` is smaller than the ``msg`` length, or if no
        length can be determined (no ``msg`` and a zero ``total_length``).
        """
        self.digest_size = 160 // 8
        # update() is the single place where hashed bytes are counted, including for the initial
        # msg: counting them here too would double the count and always trip the length check.
        self.msg_len = 0

        if msg:
            msg_len = len(msg)
            if not total_length:
                total_length = msg_len
            elif total_length < msg_len:
                # Use the local total_length: self.total_length is not assigned yet at this stage.
                raise ValueError(
                    f"Initial msg length: {msg_len} "
                    f"cannot be larger than the total_length: {total_length}"
                )

        if not total_length:
            raise ValueError("total_length cannot be zero")

        self.total_length = total_length
        self.binh = get_hasher(bitsize=160)(total_length=total_length)

        # The header must be hashed before any content.
        self._hash_header()
        if msg:
            self.update(msg)

    def _hash_header(self):
        """
        Feed the git blob header (the ``total_length`` followed by a NUL byte) to the hasher.
        """
        # note: bytes interpolation is new in Python 3.5
        git_blob_header = b"blob %d\0" % (self.total_length)
        self.binh.update(msg=git_blob_header)

    def update(self, msg=None):
        """
        Update this hash with a ``msg`` bytes string.
        Raise a ValueError if the combined length of all messages would exceed ``total_length``.
        """
        if msg:
            msg_len = len(msg)
            if (msg_len + self.msg_len) > self.total_length:
                raise ValueError(
                    f"Actual combined msg lengths: initial: {self.msg_len} plus added: {msg_len} "
                    f"cannot be larger than the total_length: {self.total_length}"
                )

            self.binh.update(msg)
            self.msg_len += msg_len
110183
111184
112185_hashmodules_by_name = {
@@ -119,25 +192,47 @@ def intdigest(self):
119192}
120193
121194
def get_hasher_instance_by_name(name, total_length=0):
    """
    Return a hasher instance for a checksum algorithm ``name`` with a planned ``total_length`` of
    bytes to hash.
    Raise a ValueError if ``name`` is not a known checksum algorithm.
    """
    # Guard only the lookup: a KeyError raised while constructing the hasher is a different
    # problem and must not be reported as an unknown algorithm.
    try:
        hm = _hashmodules_by_name[name]
    except KeyError as e:
        raise ValueError(f"Unknown checksum algorithm: {name!r}") from e
    return hm(total_length=total_length)
205+
206+
def get_file_size(location):
    """
    Return the size in bytes of the file at ``location``.
    """
    return os.stat(location).st_size
209+
210+
def checksum(location, name, base64=False):
    """
    Return a checksum from the content of the file at ``location`` using the ``name`` checksum
    algorithm. The checksum is a string as a hexdigest or is base64-encoded if ``base64`` is True.
    Return None if ``location`` is not a file or if the file is empty.
    """
    if not filetype.is_file(location):
        return None

    total_length = get_file_size(location)
    if not total_length:
        # Nothing to hash: some hashers refuse a zero total_length and the others would report a
        # falsy non-None value, so report "no checksum" consistently.
        return None

    chunks = binary_chunks(location)
    return checksum_from_chunks(chunks=chunks, total_length=total_length, name=name, base64=base64)
135222
136- hashed = hasher (hashable )
137- if base64 :
138- return hashed .b64digest ()
139223
140- return hashed .hexdigest ()
def checksum_from_chunks(chunks, name, total_length=0, base64=False):
    """
    Hash every byte string of the ``chunks`` iterator with the ``name`` checksum algorithm and
    return the resulting checksum string. ``total_length`` is the combined length of all chunks.
    The checksum is base64-encoded when ``base64`` is True and is a hexdigest otherwise.
    """
    hashing = get_hasher_instance_by_name(name=name, total_length=total_length)
    for data in chunks:
        hashing.update(data)
    return hashing.b64digest() if base64 else hashing.hexdigest()
141236
142237
143238def md5 (location ):
@@ -164,21 +259,37 @@ def sha1_git(location):
164259 return checksum (location , name = "sha1_git" , base64 = False )
165260
166261
167- def multi_checksums (location , checksum_names = ( "md5" , "sha1" , "sha256" , "sha512" , "sha1_git" ) ):
def binary_chunks(location, size=2**24):
    """
    Open the file at ``location`` in binary mode and yield its content as successive bytes strings
    of at most ``size`` bytes each. ``size`` defaults to 2**24 bytes, e.g., about 16 MB.
    """
    with open(location, "rb") as binfile:
        # The sentinel form of iter() stops at the first empty read, e.g., at end of file.
        yield from iter(lambda: binfile.read(size), b"")
181273
182- for name in checksum_names :
183- results [name ] = _hashmodules_by_name [name ](hashable ).hexdigest ()
184- return results
274+
def multi_checksums(location, checksum_names=("md5", "sha1", "sha256", "sha512", "sha1_git")):
    """
    Return a mapping of hexdigest checksum strings keyed by checksum algorithm name from hashing the
    content of the file at ``location``. Use the ``checksum_names`` list of checksum names. The
    mapping is guaranteed to contain all the requested names as keys. If the location is not a
    file, or if the file is empty, the values are None.

    The file is read only once, in chunks, and each chunk is fed to every requested hasher.
    """
    if not filetype.is_file(location):
        return {name: None for name in checksum_names}

    file_size = get_file_size(location)
    if not file_size:
        # Empty file: there is nothing to hash, and a hasher that needs a non-zero total length
        # would raise instead of reporting None.
        return {name: None for name in checksum_names}

    hashers = {
        name: get_hasher_instance_by_name(name=name, total_length=file_size)
        for name in checksum_names
    }

    for chunk in binary_chunks(location):
        for hasher in hashers.values():
            hasher.update(msg=chunk)

    return {name: hasher.hexdigest() for name, hasher in hashers.items()}
0 commit comments