Skip to content

Commit 1f9a11d

Browse files
authored
Add blake3 hashing (#538)
Because BLAKE3 natively supports parallelism without changing the final hash, sharding is bypassed. This is much more useful than getting different file hashes depending on which hashing method you used. The BLAKE3 hashing is done by memory mapping the file, and defaults to the max number of workers which is the number of logical CPU cores. This is a good default and the most performant setup. It is also what the standard BLAKE3 CLI tool (b3sum) does. It is implemented in Rust and so will be true parallelism rather than the thread concurrency implemented for other hashing algorithms, so the speed up should be quite large. But it will likely be slower on HDDs than having no parallelism. I think this is the right default, but the HDD concern is documented. Resolves: #530 Signed-off-by: makeworld <[email protected]>
1 parent e7f58ed commit 1f9a11d

File tree

11 files changed

+356
-16
lines changed

11 files changed

+356
-16
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ All versions prior to 1.0.0 are untracked.
2929
- Implemented a new, minimal container image. This variant excludes optional dependencies (like OTel and PKCS#11) to reduce footprint, focusing solely on core signing and verification mechanisms.
3030
- The library now requires at least v4.0.0 of `sigstore` due to breaking changes in that library
3131
- Added support for signing and verifying using private Sigstore instances (`--trust_config`)
32+
- Added support for BLAKE3 hashing
3233

3334
## [1.0.1] - 2024-04-18
3435

benchmarks/exp_hash.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def build_parser() -> argparse.ArgumentParser:
4848
help="hash methods to benchmark",
4949
nargs="+",
5050
type=str,
51-
default=["sha256", "blake2"],
51+
default=["sha256", "blake2", "blake3"],
5252
)
5353

5454
parser.add_argument(
@@ -74,6 +74,8 @@ def _get_hasher(hash_algorithm: str) -> hashing.StreamingHashEngine:
7474
return memory.SHA256()
7575
if hash_algorithm == "blake2":
7676
return memory.BLAKE2()
77+
if hash_algorithm == "blake3":
78+
return memory.BLAKE3()
7779

7880
raise ValueError(f"Cannot convert {hash_algorithm} to a hash engine")
7981

benchmarks/serialize.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ def get_hash_engine_factory(
4646
return memory.SHA256
4747
if hash_algorithm == "blake2":
4848
return memory.BLAKE2
49+
if hash_algorithm == "blake3":
50+
return memory.BLAKE3
4951

5052
raise ValueError(f"Cannot convert {hash_algorithm} to a hash engine")
5153

@@ -146,7 +148,7 @@ def build_parser() -> argparse.ArgumentParser:
146148
parser.add_argument(
147149
"--hash_method",
148150
help="hash method to use (default: sha256)",
149-
choices=["sha256", "blake2"],
151+
choices=["sha256", "blake2", "blake3"],
150152
default="sha256",
151153
)
152154
parser.add_argument(
@@ -180,7 +182,7 @@ def build_parser() -> argparse.ArgumentParser:
180182
"--merge_hasher",
181183
help="hasher to use to merge individual hashes "
182184
"when skipping manifest creation (default: sha256)",
183-
choices=["sha256", "blake2"],
185+
choices=["sha256", "blake2", "blake3"],
184186
default="sha256",
185187
)
186188

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ classifiers = [
2828
]
2929
dependencies = [
3030
"asn1crypto",
31+
"blake3",
3132
"click",
3233
"cryptography",
3334
"in-toto-attestation",

src/model_signing/_hashing/io.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import pathlib
3939
from typing import Optional
4040

41+
import blake3
4142
from typing_extensions import override
4243

4344
from model_signing._hashing import hashing
@@ -135,6 +136,61 @@ def digest_size(self) -> int:
135136
return self._content_hasher.digest_size
136137

137138

139+
class Blake3FileHasher(FileHasher):
140+
"""Simple file hash engine that uses BLAKE3 in parallel.
141+
142+
This hash engine uses the fastest BLAKE3 settings by using memory mapping
143+
and multiple workers. This will greatly increase speed on SSDs, but may
144+
not perform well on HDDs. For HDDs, you can set max_threads to 1.
145+
"""
146+
147+
def __init__(
148+
self,
149+
file: pathlib.Path,
150+
*,
151+
max_threads: int = blake3.blake3.AUTO,
152+
digest_name_override: Optional[str] = None,
153+
):
154+
"""Initializes an instance to hash a file.
155+
156+
Args:
157+
file: The file to hash. Use `set_file` to reset it.
158+
max_threads: how many BLAKE3 workers to use. Defaults to number of
159+
logical cores.
160+
digest_name_override: Optional string to allow overriding the
161+
`digest_name` property to support shorter, standardized names.
162+
"""
163+
self._file = file
164+
self._digest_name_override = digest_name_override
165+
self._blake3 = blake3.blake3(max_threads=max_threads)
166+
167+
def set_file(self, file: pathlib.Path) -> None:
168+
"""Redefines the file to be hashed in `compute`.
169+
170+
Args:
171+
file: The new file to be hashed.
172+
"""
173+
self._file = file
174+
175+
@property
176+
@override
177+
def digest_name(self) -> str:
178+
if self._digest_name_override is not None:
179+
return self._digest_name_override
180+
return "blake3"
181+
182+
@override
183+
def compute(self) -> hashing.Digest:
184+
self._blake3.reset()
185+
self._blake3.update_mmap(self._file)
186+
return hashing.Digest(self.digest_name, self._blake3.digest())
187+
188+
@property
189+
@override
190+
def digest_size(self) -> int:
191+
return 32
192+
193+
138194
class ShardedFileHasher(SimpleFileHasher):
139195
"""File hash engine that hashes a portion (shard) of the file.
140196

src/model_signing/_hashing/memory.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737

3838
import hashlib
3939

40+
import blake3
4041
from typing_extensions import override
4142

4243
from model_signing._hashing import hashing
@@ -108,3 +109,37 @@ def digest_name(self) -> str:
108109
@override
109110
def digest_size(self) -> int:
110111
return self._hasher.digest_size
112+
113+
114+
class BLAKE3(hashing.StreamingHashEngine):
115+
"""A wrapper around `blake3.blake3`."""
116+
117+
def __init__(self, initial_data: bytes = b""):
118+
"""Initializes an instance of a BLAKE3 hash engine.
119+
120+
Args:
121+
initial_data: Optional initial data to hash.
122+
"""
123+
self._hasher = blake3.blake3(initial_data)
124+
125+
@override
126+
def update(self, data: bytes) -> None:
127+
self._hasher.update(data)
128+
129+
@override
130+
def reset(self, data: bytes = b"") -> None:
131+
self._hasher = blake3.blake3(data)
132+
133+
@override
134+
def compute(self) -> hashing.Digest:
135+
return hashing.Digest(self.digest_name, self._hasher.digest())
136+
137+
@property
138+
@override
139+
def digest_name(self) -> str:
140+
return "blake3"
141+
142+
@property
143+
@override
144+
def digest_size(self) -> int:
145+
return self._hasher.digest_size

src/model_signing/_serialization/file.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def __init__(
6666
self._serialization_description = manifest._FileSerialization(
6767
hasher.digest_name, self._allow_symlinks, self._ignore_paths
6868
)
69+
self._is_blake3 = hasher.digest_name == "blake3"
6970

7071
def set_allow_symlinks(self, allow_symlinks: bool) -> None:
7172
"""Set whether following symlinks is allowed."""
@@ -120,7 +121,8 @@ def serialize(
120121

121122
manifest_items = []
122123
with concurrent.futures.ThreadPoolExecutor(
123-
max_workers=self._max_workers
124+
# blake3 parallelizes internally
125+
max_workers=1 if self._is_blake3 else self._max_workers
124126
) as tpe:
125127
futures = [
126128
tpe.submit(self._compute_hash, model_path, path)

src/model_signing/hashing.py

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@
5353
import sys
5454
from typing import Literal, Optional, Union
5555

56+
import blake3
57+
5658
from model_signing import manifest
5759
from model_signing._hashing import hashing
5860
from model_signing._hashing import io
@@ -124,8 +126,8 @@ class Config:
124126
default, we hash at file level granularity.
125127
126128
This configuration class also supports configuring the hash method used to
127-
generate the hash for every object in the model. We currently support SHA256
128-
and BLAKE2, with SHA256 being the default.
129+
generate the hash for every object in the model. We currently support
130+
SHA256, BLAKE2, and BLAKE3, with SHA256 being the default.
129131
130132
This configuration class also supports configuring which paths from the
131133
model directory should be ignored. These are files that don't impact the
@@ -183,7 +185,8 @@ def hash(
183185
)
184186

185187
def _build_stream_hasher(
186-
self, hashing_algorithm: Literal["sha256", "blake2"] = "sha256"
188+
self,
189+
hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
187190
) -> hashing.StreamingHashEngine:
188191
"""Builds a streaming hasher from a constant string.
189192
@@ -198,28 +201,37 @@ def _build_stream_hasher(
198201
return memory.SHA256()
199202
if hashing_algorithm == "blake2":
200203
return memory.BLAKE2()
204+
if hashing_algorithm == "blake3":
205+
return memory.BLAKE3()
201206

202207
raise ValueError(f"Unsupported hashing method {hashing_algorithm}")
203208

204209
def _build_file_hasher_factory(
205210
self,
206-
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
211+
hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
207212
chunk_size: int = 1048576,
208-
) -> Callable[[pathlib.Path], io.SimpleFileHasher]:
213+
max_workers: Optional[int] = None,
214+
) -> Callable[[pathlib.Path], io.FileHasher]:
209215
"""Builds the hasher factory for a serialization by file.
210216
211217
Args:
212218
hashing_algorithm: The hashing algorithm to use to hash a file.
213219
chunk_size: The amount of file to read at once. Default is 1MB. A
214220
special value of 0 signals to attempt to read everything in a
215-
single call.
221+
single call. This is ignored for BLAKE3.
222+
max_workers: Maximum number of workers to use in parallel. Defaults
223+
to the number of logical cores. Only relevant for BLAKE3.
216224
217225
Returns:
218226
The hasher factory that should be used by the active serialization
219227
method.
220228
"""
229+
if max_workers is None:
230+
max_workers = blake3.blake3.AUTO
221231

222-
def _factory(path: pathlib.Path) -> io.SimpleFileHasher:
232+
def _factory(path: pathlib.Path) -> io.FileHasher:
233+
if hashing_algorithm == "blake3":
234+
return io.Blake3FileHasher(path, max_threads=max_workers)
223235
hasher = self._build_stream_hasher(hashing_algorithm)
224236
return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size)
225237

@@ -233,6 +245,9 @@ def _build_sharded_file_hasher_factory(
233245
) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]:
234246
"""Builds the hasher factory for a serialization by file shards.
235247
248+
This is not recommended for BLAKE3 because it is not necessary. BLAKE3
249+
already operates in parallel.
250+
236251
Args:
237252
hashing_algorithm: The hashing algorithm to use to hash a shard.
238253
chunk_size: The amount of file to read at once. Default is 1MB. A
@@ -263,7 +278,7 @@ def _factory(
263278
def use_file_serialization(
264279
self,
265280
*,
266-
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
281+
hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
267282
chunk_size: int = 1048576,
268283
max_workers: Optional[int] = None,
269284
allow_symlinks: bool = False,
@@ -279,10 +294,13 @@ def use_file_serialization(
279294
hashing_algorithm: The hashing algorithm to use to hash a file.
280295
chunk_size: The amount of file to read at once. Default is 1MB. A
281296
special value of 0 signals to attempt to read everything in a
282-
single call.
297+
single call. Ignored for BLAKE3.
283298
max_workers: Maximum number of workers to use in parallel. Default
284299
is to defer to the `concurrent.futures` library to select the best
285-
value for the current machine.
300+
value for the current machine, or the number of logical cores
301+
when doing BLAKE3 hashing. When reading files off of slower
302+
hardware like an HDD rather than an SSD, and using BLAKE3,
303+
setting max_workers to 1 may improve performance.
286304
allow_symlinks: Controls whether symbolic links are included. If a
287305
symlink is present but the flag is `False` (default) the
288306
serialization would raise an error.
@@ -291,7 +309,9 @@ def use_file_serialization(
291309
The new hashing configuration with the new serialization method.
292310
"""
293311
self._serializer = file.Serializer(
294-
self._build_file_hasher_factory(hashing_algorithm, chunk_size),
312+
self._build_file_hasher_factory(
313+
hashing_algorithm, chunk_size, max_workers
314+
),
295315
max_workers=max_workers,
296316
allow_symlinks=allow_symlinks,
297317
ignore_paths=ignore_paths,
@@ -301,7 +321,7 @@ def use_file_serialization(
301321
def use_shard_serialization(
302322
self,
303323
*,
304-
hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
324+
hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
305325
chunk_size: int = 1048576,
306326
shard_size: int = 1_000_000_000,
307327
max_workers: Optional[int] = None,
@@ -310,6 +330,10 @@ def use_shard_serialization(
310330
) -> Self:
311331
"""Configures serialization to build a manifest of (shard, hash) pairs.
312332
333+
For BLAKE3 this is equivalent to not sharding. Sharding is bypassed
334+
because BLAKE3 already operates in parallel. This means the chunk_size
335+
and shard_size args are ignored.
336+
313337
The serialization method in this configuration is changed to one where
314338
every file in the model is sharded in equal sized shards, every shard is
315339
paired with its digest and a manifest containing all these pairings is
@@ -332,6 +356,15 @@ def use_shard_serialization(
332356
Returns:
333357
The new hashing configuration with the new serialization method.
334358
"""
359+
if hashing_algorithm == "blake3":
360+
return self.use_file_serialization(
361+
hashing_algorithm=hashing_algorithm,
362+
chunk_size=chunk_size,
363+
max_workers=max_workers,
364+
allow_symlinks=allow_symlinks,
365+
ignore_paths=ignore_paths,
366+
)
367+
335368
self._serializer = file_shard.Serializer(
336369
self._build_sharded_file_hasher_factory(
337370
hashing_algorithm, chunk_size, shard_size

0 commit comments

Comments (0)