Skip to content

Commit d385199

Browse files
author
Emrick Donadei
committed
Add Config.use_incremental_serialization() to hashing API
Integrates the IncrementalSerializer into the high-level hashing API, making it accessible through the Config class.

Usage:

    # Extract manifest from previous signature
    old_manifest = Manifest.from_signature(Path("model.sig.old"))

    # Configure incremental hashing
    config = hashing.Config().use_incremental_serialization(
        old_manifest, hashing_algorithm="sha256"
    )

    # Get changed files and hash them
    changed_files = [model_path / "README.md"]
    new_manifest = config.hash(model_path, files_to_hash=changed_files)

This method follows the same pattern as use_file_serialization() and use_shard_serialization(), providing a consistent API for users.

The configuration:
- Accepts an existing manifest to compare against
- Supports all the same hashing algorithms (SHA256, BLAKE2, BLAKE3)
- Supports the same parameters (chunk_size, max_workers, etc.)
- Returns Self for method chaining

Related to issue #160 - API for incremental model re-hashing
1 parent 77f3d2e commit d385199

File tree

1 file changed

+73
-0
lines changed

1 file changed

+73
-0
lines changed

src/model_signing/hashing.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
from model_signing._hashing import memory
6262
from model_signing._serialization import file
6363
from model_signing._serialization import file_shard
64+
from model_signing._serialization import incremental
6465

6566

6667
if sys.version_info >= (3, 11):
@@ -375,6 +376,78 @@ def use_shard_serialization(
375376
)
376377
return self
377378

379+
def use_incremental_serialization(
    self,
    existing_manifest: manifest.Manifest,
    *,
    hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
    chunk_size: int = 1048576,
    max_workers: Optional[int] = None,
    allow_symlinks: bool = False,
    ignore_paths: Iterable[pathlib.Path] = frozenset(),
) -> Self:
    """Switches this configuration to incremental serialization.

    Incremental serialization builds a new manifest starting from an
    existing one (typically extracted from a previous signature), so that
    only the files that need it are hashed again. For large models where
    just a few files change between signatures this avoids re-hashing the
    whole model.

    With this serialization method selected:
      - Files already present in the existing manifest reuse their digests
      - Files that are not in the existing manifest are hashed
      - Files passed via `files_to_hash` in `hash()` are hashed again
      - Files that were deleted are left out of the new manifest

    Usage example:
        # Extract manifest from previous signature
        old_manifest = manifest.Manifest.from_signature(
            pathlib.Path("model.sig.old")
        )

        # Configure incremental hashing
        config = hashing.Config().use_incremental_serialization(
            old_manifest,
            hashing_algorithm="sha256"
        )

        # Get changed files (e.g., from git)
        changed_files = [model_path / "README.md"]

        # Hash only changed files
        new_manifest = config.hash(model_path, files_to_hash=changed_files)

    Args:
        existing_manifest: The manifest from a previous signature. Its
          digests are reused for the files that did not change.
        hashing_algorithm: The hashing algorithm used for new or changed
          files. Must match the algorithm used in `existing_manifest`;
          this method does not check that itself.
        chunk_size: The amount of file to read at once. Default is 1MB. A
          special value of 0 signals to attempt to read everything in a
          single call. Ignored for BLAKE3.
        max_workers: Maximum number of workers to use in parallel. Default
          is to defer to the `concurrent.futures` library to select the
          best value for the current machine, or the number of logical
          cores when doing BLAKE3 hashing. The same value is given to both
          the file hasher factory and the serializer.
        allow_symlinks: Controls whether symbolic links are included. If a
          symlink is present but the flag is `False` (default) the
          serialization would raise an error.
        ignore_paths: Paths of files to ignore.

    Returns:
        This configuration object, now using incremental serialization, so
        that calls can be chained.
    """
    # Build the per-file hasher factory first so the serializer construction
    # below reads as a flat list of arguments.
    hasher_factory = self._build_file_hasher_factory(
        hashing_algorithm, chunk_size, max_workers
    )

    # Replaces whatever serializer was configured before.
    self._serializer = incremental.IncrementalSerializer(
        hasher_factory,
        existing_manifest,
        max_workers=max_workers,
        allow_symlinks=allow_symlinks,
        ignore_paths=ignore_paths,
    )
    return self
450+
378451
def set_ignored_paths(
379452
self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
380453
) -> Self:

0 commit comments

Comments
 (0)