|
61 | 61 | from model_signing._hashing import memory |
62 | 62 | from model_signing._serialization import file |
63 | 63 | from model_signing._serialization import file_shard |
| 64 | +from model_signing._serialization import incremental |
64 | 65 |
|
65 | 66 |
|
66 | 67 | if sys.version_info >= (3, 11): |
@@ -375,6 +376,78 @@ def use_shard_serialization( |
375 | 376 | ) |
376 | 377 | return self |
377 | 378 |
|
def use_incremental_serialization(
    self,
    existing_manifest: manifest.Manifest,
    *,
    hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
    chunk_size: int = 1048576,
    max_workers: Optional[int] = None,
    allow_symlinks: bool = False,
    ignore_paths: Iterable[pathlib.Path] = frozenset(),
) -> Self:
    """Switches this configuration to incremental, manifest-based hashing.

    Incremental serialization starts from the manifest of an earlier
    signature and hashes only what is needed to bring it up to date. For
    large models in which just a few files differ between two signatures,
    this avoids re-reading the bulk of the model.

    With this serialization method selected, the resulting manifest is
    built as follows:
      - A file already recorded in `existing_manifest` keeps its recorded
        digest, unless it is reported as modified (next point).
      - A file reported as modified, through the `files_to_hash` argument
        of `hash()`, is hashed again.
      - A file missing from `existing_manifest` is treated as new and is
        hashed.
      - A file that no longer exists is left out of the new manifest.

    Usage example:
        # Recover the manifest recorded in the previous signature
        old_manifest = manifest.Manifest.from_signature(
            pathlib.Path("model.sig.old")
        )

        # Select incremental hashing based on that manifest
        config = hashing.Config().use_incremental_serialization(
            old_manifest,
            hashing_algorithm="sha256"
        )

        # Collect the files known to have changed (e.g., from git)
        changed_files = [model_path / "README.md"]

        # Only the changed files are hashed again
        new_manifest = config.hash(model_path, files_to_hash=changed_files)

    Args:
        existing_manifest: Manifest obtained from a previous signature. It
          is the source of the digests reused for unchanged files.
        hashing_algorithm: Algorithm used to hash new and changed files. It
          must be the same algorithm that produced `existing_manifest`;
          this method does not itself verify that the two match.
        chunk_size: How much of a file to read in one call, 1MB by default.
          The special value 0 requests reading the entire file at once.
          Not used when hashing with BLAKE3.
        max_workers: Upper bound on the number of parallel workers. When
          left unset, the `concurrent.futures` library picks a value
          suitable for the current machine, or the number of logical
          cores is used when hashing with BLAKE3.
        allow_symlinks: Whether symbolic links may be part of the model.
          When `False` (default), encountering a symlink makes the
          serialization raise an error.
        ignore_paths: Paths of files to leave out of the serialization.

    Returns:
        This same configuration object, now using incremental
        serialization, so that calls can be chained.
    """
    # The hasher only ever runs on files that are not covered by a reusable
    # digest, but it is configured exactly like plain per-file hashing.
    hasher_factory = self._build_file_hasher_factory(
        hashing_algorithm, chunk_size, max_workers
    )
    incremental_serializer = incremental.IncrementalSerializer(
        hasher_factory,
        existing_manifest,
        max_workers=max_workers,
        allow_symlinks=allow_symlinks,
        ignore_paths=ignore_paths,
    )
    self._serializer = incremental_serializer
    return self
378 | 451 | def set_ignored_paths( |
379 | 452 | self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True |
380 | 453 | ) -> Self: |
|
0 commit comments