Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
247 changes: 247 additions & 0 deletions src/model_signing/_serialization/incremental.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Incremental model serializer for selective file re-hashing.

This module provides a serializer that can reuse digests from an existing
manifest, only re-hashing files that have changed. This is useful for large
models where only a small subset of files change between signings.
"""

from collections.abc import Callable, Iterable
import concurrent.futures
import itertools
import os
import pathlib
from typing import Optional

from typing_extensions import override

from model_signing import manifest
from model_signing._hashing import io
from model_signing._serialization import serialization


class IncrementalSerializer(serialization.Serializer):
    """Model serializer that reuses digests from a previous manifest.

    The serializer walks the current model directory and, for every file,
    either reuses the digest recorded in an existing manifest (from a
    previous signature) or hashes the file again. A file is hashed when it:
    - Is new (its path is not in the existing manifest), or
    - Is explicitly listed in the `files_to_hash` argument of `serialize`.

    Every other file found on disk gets its digest copied from the existing
    manifest without being read. Files that are in the existing manifest but
    no longer on disk are not part of the new manifest.

    NOTE: No change detection is performed by this class. File sizes,
    timestamps and contents are not compared against the existing manifest,
    so a file that was modified but is not listed in `files_to_hash` keeps
    its old (stale) digest. Callers are responsible for passing every
    modified file.

    This avoids re-reading large models where only a small number of files
    change between signings (e.g., updating documentation in a 200GB model).
    """

    def __init__(
        self,
        file_hasher_factory: Callable[[pathlib.Path], io.FileHasher],
        existing_manifest: manifest.Manifest,
        *,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ):
        """Initializes an incremental serializer.

        Args:
            file_hasher_factory: A callable to build the hash engine used to
              hash individual files.
            existing_manifest: The manifest from a previous signature. Digests
              from this manifest will be reused for files that are not
              re-hashed. Only file entries are used; other entry kinds
              (e.g., shards) are skipped.
            max_workers: Maximum number of workers to use in parallel. Default
              is to defer to the `concurrent.futures` library. A single
              worker is always used when the hasher reports "blake3".
            allow_symlinks: Controls whether symbolic links are included. If a
              symlink is present but the flag is `False` (default) the
              serialization would raise an error.
            ignore_paths: The paths of files to ignore.
        """
        self._hasher_factory = file_hasher_factory
        self._existing_manifest = existing_manifest
        self._max_workers = max_workers
        self._allow_symlinks = allow_symlinks
        self._ignore_paths = ignore_paths

        # Lookup table: recorded file path -> manifest key. The key is needed
        # later to fetch the recorded digest from the existing manifest.
        self._existing_items = {
            item.path: item
            for item in existing_manifest._item_to_digest
            if isinstance(item, manifest._File)
        }

        # Precompute serialization description. The hasher is built on an
        # empty path only to learn the digest name; nothing is hashed here.
        hasher = file_hasher_factory(pathlib.Path())
        self._serialization_description = manifest._FileSerialization(
            hasher.digest_name, self._allow_symlinks, self._ignore_paths
        )
        self._is_blake3 = hasher.digest_name == "blake3"

    def set_allow_symlinks(self, allow_symlinks: bool) -> None:
        """Set whether following symlinks is allowed.

        The stored serialization description is rebuilt from the new flag and
        the ignore paths given at construction time.
        """
        self._allow_symlinks = allow_symlinks
        hasher = self._hasher_factory(pathlib.Path())
        self._serialization_description = manifest._FileSerialization(
            hasher.digest_name, self._allow_symlinks, self._ignore_paths
        )

    @override
    def serialize(
        self,
        model_path: pathlib.Path,
        *,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
        files_to_hash: Optional[Iterable[pathlib.Path]] = None,
    ) -> manifest.Manifest:
        """Serializes the model, only hashing new or explicitly listed files.

        Args:
            model_path: The path to the model.
            ignore_paths: The paths to ignore during serialization. If a
              provided path is a directory, all children of the directory are
              ignored.
            files_to_hash: Optional list of files that may have changed and
              should be re-hashed. Each path must be located under
              `model_path`; entries that are not existing files are skipped.
              If None (or empty), only files that are not in the existing
              manifest are hashed and every other file reuses its recorded
              digest, even if its content changed on disk.

              To detect changed files, use git diff or similar:
                  changed_files = subprocess.check_output(
                      ['git', 'diff', '--name-only', 'HEAD']
                  ).decode().splitlines()
                  files_to_hash = [model_path / f for f in changed_files]

        Returns:
            The model's serialized manifest with a mix of reused and
            newly-computed digests.

        Raises:
            ValueError: The model contains a symbolic link, but the serializer
              was not initialized with `allow_symlinks=True`, or a path in
              `files_to_hash` is an existing file outside `model_path`.
        """
        # Paths (relative to the model root) the caller flagged as modified.
        rehash_paths = set()
        if files_to_hash is not None:
            rehash_paths = {
                path.relative_to(model_path)
                for path in files_to_hash
                if path.is_file()
            }

        # Scan directory to find all current files in the model. The walk is
        # always complete so that deleted files drop out of the new manifest.
        all_current_files = []
        for path in itertools.chain((model_path,), model_path.glob("**/*")):
            if serialization.should_ignore(path, ignore_paths):
                continue
            serialization.check_file_or_directory(
                path, allow_symlinks=self._allow_symlinks
            )
            if path.is_file():
                all_current_files.append(path)

        # Split the files into those hashed again and those whose digest is
        # copied from the existing manifest.
        files_to_rehash = []
        manifest_items = []
        for path in all_current_files:
            relative_path = path.relative_to(model_path)
            # Existing manifest entries are looked up by their POSIX path.
            posix_path = pathlib.PurePosixPath(relative_path)
            old_item_key = self._existing_items.get(posix_path)

            if old_item_key is None or relative_path in rehash_paths:
                # New file, or explicitly marked as changed by the caller.
                files_to_rehash.append(path)
                continue

            old_digest = self._existing_manifest._item_to_digest[old_item_key]
            manifest_items.append(
                manifest.FileManifestItem(
                    path=relative_path, digest=old_digest
                )
            )

        # Hash all files that need re-hashing in parallel
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=1 if self._is_blake3 else self._max_workers
        ) as tpe:
            futures = [
                tpe.submit(self._compute_hash, model_path, path)
                for path in files_to_rehash
            ]
            for future in concurrent.futures.as_completed(futures):
                manifest_items.append(future.result())

        # Record the per-call ignore paths in the serialization description.
        # NOTE(review): the description is stored on the instance and only
        # replaced when `ignore_paths` is non-empty, so a later call without
        # ignore paths reuses the description of the previous call.
        if ignore_paths:
            rel_ignore_paths = []
            for p in ignore_paths:
                rp = pathlib.Path(os.path.relpath(p, model_path))
                # Paths outside the model resolve to something starting with
                # a parent-directory component; those are not recorded.
                # Checking the first component also covers a bare ".." and
                # platforms whose separator is not "/".
                if rp.parts[:1] != (os.pardir,):
                    rel_ignore_paths.append(rp)

            hasher = self._hasher_factory(pathlib.Path())
            self._serialization_description = manifest._FileSerialization(
                hasher.digest_name,
                self._allow_symlinks,
                frozenset(list(self._ignore_paths) + rel_ignore_paths),
            )

        # A path such as "." or ".." has no usable name; resolve it first.
        model_name = model_path.name
        if not model_name or model_name == "..":
            model_name = os.path.basename(model_path.resolve())

        return manifest.Manifest(
            model_name, manifest_items, self._serialization_description
        )

    def _compute_hash(
        self, model_path: pathlib.Path, path: pathlib.Path
    ) -> manifest.FileManifestItem:
        """Produces the manifest item of the file given by `path`.

        Args:
            model_path: The path to the model.
            path: Path to the file in the model, that is currently transformed
              to a manifest item.

        Returns:
            The manifest item pairing the file's path, relative to
            `model_path`, with its freshly computed digest.
        """
        relative_path = path.relative_to(model_path)
        digest = self._hasher_factory(path).compute()
        return manifest.FileManifestItem(path=relative_path, digest=digest)
73 changes: 73 additions & 0 deletions src/model_signing/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from model_signing._hashing import memory
from model_signing._serialization import file
from model_signing._serialization import file_shard
from model_signing._serialization import incremental


if sys.version_info >= (3, 11):
Expand Down Expand Up @@ -375,6 +376,78 @@ def use_shard_serialization(
)
return self

def use_incremental_serialization(
    self,
    existing_manifest: manifest.Manifest,
    *,
    hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
    chunk_size: int = 1048576,
    max_workers: Optional[int] = None,
    allow_symlinks: bool = False,
    ignore_paths: Iterable[pathlib.Path] = frozenset(),
) -> Self:
    """Switches this configuration to incremental serialization.

    Installs an `incremental.IncrementalSerializer` built around
    `existing_manifest` (the manifest of a previous signature) so that
    digests recorded there can be reused instead of hashing every file
    again. With this serializer:
    - Files present in the existing manifest have their digests reused
    - New files (not in existing manifest) are hashed
    - Modified files (specified via files_to_hash in hash()) are re-hashed
    - Deleted files are automatically omitted from the new manifest

    Usage example:
        # Extract manifest from previous signature
        old_manifest = manifest.Manifest.from_signature(
            pathlib.Path("model.sig.old")
        )

        # Configure incremental hashing
        config = hashing.Config().use_incremental_serialization(
            old_manifest,
            hashing_algorithm="sha256"
        )

        # Get changed files (e.g., from git)
        changed_files = [model_path / "README.md"]

        # Hash only changed files
        new_manifest = config.hash(model_path, files_to_hash=changed_files)

    Args:
        existing_manifest: The manifest from a previous signature. Digests
          from this manifest will be reused for unchanged files.
        hashing_algorithm: The hashing algorithm to use for new/changed
          files. Must match the algorithm used in existing_manifest; this
          method does not check that it does.
        chunk_size: The amount of file to read at once. Default is 1MB. A
          special value of 0 signals to attempt to read everything in a
          single call. Ignored for BLAKE3.
        max_workers: Maximum number of workers to use in parallel. The value
          is handed to both the file hasher factory and the serializer.
          Default is to defer to the `concurrent.futures` library to select
          the best value for the current machine, or the number of logical
          cores when doing BLAKE3 hashing.
        allow_symlinks: Controls whether symbolic links are included. If a
          symlink is present but the flag is `False` (default) the
          serialization would raise an error.
        ignore_paths: Paths of files to ignore.

    Returns:
        This configuration object, now using incremental serialization.
    """
    # Build the per-file hasher first; the serializer only wraps it.
    hasher_factory = self._build_file_hasher_factory(
        hashing_algorithm, chunk_size, max_workers
    )
    serializer_options = {
        "max_workers": max_workers,
        "allow_symlinks": allow_symlinks,
        "ignore_paths": ignore_paths,
    }
    self._serializer = incremental.IncrementalSerializer(
        hasher_factory, existing_manifest, **serializer_options
    )
    return self

def set_ignored_paths(
self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
) -> Self:
Expand Down
50 changes: 50 additions & 0 deletions src/model_signing/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,10 @@
"""

import abc
import base64
from collections.abc import Iterable, Iterator
import dataclasses
import json
import pathlib
import sys
from typing import Any, Final
Expand Down Expand Up @@ -466,3 +468,51 @@ def serialization_type(self) -> dict[str, Any]:
manifest so that signature verification can use the same method.
"""
return self._serialization_type.serialization_parameters

@classmethod
def from_signature(cls, signature_path: pathlib.Path) -> Self:
"""Extracts a manifest from an existing signature file.

This method reads a signature file (Sigstore bundle) and extracts the
manifest without performing cryptographic verification. This is useful
for incremental re-hashing where you need to know what files were
previously signed without verifying the signature.

Args:
signature_path: Path to the signature file to read.

Returns:
A Manifest object representing the signed model.

Raises:
ValueError: If the signature file cannot be parsed or doesn't
contain a valid manifest.
FileNotFoundError: If the signature file doesn't exist.
"""
# Avoid circular import by importing here
from model_signing._signing import signing

# Read the signature file
content = signature_path.read_text(encoding="utf-8")
bundle_dict = json.loads(content)

# Extract the DSSE envelope payload
if "dsseEnvelope" in bundle_dict:
# This is a protobuf-based bundle
envelope = bundle_dict["dsseEnvelope"]
elif "dsse_envelope" in bundle_dict:
# Alternative snake_case naming
envelope = bundle_dict["dsse_envelope"]
else:
raise ValueError("Signature file does not contain a DSSE envelope")

# Decode the payload (it's base64 encoded)
payload_b64 = envelope.get("payload")
if not payload_b64:
raise ValueError("DSSE envelope does not contain a payload")

payload_bytes = base64.b64decode(payload_b64)
payload_dict = json.loads(payload_bytes)

# Use the existing function to convert DSSE payload to manifest
return signing.dsse_payload_to_manifest(payload_dict)
Loading