sigstore · edonadei · Nov 2, 2025 · Nov 2, 2025 · Nov 2, 2025 · Nov 2, 2025
@@ -0,0 +1,247 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Incremental model serializer for selective file re-hashing.
+
+This module provides a serializer that can reuse digests from an existing
+manifest, only re-hashing files that have changed. This is useful for large
+models where only a small subset of files change between signings.
+"""
+
+from collections.abc import Callable, Iterable
+import concurrent.futures
+import itertools
+import os
+import pathlib
+from typing import Optional
+
+from typing_extensions import override
+
+from model_signing import manifest
+from model_signing._hashing import io
+from model_signing._serialization import serialization
+
+
+class IncrementalSerializer(serialization.Serializer):
+    """Model serializer that reuses digests from a previous manifest.
+
+    This serializer walks the current model directory and, for every file
+    found, either copies the digest recorded in an existing manifest (from a
+    previous signature) or computes a fresh one. A file is hashed when it:
+    - Is new (its path is not a file entry of the existing manifest)
+    - Is explicitly requested via the `files_to_hash` argument of `serialize`
+
+    Every other file has its digest reused from the existing manifest without
+    being read. No comparison of content, size or modification time is
+    performed, so a modified file that is not listed in `files_to_hash` keeps
+    its old digest: callers are responsible for listing every changed file.
+
+    Files recorded in the existing manifest that are no longer found on disk
+    are not part of the new manifest.
+
+    This provides significant performance improvements for large models where
+    only a small number of files change between signings (e.g., updating
+    documentation in a 200GB model).
+    """
+
+    def __init__(
+        self,
+        file_hasher_factory: Callable[[pathlib.Path], io.FileHasher],
+        existing_manifest: manifest.Manifest,
+        *,
+        max_workers: Optional[int] = None,
+        allow_symlinks: bool = False,
+        ignore_paths: Iterable[pathlib.Path] = frozenset(),
+    ):
+        """Initializes an incremental serializer.
+
+        Args:
+            file_hasher_factory: A callable to build the hash engine used to
+              hash individual files.
+            existing_manifest: The manifest from a previous signature. Digests
+              from this manifest will be reused for unchanged files.
+            max_workers: Maximum number of workers to use in parallel. Default
+              is to defer to the `concurrent.futures` library.
+            allow_symlinks: Controls whether symbolic links are included. If a
+              symlink is present but the flag is `False` (default) the
+              serialization would raise an error.
+            ignore_paths: The paths of files to ignore. They are recorded in
+              the serialization description; the filtering done while
+              scanning uses the `ignore_paths` argument of `serialize`. The
+              iterable is read again by `serialize`, so it should not be a
+              one-shot iterator.
+        """
+        self._hasher_factory = file_hasher_factory
+        self._existing_manifest = existing_manifest
+        self._max_workers = max_workers
+        self._allow_symlinks = allow_symlinks
+        self._ignore_paths = ignore_paths
+
+        # Index the file entries of the old manifest by path so `serialize`
+        # can look them up directly. Keys that are not files (e.g. shards)
+        # are skipped, so the corresponding files are treated as new and
+        # hashed again.
+        self._existing_items = {
+            item.path: item
+            for item in existing_manifest._item_to_digest
+            if isinstance(item, manifest._File)
+        }
+
+        # The hasher is built on a dummy path only to read its digest name.
+        hasher = file_hasher_factory(pathlib.Path())
+        self._serialization_description = manifest._FileSerialization(
+            hasher.digest_name, self._allow_symlinks, self._ignore_paths
+        )
+        self._is_blake3 = hasher.digest_name == "blake3"
+
+    def _refresh_serialization_description(
+        self, ignore_paths: Iterable[pathlib.Path]
+    ) -> None:
+        """Rebuilds the stored description from the current settings.
+
+        Args:
+            ignore_paths: The ignored paths to record in the description.
+        """
+        hasher = self._hasher_factory(pathlib.Path())
+        self._serialization_description = manifest._FileSerialization(
+            hasher.digest_name, self._allow_symlinks, ignore_paths
+        )
+
+    def set_allow_symlinks(self, allow_symlinks: bool) -> None:
+        """Set whether following symlinks is allowed."""
+        self._allow_symlinks = allow_symlinks
+        self._refresh_serialization_description(self._ignore_paths)
+
+    @override
+    def serialize(
+        self,
+        model_path: pathlib.Path,
+        *,
+        ignore_paths: Iterable[pathlib.Path] = frozenset(),
+        files_to_hash: Optional[Iterable[pathlib.Path]] = None,
+    ) -> manifest.Manifest:
+        """Serializes the model, only re-hashing changed/new files.
+
+        The model directory is always scanned in full. Files missing from
+        the existing manifest and files listed in `files_to_hash` are hashed;
+        all other files reuse the digest from the existing manifest.
+
+        Args:
+            model_path: The path to the model.
+            ignore_paths: The paths to ignore during serialization. If a
+              provided path is a directory, all children of the directory are
+              ignored.
+            files_to_hash: Optional list of files that may have changed and
+              should be re-hashed, given as paths located under `model_path`.
+              Entries that are not existing files are skipped. If None (or
+              empty), only NEW files (not in the existing manifest) are
+              hashed and every other file has its digest reused.
+
+              To detect changed files, use git diff or similar:
+                  changed_files = subprocess.check_output(
+                      ['git', 'diff', '--name-only', 'HEAD']
+                  ).decode().splitlines()
+                  files_to_hash = [model_path / f for f in changed_files]
+
+        Returns:
+            The model's serialized manifest with a mix of reused and
+            newly-computed digests.
+
+        Raises:
+            ValueError: The model contains a symbolic link, but the serializer
+              was not initialized with `allow_symlinks=True`, or an entry of
+              `files_to_hash` is a file that is not located under
+              `model_path`.
+        """
+        # `ignore_paths` is read once per scanned path and again when the
+        # description is rebuilt; freeze it so a one-shot iterator still
+        # works.
+        ignore_paths = tuple(ignore_paths)
+
+        # Paths, relative to the model, explicitly flagged as changed.
+        rehash_paths = set()
+        if files_to_hash is not None:
+            rehash_paths = {
+                path.relative_to(model_path)
+                for path in files_to_hash
+                if path.is_file()
+            }
+
+        # Scan the model once, splitting files into those whose digest is
+        # reused and those that must be hashed.
+        files_to_rehash = []
+        manifest_items = []
+        for path in itertools.chain((model_path,), model_path.glob("**/*")):
+            if serialization.should_ignore(path, ignore_paths):
+                continue
+            serialization.check_file_or_directory(
+                path, allow_symlinks=self._allow_symlinks
+            )
+            if not path.is_file():
+                continue
+
+            relative_path = path.relative_to(model_path)
+            old_item_key = self._existing_items.get(
+                pathlib.PurePosixPath(relative_path)
+            )
+            if old_item_key is None or relative_path in rehash_paths:
+                # New file, or explicitly marked as changed.
+                files_to_rehash.append(path)
+                continue
+
+            # Known file not marked as changed: copy the old digest without
+            # reading the file.
+            manifest_items.append(
+                manifest.FileManifestItem(
+                    path=relative_path,
+                    digest=self._existing_manifest._item_to_digest[
+                        old_item_key
+                    ],
+                )
+            )
+
+        # Hash the remaining files in parallel. BLAKE3 is limited to a single
+        # worker thread here.
+        # NOTE(review): presumably because the BLAKE3 hasher parallelizes
+        # internally - confirm.
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=1 if self._is_blake3 else self._max_workers
+        ) as tpe:
+            futures = [
+                tpe.submit(self._compute_hash, model_path, path)
+                for path in files_to_rehash
+            ]
+            manifest_items.extend(
+                future.result()
+                for future in concurrent.futures.as_completed(futures)
+            )
+
+        # Record the ignored paths, relative to the model, in the
+        # serialization description. The updated description is stored on the
+        # serializer and therefore persists after this call.
+        if ignore_paths:
+            rel_ignore_paths = []
+            for ignored in ignore_paths:
+                rel = os.path.relpath(ignored, model_path)
+                # A result starting with "../" points outside the model and
+                # is not recorded.
+                if not rel.startswith("../"):
+                    rel_ignore_paths.append(pathlib.Path(rel))
+
+            self._refresh_serialization_description(
+                frozenset(list(self._ignore_paths) + rel_ignore_paths)
+            )
+
+        # Paths such as "." or ".." have no usable name; resolve them first.
+        model_name = model_path.name
+        if not model_name or model_name == "..":
+            model_name = os.path.basename(model_path.resolve())
+
+        return manifest.Manifest(
+            model_name, manifest_items, self._serialization_description
+        )
+
+    def _compute_hash(
+        self, model_path: pathlib.Path, path: pathlib.Path
+    ) -> manifest.FileManifestItem:
+        """Produces the manifest item of the file given by `path`.
+
+        Args:
+            model_path: The path to the model.
+            path: Path to the file in the model, that is currently transformed
+              to a manifest item.
+
+        Returns:
+            The manifest item pairing the file's path, relative to
+            `model_path`, with its freshly computed digest.
+        """
+        relative_path = path.relative_to(model_path)
+        digest = self._hasher_factory(path).compute()
+        return manifest.FileManifestItem(path=relative_path, digest=digest)
@@ -61,6 +61,7 @@
 from model_signing._hashing import memory
 from model_signing._serialization import file
 from model_signing._serialization import file_shard
+from model_signing._serialization import incremental
 
 
 if sys.version_info >= (3, 11):
@@ -375,6 +376,78 @@ def use_shard_serialization(
         )
         return self
 
+    def use_incremental_serialization(
+        self,
+        existing_manifest: manifest.Manifest,
+        *,
+        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
+        chunk_size: int = 1048576,
+        max_workers: Optional[int] = None,
+        allow_symlinks: bool = False,
+        ignore_paths: Iterable[pathlib.Path] = frozenset(),
+    ) -> Self:
+        """Switches this configuration to incremental serialization.
+
+        The configured serializer starts from an existing manifest (taken
+        from a previous signature) and avoids hashing files again when their
+        digest can be taken from that manifest. This is meant for large
+        models where only a small subset of files change between signings.
+
+        With this serialization method:
+        - Files present in the existing manifest have their digests reused
+        - New files (not in existing manifest) are hashed
+        - Modified files (specified via files_to_hash in hash()) are re-hashed
+        - Deleted files are automatically omitted from the new manifest
+
+        Usage example:
+            # Extract manifest from previous signature
+            old_manifest = manifest.Manifest.from_signature(
+                pathlib.Path("model.sig.old")
+            )
+
+            # Configure incremental hashing
+            config = hashing.Config().use_incremental_serialization(
+                old_manifest,
+                hashing_algorithm="sha256"
+            )
+
+            # Get changed files (e.g., from git)
+            changed_files = [model_path / "README.md"]
+
+            # Hash only changed files
+            new_manifest = config.hash(model_path, files_to_hash=changed_files)
+
+        Args:
+            existing_manifest: The manifest from a previous signature, used as
+              the source of the reused digests.
+            hashing_algorithm: The hashing algorithm to use for new/changed
+              files. Must match the algorithm used in existing_manifest; this
+              method does not check it.
+            chunk_size: The amount of file to read at once. Default is 1MB. A
+              special value of 0 signals to attempt to read everything in a
+              single call. Ignored for BLAKE3.
+            max_workers: Maximum number of workers to use in parallel. The
+              value is forwarded both to the file hasher factory and to the
+              serializer. Default is to defer to the `concurrent.futures`
+              library to select the best value for the current machine, or
+              the number of logical cores when doing BLAKE3 hashing.
+            allow_symlinks: Controls whether symbolic links are included. If a
+              symlink is present but the flag is `False` (default) the
+              serialization would raise an error.
+            ignore_paths: Paths of files to ignore.
+
+        Returns:
+            This configuration object, now using incremental serialization.
+        """
+        # Build the per-file hash engine factory first, then hand it to the
+        # serializer together with the manifest to reuse digests from.
+        hasher_factory = self._build_file_hasher_factory(
+            hashing_algorithm, chunk_size, max_workers
+        )
+        serializer = incremental.IncrementalSerializer(
+            hasher_factory,
+            existing_manifest,
+            max_workers=max_workers,
+            allow_symlinks=allow_symlinks,
+            ignore_paths=ignore_paths,
+        )
+        self._serializer = serializer
+        return self
+
     def set_ignored_paths(
         self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
     ) -> Self:

@@ -39,8 +39,10 @@
 """
 
 import abc
+import base64
 from collections.abc import Iterable, Iterator
 import dataclasses
+import json
 import pathlib
 import sys
 from typing import Any, Final
@@ -466,3 +468,51 @@ def serialization_type(self) -> dict[str, Any]:
         manifest so that signature verification can use the same method.
         """
         return self._serialization_type.serialization_parameters
+
+    @classmethod
+    def from_signature(cls, signature_path: pathlib.Path) -> Self:
+        """Extracts a manifest from an existing signature file.
+
+        This method reads a signature file (Sigstore bundle) and extracts the
+        manifest without performing cryptographic verification. This is useful
+        for incremental re-hashing where you need to know what files were
+        previously signed without verifying the signature.
+
+        Because nothing is verified, the returned manifest is only as
+        trustworthy as the file it was read from.
+
+        Args:
+            signature_path: Path to the signature file to read.
+
+        Returns:
+            A Manifest object representing the signed model.
+
+        Raises:
+            ValueError: If the signature file cannot be parsed or doesn't
+                contain a valid manifest.
+            FileNotFoundError: If the signature file doesn't exist.
+        """
+        # Avoid circular import by importing here
+        from model_signing._signing import signing
+
+        # Read the signature file. Invalid JSON raises json.JSONDecodeError,
+        # which is a ValueError.
+        content = signature_path.read_text(encoding="utf-8")
+        bundle_dict = json.loads(content)
+        if not isinstance(bundle_dict, dict):
+            raise ValueError("Signature file does not contain a DSSE envelope")
+
+        # Extract the DSSE envelope, accepting both the camelCase name and
+        # the alternative snake_case name (camelCase takes precedence).
+        for envelope_key in ("dsseEnvelope", "dsse_envelope"):
+            if envelope_key in bundle_dict:
+                envelope = bundle_dict[envelope_key]
+                break
+        else:
+            raise ValueError("Signature file does not contain a DSSE envelope")
+
+        # Reject malformed envelopes explicitly so that callers only ever see
+        # the documented ValueError, not AttributeError or TypeError.
+        if not isinstance(envelope, dict):
+            raise ValueError("DSSE envelope is not a JSON object")
+
+        payload_b64 = envelope.get("payload")
+        if not payload_b64:
+            raise ValueError("DSSE envelope does not contain a payload")
+        if not isinstance(payload_b64, str):
+            raise ValueError("DSSE envelope payload is not a base64 string")
+
+        # Decode the payload (it's base64 encoded JSON). Decoding and parsing
+        # failures are subclasses of ValueError.
+        payload_bytes = base64.b64decode(payload_b64)
+        payload_dict = json.loads(payload_bytes)
+
+        # Use the existing function to convert DSSE payload to manifest
+        return signing.dsse_payload_to_manifest(payload_dict)