diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py
new file mode 100644
index 00000000..d37676d8
--- /dev/null
+++ b/src/model_signing/_serialization/incremental.py
@@ -0,0 +1,247 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Incremental model serializer for selective file re-hashing.
+
+This module provides a serializer that can reuse digests from an existing
+manifest, only re-hashing files that have changed. This is useful for large
+models where only a small subset of files change between signings.
+"""
+
+from collections.abc import Callable, Iterable
+import concurrent.futures
+import itertools
+import os
+import pathlib
+from typing import Optional
+
+from typing_extensions import override
+
+from model_signing import manifest
+from model_signing._hashing import io
+from model_signing._serialization import serialization
+
+
+class IncrementalSerializer(serialization.Serializer):
+    """Model serializer that only re-hashes changed files.
+
+    This serializer compares the current model state against an existing
+    manifest (from a previous signature) and only re-hashes files that:
+    - Are new (not present in the existing manifest)
+    - Are explicitly requested via the files_to_hash parameter
+
+    Files that exist in both the current model and the existing manifest
+    and are not listed in files_to_hash have their digests reused from the
+    existing manifest without re-hashing. Note that modifications are not
+    detected automatically (e.g., by size or mtime): callers must list
+    modified files explicitly, otherwise stale digests are reused.
+
+    This provides significant performance improvements for large models
+    where only a small number of files change between signings (e.g.,
+    updating documentation in a 200GB model).
+    """
+
+    def __init__(
+        self,
+        file_hasher_factory: Callable[[pathlib.Path], io.FileHasher],
+        existing_manifest: manifest.Manifest,
+        *,
+        max_workers: Optional[int] = None,
+        allow_symlinks: bool = False,
+        ignore_paths: Iterable[pathlib.Path] = frozenset(),
+    ):
+        """Initializes an incremental serializer.
+
+        Args:
+            file_hasher_factory: A callable to build the hash engine used to
+                hash individual files.
+            existing_manifest: The manifest from a previous signature. Digests
+                from this manifest will be reused for unchanged files.
+            max_workers: Maximum number of workers to use in parallel. Default
+                is to defer to the `concurrent.futures` library.
+            allow_symlinks: Controls whether symbolic links are included. If a
+                symlink is present but the flag is `False` (default) the
+                serialization would raise an error.
+            ignore_paths: The paths of files to ignore.
+        """
+        self._hasher_factory = file_hasher_factory
+        self._existing_manifest = existing_manifest
+        self._max_workers = max_workers
+        self._allow_symlinks = allow_symlinks
+        self._ignore_paths = ignore_paths
+
+        # Build lookup dictionary: file path -> manifest item
+        self._existing_items = {}
+        for item in existing_manifest._item_to_digest:
+            # item is a _File or _Shard key; we only support files for now
+            if isinstance(item, manifest._File):
+                self._existing_items[item.path] = item
+
+        # Precompute serialization description
+        hasher = file_hasher_factory(pathlib.Path())
+        self._serialization_description = manifest._FileSerialization(
+            hasher.digest_name, self._allow_symlinks, self._ignore_paths
+        )
+        self._is_blake3 = hasher.digest_name == "blake3"
+
+    def set_allow_symlinks(self, allow_symlinks: bool) -> None:
+        """Set whether following symlinks is allowed."""
+        self._allow_symlinks = allow_symlinks
+        hasher = self._hasher_factory(pathlib.Path())
+        self._serialization_description = manifest._FileSerialization(
+            hasher.digest_name, self._allow_symlinks, self._ignore_paths
+        )
+
+    @override
+    def serialize(
+        self,
+        model_path: pathlib.Path,
+        *,
+        ignore_paths: Iterable[pathlib.Path] = frozenset(),
+        files_to_hash: Optional[Iterable[pathlib.Path]] = None,
+    ) -> manifest.Manifest:
+        """Serializes the model, only re-hashing changed/new files.
+
+        Args:
+            model_path: The path to the model.
+            ignore_paths: The paths to ignore during serialization. If a
+                provided path is a directory, all children of the directory
+                are ignored.
+            files_to_hash: Optional list of files that may have changed and
+                should be re-hashed. If None, all files in the model directory
+                are scanned, and only NEW files (not in the existing manifest)
+                are hashed. Existing files have their digests reused.
+
+                To detect changed files, use git diff or similar:
+                    changed_files = subprocess.check_output(
+                        ['git', 'diff', '--name-only', 'HEAD']
+                    ).decode().splitlines()
+                    files_to_hash = [model_path / f for f in changed_files]
+
+        Returns:
+            The model's serialized manifest with a mix of reused and
+            newly-computed digests.
+
+        Raises:
+            ValueError: The model contains a symbolic link, but the serializer
+                was not initialized with `allow_symlinks=True`.
+        """
+        # Build a set of files to re-hash (files that potentially changed)
+        rehash_paths = set()
+        if files_to_hash is not None:
+            # User provided explicit list of changed files
+            for path in files_to_hash:
+                if path.is_file():
+                    rehash_paths.add(path.relative_to(model_path))
+
+        # Scan directory to find all current files in the model
+        all_current_files = []
+        for path in itertools.chain((model_path,), model_path.glob("**/*")):
+            if serialization.should_ignore(path, ignore_paths):
+                continue
+            serialization.check_file_or_directory(
+                path, allow_symlinks=self._allow_symlinks
+            )
+            if path.is_file():
+                all_current_files.append(path)
+
+        # Build the new manifest
+        files_to_rehash = []
+        manifest_items = []
+
+        for path in all_current_files:
+            relative_path = path.relative_to(model_path)
+            posix_path = pathlib.PurePosixPath(relative_path)
+
+            # A file needs re-hashing if it is new (not in the existing
+            # manifest) or explicitly marked as changed via files_to_hash.
+            # Otherwise its digest is reused from the existing manifest.
+            needs_rehash = (
+                posix_path not in self._existing_items
+                or relative_path in rehash_paths
+            )
+
+            if needs_rehash:
+                files_to_rehash.append(path)
+            else:
+                # Reuse existing digest
+                old_item_key = self._existing_items[posix_path]
+                old_digest = self._existing_manifest._item_to_digest[
+                    old_item_key
+                ]
+                manifest_items.append(
+                    manifest.FileManifestItem(
+                        path=relative_path, digest=old_digest
+                    )
+                )
+
+        # Hash all files that need re-hashing in parallel
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=1 if self._is_blake3 else self._max_workers
+        ) as tpe:
+            futures = [
+                tpe.submit(self._compute_hash, model_path, path)
+                for path in files_to_rehash
+            ]
+            for future in concurrent.futures.as_completed(futures):
+                manifest_items.append(future.result())
+
+        # Handle ignore_paths for serialization description
+        if ignore_paths:
+            rel_ignore_paths = []
+            for p in ignore_paths:
+                rp = os.path.relpath(p, model_path)
+                if not rp.startswith("../"):
+                    rel_ignore_paths.append(pathlib.Path(rp))
+
+            hasher = self._hasher_factory(pathlib.Path())
+            self._serialization_description = manifest._FileSerialization(
+                hasher.digest_name,
+                self._allow_symlinks,
+                frozenset(list(self._ignore_paths) + rel_ignore_paths),
+            )
+
+        model_name = model_path.name
+        if not model_name or model_name == "..":
+            model_name = os.path.basename(model_path.resolve())
+
+        return manifest.Manifest(
+            model_name, manifest_items, self._serialization_description
+        )
+
+    def _compute_hash(
+        self, model_path: pathlib.Path, path: pathlib.Path
+    ) -> manifest.FileManifestItem:
+        """Produces the manifest item of the file given by `path`.
+
+        Args:
+            model_path: The path to the model.
+            path: Path to the file in the model, that is currently transformed
+                to a manifest item.
+
+        Returns:
+            The itemized manifest.
+        """
+        relative_path = path.relative_to(model_path)
+        digest = self._hasher_factory(path).compute()
+        return manifest.FileManifestItem(path=relative_path, digest=digest)
+ """ + relative_path = path.relative_to(model_path) + digest = self._hasher_factory(path).compute() + return manifest.FileManifestItem(path=relative_path, digest=digest) diff --git a/src/model_signing/hashing.py b/src/model_signing/hashing.py index cb2c453f..832588e0 100644 --- a/src/model_signing/hashing.py +++ b/src/model_signing/hashing.py @@ -61,6 +61,7 @@ from model_signing._hashing import memory from model_signing._serialization import file from model_signing._serialization import file_shard +from model_signing._serialization import incremental if sys.version_info >= (3, 11): @@ -375,6 +376,78 @@ def use_shard_serialization( ) return self + def use_incremental_serialization( + self, + existing_manifest: manifest.Manifest, + *, + hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256", + chunk_size: int = 1048576, + max_workers: Optional[int] = None, + allow_symlinks: bool = False, + ignore_paths: Iterable[pathlib.Path] = frozenset(), + ) -> Self: + """Configures incremental serialization for selective file re-hashing. + + This serialization method compares the current model state against an + existing manifest (from a previous signature) and only re-hashes files + that changed. This provides significant performance improvements for + large models where only a small subset of files change. + + The serialization method in this configuration is changed to one where: + - Files that exist in the existing manifest have their digests reused + - New files (not in existing manifest) are hashed + - Modified files (specified via files_to_hash in hash()) are re-hashed + - Deleted files are automatically omitted from the new manifest + + Usage example: + # Extract manifest from previous signature + old_manifest = manifest.Manifest.from_signature( + pathlib.Path("model.sig.old") + ) + + # Configure incremental hashing + config = hashing.Config().use_incremental_serialization( + old_manifest, + hashing_algorithm="sha256" + ) + + # Get changed files (e.g., from git) + changed_files = [model_path / "README.md"] + + # Hash only changed files + new_manifest = config.hash(model_path, files_to_hash=changed_files) + + Args: + existing_manifest: The manifest from a previous signature. Digests + from this manifest will be reused for unchanged files. + hashing_algorithm: The hashing algorithm to use for new/changed + files. Must match the algorithm used in existing_manifest. + chunk_size: The amount of file to read at once. Default is 1MB. A + special value of 0 signals to attempt to read everything in a + single call. Ignored for BLAKE3. + max_workers: Maximum number of workers to use in parallel. Default + is to defer to the `concurrent.futures` library to select the best + value for the current machine, or the number of logical cores + when doing BLAKE3 hashing. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. + ignore_paths: Paths of files to ignore. + + Returns: + The new hashing configuration with incremental serialization. 
+ """ + self._serializer = incremental.IncrementalSerializer( + self._build_file_hasher_factory( + hashing_algorithm, chunk_size, max_workers + ), + existing_manifest, + max_workers=max_workers, + allow_symlinks=allow_symlinks, + ignore_paths=ignore_paths, + ) + return self + def set_ignored_paths( self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True ) -> Self: diff --git a/src/model_signing/manifest.py b/src/model_signing/manifest.py index a42662ed..e175e541 100644 --- a/src/model_signing/manifest.py +++ b/src/model_signing/manifest.py @@ -39,8 +39,10 @@ """ import abc +import base64 from collections.abc import Iterable, Iterator import dataclasses +import json import pathlib import sys from typing import Any, Final @@ -466,3 +468,51 @@ def serialization_type(self) -> dict[str, Any]: manifest so that signature verification can use the same method. """ return self._serialization_type.serialization_parameters + + @classmethod + def from_signature(cls, signature_path: pathlib.Path) -> Self: + """Extracts a manifest from an existing signature file. + + This method reads a signature file (Sigstore bundle) and extracts the + manifest without performing cryptographic verification. This is useful + for incremental re-hashing where you need to know what files were + previously signed without verifying the signature. + + Args: + signature_path: Path to the signature file to read. + + Returns: + A Manifest object representing the signed model. + + Raises: + ValueError: If the signature file cannot be parsed or doesn't + contain a valid manifest. + FileNotFoundError: If the signature file doesn't exist. + """ + # Avoid circular import by importing here + from model_signing._signing import signing + + # Read the signature file + content = signature_path.read_text(encoding="utf-8") + bundle_dict = json.loads(content) + + # Extract the DSSE envelope payload + if "dsseEnvelope" in bundle_dict: + # This is a protobuf-based bundle + envelope = bundle_dict["dsseEnvelope"] + elif "dsse_envelope" in bundle_dict: + # Alternative snake_case naming + envelope = bundle_dict["dsse_envelope"] + else: + raise ValueError("Signature file does not contain a DSSE envelope") + + # Decode the payload (it's base64 encoded) + payload_b64 = envelope.get("payload") + if not payload_b64: + raise ValueError("DSSE envelope does not contain a payload") + + payload_bytes = base64.b64decode(payload_b64) + payload_dict = json.loads(payload_bytes) + + # Use the existing function to convert DSSE payload to manifest + return signing.dsse_payload_to_manifest(payload_dict) diff --git a/src/model_signing/signing.py b/src/model_signing/signing.py index 5c45a8eb..86feb3f5 100644 --- a/src/model_signing/signing.py +++ b/src/model_signing/signing.py @@ -48,6 +48,7 @@ from typing import Optional from model_signing import hashing +from model_signing import manifest from model_signing._signing import sign_certificate as certificate from model_signing._signing import sign_ec_key as ec_key from model_signing._signing import sign_sigstore as sigstore @@ -75,6 +76,55 @@ def sign(model_path: hashing.PathLike, signature_path: hashing.PathLike): Config().sign(model_path, signature_path) +def sign_incremental( + model_path: hashing.PathLike, + old_signature_path: hashing.PathLike, + new_signature_path: hashing.PathLike, + *, + files_to_hash: Optional[Iterable[hashing.PathLike]] = None, +): + """Signs a model incrementally, only re-hashing changed files. 
+ + This function provides a convenient way to sign large models where only + a small subset of files have changed. Instead of re-hashing the entire + model (which can take hours for multi-hundred GB models), it reuses + digests from the previous signature for unchanged files and only hashes + new or modified files. + + In this default configuration we sign using Sigstore. + + Usage example: + # User modified README.md in a 500GB model + sign_incremental( + model_path="huge-model/", + old_signature_path="model.sig.old", + new_signature_path="model.sig.new", + files_to_hash=["huge-model/README.md"] + ) + + Args: + model_path: The path to the model to sign. + old_signature_path: The path to the previous signature. The manifest + from this signature will be extracted and used for incremental + hashing. + new_signature_path: The path where the new signature will be written. + files_to_hash: Optional list of files that changed and need to be + re-hashed. If None, only new files (not in old signature) will + be hashed. Existing files will have their digests reused. + To detect changed files, use git diff or similar tools. + + Raises: + FileNotFoundError: If old_signature_path doesn't exist. + ValueError: If old_signature_path cannot be parsed. + """ + Config().sign_incremental( + model_path, + old_signature_path, + new_signature_path, + files_to_hash=files_to_hash, + ) + + class Config: """Configuration to use when signing models. @@ -109,6 +159,58 @@ def sign( signature = self._signer.sign(payload) signature.write(pathlib.Path(signature_path)) + def sign_incremental( + self, + model_path: hashing.PathLike, + old_signature_path: hashing.PathLike, + new_signature_path: hashing.PathLike, + *, + files_to_hash: Optional[Iterable[hashing.PathLike]] = None, + ): + """Signs a model incrementally using the current configuration. + + This method extracts the manifest from an existing signature and + configures incremental hashing to reuse digests for unchanged files. + Only new or modified files are re-hashed, providing significant + performance improvements for large models. + + Args: + model_path: The path to the model to sign. + old_signature_path: The path to the previous signature. + new_signature_path: The path where the new signature will be + written. + files_to_hash: Optional list of files that changed and need to + be re-hashed. If None, only new files will be hashed. + + Raises: + FileNotFoundError: If old_signature_path doesn't exist. + ValueError: If old_signature_path cannot be parsed. + """ + # Extract manifest from old signature + old_manifest = manifest.Manifest.from_signature( + pathlib.Path(old_signature_path) + ) + + # Configure incremental hashing + self._hashing_config.use_incremental_serialization(old_manifest) + + # Convert files_to_hash to pathlib.Path objects if provided + paths_to_hash = None + if files_to_hash is not None: + paths_to_hash = [pathlib.Path(f) for f in files_to_hash] + + # Hash the model incrementally + new_manifest = self._hashing_config.hash( + model_path, files_to_hash=paths_to_hash + ) + + # Sign the new manifest + if not self._signer: + self.use_sigstore_signer() + payload = signing.Payload(new_manifest) + signature = self._signer.sign(payload) + signature.write(pathlib.Path(new_signature_path)) + def set_hashing_config(self, hashing_config: hashing.Config) -> Self: """Sets the new configuration for hashing models. 
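An end-to-end sketch of the convenience API added above (an editorial illustration): the git invocation follows the suggestion in the docstrings, the paths are illustrative, and any change-detection mechanism works as long as modified files are listed explicitly.

import pathlib
import subprocess

from model_signing import signing

model_path = pathlib.Path("huge-model")  # illustrative path

# Ask git which files changed relative to HEAD, as the docstrings suggest.
changed_files = subprocess.check_output(
    ["git", "diff", "--name-only", "HEAD"], cwd=model_path
).decode().splitlines()

# Re-sign, reusing digests from the old signature for everything unchanged.
signing.sign_incremental(
    model_path,
    "model.sig.old",
    "model.sig.new",
    files_to_hash=[model_path / f for f in changed_files],
)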
diff --git a/tests/_serialization/incremental_test.py b/tests/_serialization/incremental_test.py
new file mode 100644
index 00000000..3c4a44a0
--- /dev/null
+++ b/tests/_serialization/incremental_test.py
@@ -0,0 +1,392 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for incremental serialization."""
+
+import pathlib
+
+from model_signing import manifest
+from model_signing._hashing import hashing
+from model_signing._hashing import io as io_hashing
+from model_signing._hashing import memory
+from model_signing._serialization import incremental
+
+
+class TestIncrementalSerializer:
+    def test_no_changes_reuses_all_digests(self, tmp_path):
+        """When no files change, all digests should be reused."""
+        # Create a model with two files
+        model_dir = tmp_path / "model"
+        model_dir.mkdir()
+        (model_dir / "file1.txt").write_text("content1")
+        (model_dir / "file2.txt").write_text("content2")
+
+        # Create an existing manifest (simulate previous signature)
+        digest1 = hashing.Digest("sha256", b"digest1_bytes_here")
+        digest2 = hashing.Digest("sha256", b"digest2_bytes_here")
+
+        item1 = manifest.FileManifestItem(
+            path=pathlib.PurePath("file1.txt"), digest=digest1
+        )
+        item2 = manifest.FileManifestItem(
+            path=pathlib.PurePath("file2.txt"), digest=digest2
+        )
+
+        existing_manifest = manifest.Manifest(
+            "model", [item1, item2], manifest._FileSerialization("sha256")
+        )
+
+        # Create incremental serializer
+        def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher:
+            return io_hashing.SimpleFileHasher(path, memory.SHA256())
+
+        serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+
+        # Serialize the model incrementally
+        new_manifest = serializer.serialize(model_dir)
+
+        # Verify that digests were reused (not re-computed)
+        descriptors = list(new_manifest.resource_descriptors())
+        assert len(descriptors) == 2
+
+        # Find each file's descriptor
+        file1_desc = next(d for d in descriptors if d.identifier == "file1.txt")
+        file2_desc = next(d for d in descriptors if d.identifier == "file2.txt")
+
+        # Verify digests match the old manifest (were reused)
+        assert file1_desc.digest.digest_value == b"digest1_bytes_here"
+        assert file2_desc.digest.digest_value == b"digest2_bytes_here"
+
+    def test_new_file_is_hashed(self, tmp_path):
+        """When a new file is added, it should be hashed."""
+        # Create a model with one existing file
+        model_dir = tmp_path / "model"
+        model_dir.mkdir()
+        (model_dir / "file1.txt").write_text("content1")
+        (model_dir / "file2.txt").write_text("content2")  # This is new
+
+        # Create existing manifest with only file1
+        digest1 = hashing.Digest("sha256", b"digest1_bytes_here")
+        item1 = manifest.FileManifestItem(
+            path=pathlib.PurePath("file1.txt"), digest=digest1
+        )
+
+        existing_manifest = manifest.Manifest(
+            "model", [item1], manifest._FileSerialization("sha256")
+        )
+
+        # Create incremental serializer
+        def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher:
+            return io_hashing.SimpleFileHasher(path, memory.SHA256())
+
+        serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+
+        # Serialize the model incrementally
+        new_manifest = serializer.serialize(model_dir)
+
+        # Verify we have both files
+        descriptors = list(new_manifest.resource_descriptors())
+        assert len(descriptors) == 2
+
+        # file1 should have reused digest
+        file1_desc = next(d for d in descriptors if d.identifier == "file1.txt")
+        assert file1_desc.digest.digest_value == b"digest1_bytes_here"
+
+        # file2 should have a new hash (not the fake digest)
+        file2_desc = next(d for d in descriptors if d.identifier == "file2.txt")
+        # It should be the actual SHA256 of "content2", not a reused digest
+        assert file2_desc.digest.digest_value != b"digest1_bytes_here"
+        assert file2_desc.digest.algorithm == "sha256"
+
+    def test_deleted_file_not_in_manifest(self, tmp_path):
+        """When a file is deleted, it should not appear in new manifest."""
+        # Create a model with only one file
+        model_dir = tmp_path / "model"
+        model_dir.mkdir()
+        (model_dir / "file1.txt").write_text("content1")
+
+        # Create existing manifest with two files (file2 was deleted)
+        digest1 = hashing.Digest("sha256", b"digest1_bytes_here")
+        digest2 = hashing.Digest("sha256", b"digest2_bytes_here")
+
+        item1 = manifest.FileManifestItem(
+            path=pathlib.PurePath("file1.txt"), digest=digest1
+        )
+        item2 = manifest.FileManifestItem(
+            path=pathlib.PurePath("file2.txt"), digest=digest2
+        )
+
+        existing_manifest = manifest.Manifest(
+            "model", [item1, item2], manifest._FileSerialization("sha256")
+        )
+
+        # Create incremental serializer
+        def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher:
+            return io_hashing.SimpleFileHasher(path, memory.SHA256())
+
+        serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+
+        # Serialize the model incrementally
+        new_manifest = serializer.serialize(model_dir)
+
+        # Verify only file1 is in the manifest
+        descriptors = list(new_manifest.resource_descriptors())
+        assert len(descriptors) == 1
+        assert descriptors[0].identifier == "file1.txt"
+        assert descriptors[0].digest.digest_value == b"digest1_bytes_here"
+
+    def test_empty_existing_manifest_hashes_all(self, tmp_path):
+        """With an empty existing manifest, all files should be hashed."""
+        # Create a model with files
+        model_dir = tmp_path / "model"
+        model_dir.mkdir()
+        (model_dir / "file1.txt").write_text("content1")
+        (model_dir / "file2.txt").write_text("content2")
+
+        # Create empty existing manifest
+        existing_manifest = manifest.Manifest(
+            "model", [], manifest._FileSerialization("sha256")
+        )
+
+        # Create incremental serializer
+        def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher:
+            return io_hashing.SimpleFileHasher(path, memory.SHA256())
+
+        serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+
+        # Serialize the model incrementally
+        new_manifest = serializer.serialize(model_dir)
+
+        # Verify both files are hashed
+        descriptors = list(new_manifest.resource_descriptors())
+        assert len(descriptors) == 2
+
+        # Both should have real hashes (not fake digests)
+        for desc in descriptors:
+            assert desc.digest.algorithm == "sha256"
+            assert len(desc.digest.digest_value) == 32  # SHA256 is 32 bytes
+
+    def test_modified_file_with_files_to_hash_parameter(self, tmp_path):
+        """Test file is re-hashed when modified and in files_to_hash."""
+        # Create a model with two files
+        model_dir = tmp_path / "model"
+        model_dir.mkdir()
+        (model_dir / "file1.txt").write_text("content1")
+        (model_dir / "README.md").write_text("old readme")
+
+        # Create existing manifest with both files
+        digest1 = hashing.Digest("sha256", b"digest1_bytes_here")
+        digest_readme_old = hashing.Digest("sha256", b"old_readme_digest")
+
+        item1 = manifest.FileManifestItem(
+            path=pathlib.PurePath("file1.txt"), digest=digest1
+        )
+        item_readme = manifest.FileManifestItem(
+            path=pathlib.PurePath("README.md"), digest=digest_readme_old
+        )
+
+        existing_manifest = manifest.Manifest(
+            "model", [item1, item_readme], manifest._FileSerialization("sha256")
+        )
+
+        # User modifies README.md
+        (model_dir / "README.md").write_text("new readme content")
+
+        # Create incremental serializer
+        def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher:
+            return io_hashing.SimpleFileHasher(path, memory.SHA256())
+
+        serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+
+        # Serialize with files_to_hash specifying the changed file
+        new_manifest = serializer.serialize(
+            model_dir,
+            files_to_hash=[model_dir / "README.md"],  # Only this file changed
+        )
+
+        # Verify we have both files
+        descriptors = list(new_manifest.resource_descriptors())
+        assert len(descriptors) == 2
+
+        # file1.txt should have reused digest
+        file1_desc = next(d for d in descriptors if d.identifier == "file1.txt")
+        assert file1_desc.digest.digest_value == b"digest1_bytes_here"
+
+        # README.md should have a NEW hash (not the old one)
+        readme_desc = next(
+            d for d in descriptors if d.identifier == "README.md"
+        )
+        assert readme_desc.digest.digest_value != b"old_readme_digest"
+        assert readme_desc.digest.algorithm == "sha256"
+        assert len(readme_desc.digest.digest_value) == 32  # Real SHA256
+
+    def test_deleted_file_in_files_to_hash_is_handled(self, tmp_path):
+        """When a deleted file is in files_to_hash, it's safely ignored."""
+        # Create a model with files
+        model_dir = tmp_path / "model"
+        model_dir.mkdir()
+        (model_dir / "README.md").write_text("readme")
+        (model_dir / "weights.bin").write_text("weights")
+
+        # Create existing manifest with three files
+        digest_readme = hashing.Digest("sha256", b"readme_digest")
+        digest_old = hashing.Digest("sha256", b"old_file_digest")
+        digest_weights = hashing.Digest("sha256", b"weights_digest")
+
+        item_readme = manifest.FileManifestItem(
+            path=pathlib.PurePath("README.md"), digest=digest_readme
+        )
+        item_old = manifest.FileManifestItem(
+            path=pathlib.PurePath("old_file.txt"), digest=digest_old
+        )
+        item_weights = manifest.FileManifestItem(
+            path=pathlib.PurePath("weights.bin"), digest=digest_weights
+        )
+
+        existing_manifest = manifest.Manifest(
+            "model",
+            [item_readme, item_old, item_weights],
+            manifest._FileSerialization("sha256"),
+        )
+
+        # Create incremental serializer
+        def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher:
+            return io_hashing.SimpleFileHasher(path, memory.SHA256())
+
+        serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+
+        # User specifies old_file.txt in files_to_hash (as git diff might)
+        # even though the file was deleted
+        deleted_file = model_dir / "old_file.txt"
+        new_manifest = serializer.serialize(
+            model_dir,
+            files_to_hash=[deleted_file],  # Deleted file in the list
+        )
+
+        # Verify deleted file is NOT in new manifest
+        descriptors = list(new_manifest.resource_descriptors())
+        assert len(descriptors) == 2
+
+        identifiers = [d.identifier for d in descriptors]
+        assert "README.md" in identifiers
+        assert "weights.bin" in identifiers
+        assert "old_file.txt" not in identifiers  # Deleted file is gone
+
+        # Other files should have reused digests
+        readme_desc = next(
+            d for d in descriptors if d.identifier == "README.md"
+        )
+        assert readme_desc.digest.digest_value == b"readme_digest"
+
+        weights_desc = next(
+            d for d in descriptors if d.identifier == "weights.bin"
+        )
+        assert weights_desc.digest.digest_value == b"weights_digest"
+
+    def test_mixed_changes_with_files_to_hash(self, tmp_path):
+        """Test realistic scenario: modify, add, delete files together."""
+        # Initial state: three files
+        model_dir = tmp_path / "model"
+        model_dir.mkdir()
+        (model_dir / "README.md").write_text("old readme")
+        (model_dir / "weights.bin").write_text("weights")
+        (model_dir / "new_config.json").write_text("new config")
+
+        # Old manifest has README.md, old_file.txt, weights.bin
+        digest_readme_old = hashing.Digest("sha256", b"old_readme_digest")
+        digest_old_file = hashing.Digest("sha256", b"old_file_digest")
+        digest_weights = hashing.Digest("sha256", b"weights_digest")
+
+        item_readme = manifest.FileManifestItem(
+            path=pathlib.PurePath("README.md"), digest=digest_readme_old
+        )
+        item_old = manifest.FileManifestItem(
+            path=pathlib.PurePath("old_file.txt"), digest=digest_old_file
+        )
+        item_weights = manifest.FileManifestItem(
+            path=pathlib.PurePath("weights.bin"), digest=digest_weights
+        )
+
+        existing_manifest = manifest.Manifest(
+            "model",
+            [item_readme, item_old, item_weights],
+            manifest._FileSerialization("sha256"),
+        )
+
+        # User makes changes:
+        # - Modifies README.md
+        (model_dir / "README.md").write_text("new readme content")
+        # - Deletes old_file.txt (already not on disk)
+        # - Adds new_config.json (already on disk)
+        # - Leaves weights.bin unchanged
+
+        # Create incremental serializer
+        def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher:
+            return io_hashing.SimpleFileHasher(path, memory.SHA256())
+
+        serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+
+        # Simulate git diff --name-only output
+        files_to_hash = [
+            model_dir / "README.md",  # Modified
+            model_dir / "old_file.txt",  # Deleted
+            model_dir / "new_config.json",  # Added
+        ]
+
+        new_manifest = serializer.serialize(
+            model_dir, files_to_hash=files_to_hash
+        )
+
+        # Verify results
+        descriptors = list(new_manifest.resource_descriptors())
+        assert len(descriptors) == 3
+
+        identifiers = [d.identifier for d in descriptors]
+        assert "README.md" in identifiers  # Modified
+        assert "new_config.json" in identifiers  # Added
+        assert "weights.bin" in identifiers  # Unchanged
+        assert "old_file.txt" not in identifiers  # Deleted
+
+        # README.md should have NEW hash (was modified)
+        readme_desc = next(
+            d for d in descriptors if d.identifier == "README.md"
+        )
+        assert readme_desc.digest.digest_value != b"old_readme_digest"
+        assert len(readme_desc.digest.digest_value) == 32
+
+        # new_config.json should have NEW hash (was added)
+        config_desc = next(
+            d for d in descriptors if d.identifier == "new_config.json"
+        )
+        assert len(config_desc.digest.digest_value) == 32
+
+        # weights.bin should have REUSED hash (unchanged)
+        weights_desc = next(
+            d for d in descriptors if d.identifier == "weights.bin"
+        )
+        assert weights_desc.digest.digest_value == b"weights_digest"
diff --git a/tests/manifest_test.py b/tests/manifest_test.py
index 771e1f01..e81e3577 100644
--- a/tests/manifest_test.py
+++ b/tests/manifest_test.py
@@ -206,3 +206,186 @@ def test_manifest_has_the_correct_resource_descriptors(self):
         assert descriptors[0].digest.digest_value == b"hash1"
         assert descriptors[1].digest.digest_value == b"hash2"
         assert descriptors[2].digest.digest_value == b"hash3"
+
+
+class TestManifestFromSignature:
+    def test_from_signature_rejects_inconsistent_manifest(self, tmp_path):
+        import base64
+        import json
+
+        # Create a Sigstore bundle with inconsistent root digest
+        # The subject digest doesn't match the hash of the resources
+        payload_dict = {
+            "_type": "https://in-toto.io/Statement/v1",
+            "subject": [
+                {
+                    "name": "test_model",
+                    "digest": {
+                        "sha256": (
+                            "0b8a5a8c8e8f1a8b8c8d8e8f2a8b8c8d8e8f3a8b8c8d"
+                            "8e8f4a8b8c8d8e8f5a8b"
+                        )
+                    },
+                }
+            ],
+            "predicateType": "https://model_signing/signature/v1.0",
+            "predicate": {
+                "serialization": {
+                    "method": "files",
+                    "hash_type": "sha256",
+                    "allow_symlinks": False,
+                    "ignore_paths": [],
+                },
+                "resources": [
+                    {
+                        "name": "file1.txt",
+                        "algorithm": "sha256",
+                        "digest": (
+                            "abcd1234abcd1234abcd1234abcd1234"
+                            "abcd1234abcd1234abcd1234abcd1234"
+                        ),
+                    },
+                    {
+                        "name": "file2.txt",
+                        "algorithm": "sha256",
+                        "digest": (
+                            "5678dcba5678dcba5678dcba5678dcba"
+                            "5678dcba5678dcba5678dcba5678dcba"
+                        ),
+                    },
+                ],
+            },
+        }
+
+        # Create DSSE envelope
+        payload_json = json.dumps(payload_dict)
+        payload_b64 = base64.b64encode(payload_json.encode("utf-8")).decode(
+            "utf-8"
+        )
+
+        bundle_dict = {
+            "mediaType": "application/vnd.dev.sigstore.bundle.v0.3+json",
+            "verificationMaterial": {
+                "publicKey": {"hint": "test"},
+                "tlogEntries": [],
+            },
+            "dsseEnvelope": {
+                "payload": payload_b64,
+                "payloadType": "application/vnd.in-toto+json",
+                "signatures": [{"sig": "fake_signature"}],
+            },
+        }
+
+        # Write to file
+        sig_file = tmp_path / "test.sig"
+        sig_file.write_text(json.dumps(bundle_dict), encoding="utf-8")
+
+        # Verify that inconsistent manifest is rejected
+        with pytest.raises(ValueError, match="Manifest is inconsistent"):
+            manifest.Manifest.from_signature(sig_file)
+
+    def test_from_signature_extracts_valid_manifest(self, tmp_path):
+        import base64
+        import hashlib
+        import json
+
+        # Create valid SHA256 hex digests (64 chars each)
+        digest1_hex = (
+            "abcd1234abcd1234abcd1234abcd1234"
+            "abcd1234abcd1234abcd1234abcd1234"
+        )
+        digest2_hex = (
+            "5678dcba5678dcba5678dcba5678dcba"
+            "5678dcba5678dcba5678dcba5678dcba"
+        )
+
+        digest1_bytes = bytes.fromhex(digest1_hex)
+        digest2_bytes = bytes.fromhex(digest2_hex)
+
+        # Compute root digest (SHA256 of both digests concatenated)
+        hasher = hashlib.sha256()
+        hasher.update(digest1_bytes)
+        hasher.update(digest2_bytes)
+        root_digest = hasher.hexdigest()
+
+        payload_dict = {
+            "_type": "https://in-toto.io/Statement/v1",
+            "subject": [
+                {"name": "test_model", "digest": {"sha256": root_digest}}
+            ],
+            "predicateType": "https://model_signing/signature/v1.0",
+            "predicate": {
+                "serialization": {
+                    "method": "files",
+                    "hash_type": "sha256",
+                    "allow_symlinks": False,
+                    "ignore_paths": [],
+                },
+                "resources": [
+                    {
+                        "name": "file1.txt",
+                        "algorithm": "sha256",
+                        "digest": digest1_hex,
+                    },
+                    {
+                        "name": "file2.txt",
+                        "algorithm": "sha256",
+                        "digest": digest2_hex,
+                    },
+                ],
+            },
+        }
+
+        payload_json = json.dumps(payload_dict)
+        payload_b64 = base64.b64encode(payload_json.encode("utf-8")).decode(
+            "utf-8"
+        )
+
+        bundle_dict = {
+            "mediaType": "application/vnd.dev.sigstore.bundle.v0.3+json",
+            "verificationMaterial": {
+                "publicKey": {"hint": "test"},
+                "tlogEntries": [],
+            },
+            "dsseEnvelope": {
+                "payload": payload_b64,
+                "payloadType": "application/vnd.in-toto+json",
+                "signatures": [{"sig": "fake_signature"}],
+            },
+        }
+
+        sig_file = tmp_path / "test.sig"
+        sig_file.write_text(json.dumps(bundle_dict), encoding="utf-8")
+
+        # Extract manifest
+        extracted_manifest = manifest.Manifest.from_signature(sig_file)
+
+        # Verify the manifest has the correct files
+        descriptors = list(extracted_manifest.resource_descriptors())
+        assert len(descriptors) == 2
+        assert descriptors[0].identifier == "file1.txt"
+        assert descriptors[1].identifier == "file2.txt"
+        assert descriptors[0].digest.digest_hex == digest1_hex
+        assert descriptors[1].digest.digest_hex == digest2_hex
+        assert extracted_manifest.model_name == "test_model"
+
+    def test_from_signature_file_not_found(self, tmp_path):
+        non_existent = tmp_path / "does_not_exist.sig"
+        with pytest.raises(FileNotFoundError):
+            manifest.Manifest.from_signature(non_existent)
+
+    def test_from_signature_invalid_json(self, tmp_path):
+        import json
+
+        sig_file = tmp_path / "invalid.sig"
+        sig_file.write_text("not valid json", encoding="utf-8")
+        with pytest.raises(json.JSONDecodeError):
+            manifest.Manifest.from_signature(sig_file)
+
+    def test_from_signature_missing_envelope(self, tmp_path):
+        sig_file = tmp_path / "missing_envelope.sig"
+        sig_file.write_text("{}", encoding="utf-8")
+        with pytest.raises(
+            ValueError, match="does not contain a DSSE envelope"
+        ):
+            manifest.Manifest.from_signature(sig_file)