|
| 1 | +# Copyright 2024 The Sigstore Authors |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +"""Incremental model serializer for selective file re-hashing. |
| 16 | +
|
| 17 | +This module provides a serializer that can reuse digests from an existing |
| 18 | +manifest, only re-hashing files that have changed. This is useful for large |
| 19 | +models where only a small subset of files change between signings. |
| 20 | +""" |
| 21 | + |
| 22 | +from collections.abc import Callable, Iterable |
| 23 | +import concurrent.futures |
| 24 | +import itertools |
| 25 | +import os |
| 26 | +import pathlib |
| 27 | +from typing import Optional |
| 28 | + |
| 29 | +from typing_extensions import override |
| 30 | + |
| 31 | +from model_signing import manifest |
| 32 | +from model_signing._hashing import io |
| 33 | +from model_signing._serialization import serialization |
| 34 | + |
| 35 | + |
class IncrementalSerializer(serialization.Serializer):
    """Model serializer that only re-hashes changed files.

    This serializer compares the current model state against an existing
    manifest (from a previous signature) and only re-hashes files that:
    - Are new (not in the existing manifest)
    - Are explicitly requested via the `files_to_hash` parameter

    Files that exist in both the current model and the existing manifest
    (and are not explicitly flagged) have their digests reused from the
    existing manifest without re-hashing.

    This provides significant performance improvements for large models where
    only a small number of files change between signings (e.g., updating
    documentation in a 200GB model).
    """

    def __init__(
        self,
        file_hasher_factory: Callable[[pathlib.Path], io.FileHasher],
        existing_manifest: manifest.Manifest,
        *,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ):
        """Initializes an incremental serializer.

        Args:
            file_hasher_factory: A callable to build the hash engine used to
                hash individual files.
            existing_manifest: The manifest from a previous signature. Digests
                from this manifest will be reused for unchanged files.
            max_workers: Maximum number of workers to use in parallel. Default
                is to defer to the `concurrent.futures` library.
            allow_symlinks: Controls whether symbolic links are included. If a
                symlink is present but the flag is `False` (default) the
                serialization would raise an error.
            ignore_paths: The paths of files to ignore.
        """
        self._hasher_factory = file_hasher_factory
        self._existing_manifest = existing_manifest
        self._max_workers = max_workers
        self._allow_symlinks = allow_symlinks
        # Materialize to a frozenset so a one-shot iterable (e.g. a
        # generator) passed by the caller cannot be silently exhausted on
        # first use.
        self._ignore_paths = frozenset(ignore_paths)

        # Build lookup dictionary: relative file path -> manifest item key.
        # NOTE(review): this reaches into the manifest's private
        # `_item_to_digest` mapping; confirm there is no public accessor.
        self._existing_items = {}
        for item in existing_manifest._item_to_digest:
            # Only whole-file items are supported for reuse; shard-level
            # keys are skipped and their files will be re-hashed as "new".
            if isinstance(item, manifest._File):
                self._existing_items[item.path] = item

        # Precompute the serialization description and the digest method
        # name once, so `serialize` does not rebuild a hasher on every call.
        hasher = file_hasher_factory(pathlib.Path())
        self._digest_name = hasher.digest_name
        self._serialization_description = manifest._FileSerialization(
            self._digest_name, self._allow_symlinks, self._ignore_paths
        )
        # blake3 parallelizes internally, so hashing files on multiple
        # threads at once would oversubscribe the CPU.
        self._is_blake3 = self._digest_name == "blake3"

    @override
    def serialize(
        self,
        model_path: pathlib.Path,
        *,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
        files_to_hash: Optional[Iterable[pathlib.Path]] = None,
    ) -> manifest.Manifest:
        """Serializes the model, only re-hashing changed/new files.

        Args:
            model_path: The path to the model.
            ignore_paths: The paths to ignore during serialization. If a
                provided path is a directory, all children of the directory
                are ignored.
            files_to_hash: Optional list of files that may have changed and
                should be re-hashed. If None, all files in the model
                directory are scanned, and only NEW files (not in the
                existing manifest) are hashed. Existing files have their
                digests reused.

                To detect changed files, use git diff or similar:
                    changed_files = subprocess.check_output(
                        ['git', 'diff', '--name-only', 'HEAD']
                    ).decode().splitlines()
                    files_to_hash = [model_path / f for f in changed_files]

        Returns:
            The model's serialized manifest with a mix of reused and
            newly-computed digests.

        Raises:
            ValueError: The model contains a symbolic link, but the
                serializer was not initialized with `allow_symlinks=True`.
        """
        # Relative paths of files explicitly flagged as (potentially)
        # changed. Files named in `files_to_hash` that no longer exist are
        # skipped: deleted files simply drop out of the new manifest.
        rehash_paths = set()
        if files_to_hash is not None:
            for path in files_to_hash:
                if path.is_file():
                    rehash_paths.add(path.relative_to(model_path))

        # Scan the model directory for all current files, honoring the
        # ignore list and the symlink policy.
        all_current_files = []
        for path in itertools.chain((model_path,), model_path.glob("**/*")):
            if serialization.should_ignore(path, ignore_paths):
                continue
            serialization.check_file_or_directory(
                path, allow_symlinks=self._allow_symlinks
            )
            if path.is_file():
                all_current_files.append(path)

        # Partition files: reuse the old digest when possible, otherwise
        # queue the file for re-hashing. A file needs re-hashing iff it is
        # absent from the existing manifest or explicitly flagged above.
        files_to_rehash = []
        manifest_items = []
        for path in all_current_files:
            relative_path = path.relative_to(model_path)
            # Manifest keys are stored as POSIX paths.
            posix_path = pathlib.PurePosixPath(relative_path)

            if (
                posix_path not in self._existing_items
                or relative_path in rehash_paths
            ):
                files_to_rehash.append(path)
            else:
                old_item_key = self._existing_items[posix_path]
                old_digest = self._existing_manifest._item_to_digest[
                    old_item_key
                ]
                manifest_items.append(
                    manifest.FileManifestItem(
                        path=relative_path, digest=old_digest
                    )
                )

        # Hash all files that need re-hashing in parallel. blake3 is forced
        # to a single worker because it multi-threads internally.
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=1 if self._is_blake3 else self._max_workers
        ) as tpe:
            futures = [
                tpe.submit(self._compute_hash, model_path, path)
                for path in files_to_rehash
            ]
            for future in concurrent.futures.as_completed(futures):
                manifest_items.append(future.result())

        # Fold per-call ignore paths into the serialization description
        # WITHOUT mutating the serializer: overwriting
        # `self._serialization_description` here (as the previous version
        # did) leaked one call's ignore_paths into all subsequent calls.
        serialization_description = self._serialization_description
        if ignore_paths:
            rel_ignore_paths = []
            for p in ignore_paths:
                rp = os.path.relpath(p, model_path)
                # Paths outside the model directory are not recorded.
                # NOTE(review): "../" assumes POSIX separators — confirm
                # this matches the other serializers on Windows.
                if not rp.startswith("../"):
                    rel_ignore_paths.append(pathlib.Path(rp))
            serialization_description = manifest._FileSerialization(
                self._digest_name,
                self._allow_symlinks,
                frozenset(list(self._ignore_paths) + rel_ignore_paths),
            )

        # `Path.name` is empty for paths like "." or with a trailing slash;
        # fall back to the resolved directory name.
        model_name = model_path.name
        if not model_name or model_name == "..":
            model_name = os.path.basename(model_path.resolve())

        return manifest.Manifest(
            model_name, manifest_items, serialization_description
        )

    def _compute_hash(
        self, model_path: pathlib.Path, path: pathlib.Path
    ) -> manifest.FileManifestItem:
        """Produces the manifest item of the file given by `path`.

        Args:
            model_path: The path to the model.
            path: Path to the file in the model, that is currently
                transformed to a manifest item.

        Returns:
            The itemized manifest.
        """
        relative_path = path.relative_to(model_path)
        digest = self._hasher_factory(path).compute()
        return manifest.FileManifestItem(path=relative_path, digest=digest)
0 commit comments