Skip to content

Commit 77f3d2e

Browse files
author
Emrick Donadei
committed
Add IncrementalSerializer for selective file re-hashing
Implements the core incremental hashing logic that compares the current model state against an existing manifest and only re-hashes changed files.

Key features:
- Reuses digests for unchanged files from previous manifest
- Hashes new files not in the previous signature
- Handles modified files via files_to_hash parameter
- Handles file deletions automatically (omits them from new manifest)
- Uses same parallel hashing as standard file serializer

The algorithm:
1. Scan current model directory for all files
2. Build set of files to rehash from files_to_hash parameter
3. For each current file:
   - If not in old manifest: hash it (new file)
   - If in files_to_hash list: hash it (modified file)
   - Otherwise: reuse digest from old manifest (unchanged)
4. Deleted files are automatically excluded (not on disk)
5. Return manifest with mix of reused and new digests

Usage for incremental signing (e.g., 500GB model, 1KB README changed):

    # Get changed files from git
    changed = subprocess.check_output(['git', 'diff', '--name-only', 'HEAD'])
    files_to_hash = [model_path / f for f in changed.decode().split()]
    # Only re-hash the changed file(s)
    serializer.serialize(model_path, files_to_hash=files_to_hash)

This provides significant performance improvements - only re-hashing the changed 1KB instead of all 500GB.

Includes comprehensive tests covering:
- No changes: all digests reused
- New file added: only new file hashed
- Modified file: only modified file re-hashed
- File deleted (auto): removed from manifest
- File deleted (in files_to_hash): safely ignored
- Mixed changes: all scenarios working together

Related to issue #160 - API for incremental model re-hashing
1 parent f22bcad commit 77f3d2e

File tree

2 files changed

+633
-0
lines changed

2 files changed

+633
-0
lines changed
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
# Copyright 2024 The Sigstore Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Incremental model serializer for selective file re-hashing.
16+
17+
This module provides a serializer that can reuse digests from an existing
18+
manifest, only re-hashing files that have changed. This is useful for large
19+
models where only a small subset of files change between signings.
20+
"""
21+
22+
from collections.abc import Callable, Iterable
23+
import concurrent.futures
24+
import itertools
25+
import os
26+
import pathlib
27+
from typing import Optional
28+
29+
from typing_extensions import override
30+
31+
from model_signing import manifest
32+
from model_signing._hashing import io
33+
from model_signing._serialization import serialization
34+
35+
36+
class IncrementalSerializer(serialization.Serializer):
    """Model serializer that only re-hashes new or explicitly listed files.

    This serializer compares the files currently on disk against an existing
    manifest (from a previous signature) and only hashes files that:
      - Are new (not present in the existing manifest), or
      - Are explicitly listed in the `files_to_hash` argument of `serialize`.

    Every other file that is present both on disk and in the existing
    manifest has its digest copied from the existing manifest without being
    read. No content or size comparison is performed: a file that was
    modified but is not listed in `files_to_hash` keeps its old (stale)
    digest. Callers are responsible for listing every modified file.

    Files that are in the existing manifest but no longer on disk are simply
    absent from the new manifest.

    This provides significant performance improvements for large models where
    only a small number of files change between signings (e.g., updating
    documentation in a 200GB model).

    NOTE(review): reused digests are copied verbatim. Nothing here checks
    that the existing manifest was produced with the same hash algorithm as
    `file_hasher_factory` -- confirm that callers guarantee this.
    """

    def __init__(
        self,
        file_hasher_factory: Callable[[pathlib.Path], io.FileHasher],
        existing_manifest: manifest.Manifest,
        *,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ):
        """Initializes an incremental serializer.

        Args:
            file_hasher_factory: A callable to build the hash engine used to
              hash individual files.
            existing_manifest: The manifest from a previous signature. Digests
              from this manifest will be reused for files that are neither
              new nor explicitly marked for re-hashing.
            max_workers: Maximum number of workers to use in parallel. Default
              is to defer to the `concurrent.futures` library.
            allow_symlinks: Controls whether symbolic links are included. If a
              symlink is present but the flag is `False` (default) the
              serialization would raise an error.
            ignore_paths: The paths of files to ignore. These are recorded in
              the serialization description of the produced manifest.
        """
        self._hasher_factory = file_hasher_factory
        self._existing_manifest = existing_manifest
        self._max_workers = max_workers
        self._allow_symlinks = allow_symlinks
        # Materialize once: the argument is only required to be an Iterable
        # (it could be a generator) but it is consumed here and again on
        # every `serialize` call that passes extra ignore paths.
        self._ignore_paths = tuple(ignore_paths)

        # Lookup table: file path -> manifest key. Only file entries are
        # supported; any other kind of key (e.g. shards) is skipped.
        self._existing_items = {
            item.path: item
            for item in existing_manifest._item_to_digest
            if isinstance(item, manifest._File)
        }

        # Precompute the default serialization description. The hasher is
        # built on a dummy path only to learn the digest name.
        self._digest_name = file_hasher_factory(pathlib.Path()).digest_name
        self._serialization_description = manifest._FileSerialization(
            self._digest_name, self._allow_symlinks, self._ignore_paths
        )
        self._is_blake3 = self._digest_name == "blake3"

    @override
    def serialize(
        self,
        model_path: pathlib.Path,
        *,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
        files_to_hash: Optional[Iterable[pathlib.Path]] = None,
    ) -> manifest.Manifest:
        """Serializes the model, only hashing new or explicitly listed files.

        Args:
            model_path: The path to the model.
            ignore_paths: The paths to ignore during serialization. If a
              provided path is a directory, all children of the directory are
              ignored.
            files_to_hash: Optional collection of files that may have changed
              and must be re-hashed. Entries that are not existing regular
              files (e.g. deleted files) are skipped. Entries must be located
              under `model_path`, expressed the same way as `model_path`
              (both relative or both absolute). If None or empty, only files
              that are not in the existing manifest are hashed; every other
              file has its digest reused.

              To detect changed files, use git diff or similar:
                  changed_files = subprocess.check_output(
                      ['git', 'diff', '--name-only', 'HEAD']
                  ).decode().splitlines()
                  files_to_hash = [model_path / f for f in changed_files]

        Returns:
            The model's serialized manifest with a mix of reused and
            newly-computed digests.

        Raises:
            ValueError: The model contains a symbolic link, but the serializer
              was not initialized with `allow_symlinks=True`. Also raised if
              an existing file in `files_to_hash` is not located under
              `model_path`.
        """
        # Materialize once: the ignore list is consulted for every scanned
        # path and iterated again when building the description below.
        ignore_paths = tuple(ignore_paths)

        # Model-relative paths of the files the caller marked as changed.
        rehash_paths = set()
        if files_to_hash is not None:
            rehash_paths = {
                path.relative_to(model_path)
                for path in files_to_hash
                if path.is_file()
            }

        # Scan the model (the root itself plus everything below it) to find
        # all current files. The whole tree is validated before any hashing.
        current_files = []
        for path in itertools.chain((model_path,), model_path.glob("**/*")):
            if serialization.should_ignore(path, ignore_paths):
                continue
            serialization.check_file_or_directory(
                path, allow_symlinks=self._allow_symlinks
            )
            if path.is_file():
                current_files.append(path)

        # Split current files into "hash now" and "reuse old digest".
        files_to_rehash = []
        manifest_items = []
        for path in current_files:
            relative_path = path.relative_to(model_path)
            old_item_key = self._existing_items.get(
                pathlib.PurePosixPath(relative_path)
            )
            if old_item_key is None or relative_path in rehash_paths:
                # New file, or explicitly marked as changed.
                files_to_rehash.append(path)
                continue
            old_digest = self._existing_manifest._item_to_digest[old_item_key]
            manifest_items.append(
                manifest.FileManifestItem(
                    path=relative_path, digest=old_digest
                )
            )

        # Hash all files that need (re-)hashing in parallel. A single worker
        # is used for blake3 (presumably because that hasher parallelizes
        # internally -- TODO confirm).
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=1 if self._is_blake3 else self._max_workers
        ) as tpe:
            futures = [
                tpe.submit(self._compute_hash, model_path, path)
                for path in files_to_rehash
            ]
            for future in concurrent.futures.as_completed(futures):
                manifest_items.append(future.result())

        # Per-call ignore paths extend the description of this manifest only.
        # A local is used so they do not leak into later `serialize` calls.
        description = self._serialization_description
        if ignore_paths:
            outside_prefix = os.pardir + os.sep
            rel_ignore_paths = []
            for p in ignore_paths:
                rp = os.path.relpath(p, model_path)
                # Paths that resolve outside of the model are not recorded.
                if rp != os.pardir and not rp.startswith(outside_prefix):
                    rel_ignore_paths.append(pathlib.Path(rp))

            description = manifest._FileSerialization(
                self._digest_name,
                self._allow_symlinks,
                frozenset(list(self._ignore_paths) + rel_ignore_paths),
            )

        # `model_path.name` is empty for "." and is ".." for a parent
        # reference; resolve the path to obtain a real directory name.
        model_name = model_path.name
        if not model_name or model_name == "..":
            model_name = os.path.basename(model_path.resolve())

        return manifest.Manifest(model_name, manifest_items, description)

    def _compute_hash(
        self, model_path: pathlib.Path, path: pathlib.Path
    ) -> manifest.FileManifestItem:
        """Produces the manifest item of the file given by `path`.

        Args:
            model_path: The path to the model.
            path: Path to the file in the model, that is currently transformed
              to a manifest item.

        Returns:
            The manifest item pairing the file's path, relative to
            `model_path`, with its freshly computed digest.
        """
        relative_path = path.relative_to(model_path)
        digest = self._hasher_factory(path).compute()
        return manifest.FileManifestItem(path=relative_path, digest=digest)

0 commit comments

Comments
 (0)