Skip to content

Commit 7499367

Browse files
authored
feat(ghcache): diff between manifest and cache (#132)
This pull request implements generating a diff between the manifest and the cache contents, and uses this new functionality to refactor `./data/ghcache.py`, improving on the original implementation.
1 parent d4926f4 commit 7499367

File tree

5 files changed

+500
-125
lines changed

5 files changed

+500
-125
lines changed

data/ghcache.py

Lines changed: 34 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
"""
44
Remote cache management tool for IQB data files.
55
6+
Usage:
7+
uv run python data/ghcache.py scan
8+
69
This tool manages caching of large parquet/JSON files with local SHA256
710
verification. It scans locally generated files and updates the manifest
811
with correct GCS URLs for remote distribution.
@@ -29,72 +32,21 @@
2932
"""
3033

3134
import argparse
32-
import hashlib
3335
import json
3436
import os
35-
import re
36-
import subprocess
3737
import sys
3838
from pathlib import Path
3939

40+
from dacite import from_dict
41+
42+
from iqb.ghremote.cache import Manifest
43+
from iqb.ghremote.diff import DiffState, diff
44+
4045
MANIFEST_PATH = Path("state") / "ghremote" / "manifest.json"
41-
CACHE_DIR = Path("cache") / "v1"
4246
GCS_BUCKET = "mlab-sandbox-iqb-us-central1"
4347
GCS_BASE_URL = f"https://storage.googleapis.com/{GCS_BUCKET}"
4448

4549

46-
def compute_sha256(file_path: Path) -> str:
47-
"""Compute SHA256 hash of a file."""
48-
sha256 = hashlib.sha256()
49-
with open(file_path, "rb") as f:
50-
while chunk := f.read(8192):
51-
sha256.update(chunk)
52-
return sha256.hexdigest()
53-
54-
55-
def validate_cache_path(path: str) -> bool:
56-
"""
57-
Validate that a path follows the cache/v1 format.
58-
59-
Valid format:
60-
cache/v1/{rfc3339_timestamp}/{rfc3339_timestamp}/{name}/{file}
61-
62-
Where:
63-
- Component 1: "cache"
64-
- Component 2: "v1"
65-
- Component 3: RFC3339 timestamp (e.g., 20241001T000000Z)
66-
- Component 4: RFC3339 timestamp
67-
- Component 5: lowercase letters, numbers, and underscores [a-z0-9_]+
68-
- Component 6: "data.parquet" or "stats.json"
69-
"""
70-
parts = path.split("/")
71-
if len(parts) != 6:
72-
return False
73-
74-
# Component 1: cache
75-
if parts[0] != "cache":
76-
return False
77-
78-
# Component 2: v1
79-
if parts[1] != "v1":
80-
return False
81-
82-
# Components 3-4: RFC3339 timestamps (YYYYMMDDTHHMMSSZ format)
83-
rfc3339_pattern = re.compile(r"^\d{8}T\d{6}Z$")
84-
if not rfc3339_pattern.match(parts[2]):
85-
return False
86-
if not rfc3339_pattern.match(parts[3]):
87-
return False
88-
89-
# Component 5: lowercase letters, numbers, and underscores
90-
name_pattern = re.compile(r"^[a-z0-9_]+$")
91-
if not name_pattern.match(parts[4]):
92-
return False
93-
94-
# Component 6: data.parquet or stats.json
95-
return parts[5] in ("data.parquet", "stats.json")
96-
97-
9850
def load_manifest() -> dict:
9951
"""Load manifest from state/ghremote/manifest.json, or return empty if not found."""
10052
if not MANIFEST_PATH.exists():
@@ -112,27 +64,12 @@ def save_manifest(manifest: dict) -> None:
11264
f.write("\n") # Trailing newline
11365

11466

115-
def is_git_ignored(file_path: Path) -> bool:
116-
"""Check if a file is ignored by git."""
117-
try:
118-
result = subprocess.run(
119-
["git", "check-ignore", str(file_path)],
120-
stdout=subprocess.DEVNULL,
121-
stderr=subprocess.DEVNULL,
122-
)
123-
# Exit code 0 means the file is ignored
124-
return result.returncode == 0
125-
except Exception:
126-
# If git isn't available or other error, assume not ignored
127-
return False
128-
129-
13067
def cmd_scan(args) -> int:
13168
"""
13269
Scan command: Scan local files and update manifest.
13370
13471
1. Load or create manifest
135-
2. Scan cache/v1 for git-ignored files
72+
2. Diff manifest against local cache/v1 files
13673
3. For new or changed files:
13774
- Compute SHA256
13875
- Generate GCS URL
@@ -141,68 +78,40 @@ def cmd_scan(args) -> int:
14178
5. Print gcloud storage rsync command for uploading
14279
"""
14380
_ = args
144-
manifest = load_manifest()
145-
files_dict = manifest.setdefault("files", {})
146-
147-
if not CACHE_DIR.exists():
148-
print(f"Cache directory {CACHE_DIR} does not exist.")
149-
return 1
150-
151-
print(f"Scanning {CACHE_DIR} for git-ignored files...")
152-
153-
# Find all files under cache/v1
154-
all_files = list(CACHE_DIR.rglob("*"))
155-
cache_files = [f for f in all_files if f.is_file()]
156-
157-
# Filter to only git-ignored files
158-
ignored_files = [f for f in cache_files if is_git_ignored(f)]
81+
manifest_dict = load_manifest()
82+
files_dict = manifest_dict.setdefault("files", {})
83+
manifest = from_dict(Manifest, manifest_dict)
15984

160-
if not ignored_files:
161-
print("No git-ignored files found.")
162-
return 0
163-
164-
print(f"Found {len(ignored_files)} git-ignored files.")
85+
print("Scanning local cache files...")
16586

16687
updated_count = 0
16788

168-
for file_path in ignored_files:
169-
# Convert to relative path string with forward slashes for cross-platform compatibility
170-
rel_path = file_path.as_posix()
171-
172-
# Validate path format
173-
if not validate_cache_path(rel_path):
174-
print(f"Skipping invalid path format: {rel_path}")
175-
continue
176-
177-
# Compute SHA256
178-
sha256 = compute_sha256(file_path)
179-
180-
# Check if file is already in manifest with same SHA256
181-
existing_entry = files_dict.get(rel_path)
182-
if existing_entry and existing_entry["sha256"] == sha256:
183-
print(f"Already in manifest: {rel_path}")
184-
continue
185-
186-
# File is new or changed
187-
url = f"{GCS_BASE_URL}/{rel_path}"
188-
print(f"New/changed: {rel_path}")
189-
print(f" SHA256: {sha256}")
190-
print(f" URL: {url}")
191-
192-
files_dict[rel_path] = {"sha256": sha256, "url": url}
193-
updated_count += 1
89+
for entry in diff(manifest, Path(".")):
90+
if entry.state == DiffState.MATCHING:
91+
print(f"Already in manifest: {entry.file}")
92+
elif entry.state in (DiffState.ONLY_LOCAL, DiffState.SHA256_MISMATCH):
93+
sha256 = entry.local_sha256
94+
url = f"{GCS_BASE_URL}/{entry.file}"
95+
action = "Changed" if entry.state == DiffState.SHA256_MISMATCH else "New"
96+
print(f"{action}: {entry.file}")
97+
print(f" SHA256: {sha256}")
98+
print(f" URL: {url}")
99+
files_dict[entry.file] = {"sha256": sha256, "url": url}
100+
updated_count += 1
101+
elif entry.state == DiffState.ONLY_REMOTE:
102+
print(f"In manifest but not on disk: {entry.file}")
194103

195104
# Save updated manifest
196-
save_manifest(manifest)
105+
save_manifest(manifest_dict)
197106
print(f"\nManifest updated: {MANIFEST_PATH}")
198107

199108
if updated_count > 0:
200109
print(f"\n{updated_count} file(s) added/updated in manifest.")
201110
print("\nNext steps:")
202111
print("1. Remove zero-length .lock files left over by the pipeline:")
203-
print(f" find data/{CACHE_DIR} -type f -name .lock -delete")
112+
print(" find data/cache/v1 -type f -name .lock -delete")
204113
print("2. Upload files to GCS:")
205-
print(f" gcloud storage rsync -r data/{CACHE_DIR} gs://{GCS_BUCKET}/{CACHE_DIR}")
114+
print(f" gcloud storage rsync -r data/cache/v1 gs://{GCS_BUCKET}/cache/v1")
206115
print(f"3. Commit updated data/{MANIFEST_PATH} to repository")
207116

208117
return 0
@@ -213,7 +122,10 @@ def main() -> int:
213122
script_dir = Path(__file__).resolve().parent
214123
os.chdir(script_dir)
215124

216-
parser = argparse.ArgumentParser(description="Remote cache management tool for IQB data files")
125+
parser = argparse.ArgumentParser(
126+
description="Remote cache management tool for IQB data files. "
127+
"Run with: uv run python data/ghcache.py <command>",
128+
)
217129
subparsers = parser.add_subparsers(dest="command", help="Subcommand to run")
218130

219131
# Scan subcommand

library/src/iqb/ghremote/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,15 @@
2424
"""
2525

2626
from .cache import IQBRemoteCache
27+
from .diff import DiffEntry, DiffState, diff
2728

2829
# Backward compatibility alias
2930
IQBGitHubRemoteCache = IQBRemoteCache
3031

3132
__all__ = [
33+
"DiffEntry",
34+
"DiffState",
3235
"IQBGitHubRemoteCache",
3336
"IQBRemoteCache",
37+
"diff",
3438
]

library/src/iqb/ghremote/cache.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def _sync_file_entry(entry: FileEntry, dest_path: Path):
137137
"""Sync the given FileEntry with the remotely cached file."""
138138
# Determine whether we need to download again
139139
exists = dest_path.exists()
140-
if not exists or entry.sha256 != _compute_sha256(dest_path):
140+
if not exists or entry.sha256 != compute_sha256(dest_path):
141141
dest_path.parent.mkdir(parents=True, exist_ok=True)
142142
# Operate inside a temporary directory in the destination directory so
143143
# `os.replace()` is atomic and we avoid cross-filesystem moves.
@@ -183,13 +183,13 @@ def _sync_file_entry_tmp(entry: FileEntry, tmp_file: Path):
183183

184184
# Make sure the sha256 matches
185185
log.info("validating %s... start", entry)
186-
sha256 = _compute_sha256(tmp_file)
186+
sha256 = compute_sha256(tmp_file)
187187
if sha256 != entry.sha256:
188188
raise ValueError(f"SHA256 mismatch: expected {entry.sha256}, got {sha256}")
189189
log.info("validating %s... ok", entry)
190190

191191

192-
def _compute_sha256(path: Path) -> str:
192+
def compute_sha256(path: Path) -> str:
193193
"""Compute SHA256 hash of a file."""
194194
sha256 = hashlib.sha256()
195195
with open(path, "rb") as fp:

0 commit comments

Comments
 (0)