"""
Remote cache management tool for IQB data files.

Usage:
    uv run python data/ghcache.py scan

This tool manages caching of large parquet/JSON files with local SHA256
verification. It scans locally generated files and updates the manifest
with correct GCS URLs for remote distribution.
"""
3033
3134import argparse
32- import hashlib
3335import json
3436import os
35- import re
36- import subprocess
3737import sys
3838from pathlib import Path
3939
40+ from dacite import from_dict
41+
42+ from iqb .ghremote .cache import Manifest
43+ from iqb .ghremote .diff import DiffState , diff
44+
# Manifest of cached files (path -> {sha256, url}); committed to the repository.
MANIFEST_PATH = Path("state") / "ghremote" / "manifest.json"
# GCS bucket mirroring the local cache/v1 tree.
GCS_BUCKET = "mlab-sandbox-iqb-us-central1"
# Public HTTPS base URL for objects stored in GCS_BUCKET.
GCS_BASE_URL = f"https://storage.googleapis.com/{GCS_BUCKET}"
4448
4549
def compute_sha256(file_path: Path) -> str:
    """Return the hex-encoded SHA256 digest of the file at *file_path*.

    The file is consumed in fixed-size chunks so arbitrarily large files
    can be hashed without loading them fully into memory.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        for chunk in iter(lambda: stream.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()
54-
def validate_cache_path(path: str) -> bool:
    """
    Validate that a path follows the cache/v1 layout.

    Valid format:
        cache/v1/{timestamp}/{timestamp}/{name}/{file}

    Where:
    - Component 1: "cache"
    - Component 2: "v1"
    - Components 3-4: compact (basic) ISO 8601 UTC timestamps, e.g.
      20241001T000000Z. NOTE: previously documented as "RFC3339", but
      RFC 3339 requires separators (2024-10-01T00:00:00Z); the accepted
      form here has none.
    - Component 5: lowercase letters, digits, and underscores ([a-z0-9_]+)
    - Component 6: "data.parquet" or "stats.json"

    Returns True only when all six components match; any other shape
    (wrong depth, wrong literals, malformed timestamp or name) is rejected.
    """
    parts = path.split("/")
    if len(parts) != 6:
        return False

    # Components 1-2: fixed literal prefix.
    if parts[0] != "cache" or parts[1] != "v1":
        return False

    # Components 3-4: compact UTC timestamps (YYYYMMDDTHHMMSSZ).
    timestamp_pattern = re.compile(r"\d{8}T\d{6}Z")
    if not (timestamp_pattern.fullmatch(parts[2]) and timestamp_pattern.fullmatch(parts[3])):
        return False

    # Component 5: snake_case dataset name.
    if not re.fullmatch(r"[a-z0-9_]+", parts[4]):
        return False

    # Component 6: only the two known payload filenames are cached.
    return parts[5] in ("data.parquet", "stats.json")
96-
97-
9850def load_manifest () -> dict :
9951 """Load manifest from state/ghremote/manifest.json, or return empty if not found."""
10052 if not MANIFEST_PATH .exists ():
@@ -112,27 +64,12 @@ def save_manifest(manifest: dict) -> None:
11264 f .write ("\n " ) # Trailing newline
11365
11466
def is_git_ignored(file_path: Path) -> bool:
    """Report whether git considers *file_path* ignored.

    Returns False when git is unavailable or the check fails for any
    reason, so callers degrade gracefully outside a git checkout.
    """
    command = ["git", "check-ignore", str(file_path)]
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        # git missing entirely (or any other launch failure): assume tracked.
        return False
    # git check-ignore exits 0 only when the path matches an ignore rule.
    return completed.returncode == 0
128-
129-
def cmd_scan(args: argparse.Namespace) -> int:
    """
    Scan command: Scan local files and update manifest.

    1. Load or create manifest
    2. Diff manifest against local cache/v1 files
    3. For new or changed files:
       - Compute SHA256
       - Generate GCS URL
       - Update the manifest entry
    4. Save the updated manifest
    5. Print gcloud storage rsync command for uploading

    Returns 0 (the process exit code); `args` is accepted for a uniform
    subcommand signature but is otherwise unused.
    """
    _ = args
    # Keep both views of the manifest: the raw dict (mutated and saved back)
    # and the typed Manifest used by diff().
    manifest_dict = load_manifest()
    files_dict = manifest_dict.setdefault("files", {})
    manifest = from_dict(Manifest, manifest_dict)

    print("Scanning local cache files...")

    updated_count = 0

    # diff() compares manifest entries against files under the current
    # directory; each entry carries a state describing the discrepancy.
    for entry in diff(manifest, Path(".")):
        if entry.state == DiffState.MATCHING:
            print(f"Already in manifest: {entry.file}")
        elif entry.state in (DiffState.ONLY_LOCAL, DiffState.SHA256_MISMATCH):
            # New or changed file: record its local hash and public GCS URL.
            sha256 = entry.local_sha256
            url = f"{GCS_BASE_URL}/{entry.file}"
            action = "Changed" if entry.state == DiffState.SHA256_MISMATCH else "New"
            print(f"{action}: {entry.file}")
            print(f"  SHA256: {sha256}")
            print(f"  URL: {url}")
            files_dict[entry.file] = {"sha256": sha256, "url": url}
            updated_count += 1
        elif entry.state == DiffState.ONLY_REMOTE:
            # In the manifest but missing locally: report, do not remove.
            print(f"In manifest but not on disk: {entry.file}")

    # Save updated manifest
    save_manifest(manifest_dict)
    print(f"\nManifest updated: {MANIFEST_PATH}")

    if updated_count > 0:
        print(f"\n{updated_count} file(s) added/updated in manifest.")
        print("\nNext steps:")
        print("1. Remove zero-length .lock files left over by the pipeline:")
        print("   find data/cache/v1 -type f -name .lock -delete")
        print("2. Upload files to GCS:")
        print(f"   gcloud storage rsync -r data/cache/v1 gs://{GCS_BUCKET}/cache/v1")
        print(f"3. Commit updated data/{MANIFEST_PATH} to repository")

    return 0
@@ -213,7 +122,10 @@ def main() -> int:
213122 script_dir = Path (__file__ ).resolve ().parent
214123 os .chdir (script_dir )
215124
216- parser = argparse .ArgumentParser (description = "Remote cache management tool for IQB data files" )
125+ parser = argparse .ArgumentParser (
126+ description = "Remote cache management tool for IQB data files. "
127+ "Run with: uv run python data/ghcache.py <command>" ,
128+ )
217129 subparsers = parser .add_subparsers (dest = "command" , help = "Subcommand to run" )
218130
219131 # Scan subcommand
0 commit comments