1. **Memory bandwidth matters for sparse operations**: The P100 (not available on Modal) outperforms the T4 by ~2x on Kaggle due to its HBM2 memory (~732 GB/s) vs GDDR6 (~320 GB/s).
2. **Significant overhead at low epochs**: With only 200 epochs, much of the runtime is fixed overhead:
   - Git clone and `uv sync` (~2-3 min)
   - HuggingFace data download (~1 min)
   - Loading the Microsimulation and building the sparse matrix (~3-4 min, CPU-bound)
3. **GPU choice depends on epoch count**:
   - **< 500 epochs**: use the T4 (cheapest; overhead dominates)
   - **500-2000 epochs**: the A100-40GB may break even
   - **> 2000 epochs**: the A100 is likely more cost-effective, as training dominates
4. **Available Modal GPUs** (memory bandwidth and price):
   - T4: 320 GB/s, $0.000164/sec
   - L4: 300 GB/s, $0.000222/sec
   - A10: 600 GB/s, $0.000306/sec
   - L40S: 864 GB/s, $0.000542/sec
   - A100-40GB: 1,555 GB/s, $0.000583/sec
   - A100-80GB: 2,039 GB/s, $0.000694/sec
   - H100: 3,350 GB/s, $0.001097/sec
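The epoch-count guidance above can be sketched as a small cost model. The per-second prices come from the GPU table; `OVERHEAD_SEC` and the per-epoch timings are hypothetical assumptions for illustration, not measurements:

```python
# Illustrative cost model for the break-even reasoning above.
# Prices ($/sec) come from the GPU table; the fixed overhead and
# per-epoch timings are assumed values, not benchmarks.

OVERHEAD_SEC = 7 * 60  # ~2-3 min setup + ~1 min download + ~3-4 min matrix build

# gpu -> (price_per_sec, assumed_sec_per_epoch)
GPUS = {
    "T4": (0.000164, 2.0),         # assumed epoch time
    "A100-40GB": (0.000583, 0.3),  # assumed ~6-7x faster per epoch
}

def run_cost(gpu: str, epochs: int) -> float:
    """Total cost of one run: fixed overhead plus training time."""
    price, sec_per_epoch = GPUS[gpu]
    return price * (OVERHEAD_SEC + epochs * sec_per_epoch)

for epochs in (200, 500, 2000, 5000):
    cheapest = min(GPUS, key=lambda g: run_cost(g, epochs))
    print(f"{epochs:>5} epochs -> {cheapest} (${run_cost(cheapest, epochs):.3f})")
```

With these assumed timings the T4 wins at low epoch counts and the A100-40GB overtakes it somewhere past ~1,000 epochs, consistent with the ranges listed above.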
### Output
Weights are saved locally to `calibration_weights.npy` (configurable via the `--output` flag).
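A downstream consumer can load the saved weights back with `np.load`. This round-trip sketch uses a random stand-in array rather than real calibration weights, and writes to a temp directory instead of the default output path:

```python
import os
import tempfile

import numpy as np

# Round-trip sketch: arrays written with np.save load back losslessly.
w = np.random.default_rng(0).random(1000)  # stand-in for the real weight vector
path = os.path.join(tempfile.mkdtemp(), "calibration_weights.npy")
np.save(path, w)         # what the training run does at its --output path
weights = np.load(path)  # how a consumer reads the weights back
assert np.array_equal(w, weights)
print(weights.shape, weights.dtype)  # → (1000,) float64
```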