PolicyEngine · baogorek · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026 · Jan 17, 2026
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -1,4 +1,5 @@
-- bump: patch
+- bump: minor
   changes:
-    fixed:
-    - Versioning workflow checkout for push events
+    added:
+    - Support for health_insurance_premiums_without_medicare_part_b in local area calibration
+    - Test coverage for sparse matrix builder with person-level targets
diff --git a/docs/local_area_calibration_setup.ipynb b/docs/local_area_calibration_setup.ipynb
@@ -459,10 +459,10 @@
     "print(\"Remember, this is a North Carolina target:\\n\")\n",
     "print(targets_df.iloc[row_loc])\n",
     "\n",
-    "print(\"\\nHousehold donated to NC's 2nd district, 2023 SNAP dollars:\")\n",
+    "print(\"\\nNC State target. Household donated to NC's 2nd district, 2023 SNAP dollars:\")\n",
     "print(X_sparse[row_loc, positions['3702']])  # Household donated to NC's 2nd district\n",
     "\n",
-    "print(\"\\nHousehold donated to NC's 2nd district, 2023 SNAP dollars:\")\n",
+    "print(\"\\nSame target, same household, donated to AK's at Large district, 2023 SNAP dollars:\")\n",
     "print(X_sparse[row_loc, positions['201']])  # Household donated to AK's at Large District"
    ]
   },

diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py
@@ -0,0 +1,156 @@
+import os
+import subprocess
+import modal
+
+app = modal.App("policyengine-us-data-fit-weights")
+
+hf_secret = modal.Secret.from_name("huggingface-token")
+
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .apt_install("git")
+    .pip_install("uv")
+)
+
+REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"
+
+
+def _find_tagged_values(text: str, tag: str, prefix_only: bool = True) -> list:
+    """Collect the text that follows ``tag`` on each matching line of ``text``.
+
+    With ``prefix_only`` a line matches only when it starts with ``tag``;
+    otherwise ``tag`` may appear anywhere in the line. Values are stripped
+    of surrounding whitespace and returned in order of appearance.
+    """
+    values = []
+    for line in text.split("\n"):
+        matched = line.startswith(tag) if prefix_only else tag in line
+        if matched:
+            # partition keeps everything after the first tag, even if the
+            # value itself happens to contain the tag text again.
+            values.append(line.partition(tag)[2].strip())
+    return values
+
+
+@app.function(
+    image=image,
+    secrets=[hf_secret],
+    memory=32768,
+    cpu=4.0,
+    gpu="T4",
+    timeout=14400,
+)
+def fit_weights(branch: str = "main", epochs: int = 200) -> bytes:
+    """Fit calibration weights remotely and return the weights file contents.
+
+    Clones ``branch`` of ``REPO_URL`` into ``/root``, installs dependencies
+    with ``uv sync --extra l0``, downloads the calibration inputs, then runs
+    ``fit_calibration_weights.py`` with ``--device cuda``.
+
+    Args:
+        branch: Git branch of the repository to clone.
+        epochs: Value passed to the fitting script's ``--epochs`` option.
+
+    Returns:
+        Raw bytes of the file the fitting script reports on its
+        ``OUTPUT_PATH:`` stdout line.
+
+    Raises:
+        subprocess.CalledProcessError: If ``git clone`` or ``uv sync`` fails.
+        RuntimeError: If the download or fitting subprocess exits non-zero,
+            or if an expected ``DB:``/``DATASET:``/``OUTPUT_PATH:`` line is
+            missing from its output.
+    """
+    os.chdir("/root")
+    subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True)
+    os.chdir("policyengine-us-data")
+
+    subprocess.run(["uv", "sync", "--extra", "l0"], check=True)
+
+    # Download calibration inputs from HuggingFace
+    print("Downloading calibration inputs from HuggingFace...")
+    download_result = subprocess.run(
+        [
+            "uv", "run", "python", "-c",
+            "from policyengine_us_data.utils.huggingface import "
+            "download_calibration_inputs; "
+            "paths = download_calibration_inputs('/root/calibration_data'); "
+            "print(f\"DB: {paths['database']}\"); "
+            "print(f\"DATASET: {paths['dataset']}\")"
+        ],
+        capture_output=True,
+        text=True,
+        env=os.environ.copy(),
+    )
+    print(download_result.stdout)
+    if download_result.stderr:
+        print("Download STDERR:", download_result.stderr)
+    if download_result.returncode != 0:
+        raise RuntimeError(f"Download failed: {download_result.returncode}")
+
+    # Parse paths from output; the last matching line wins.
+    db_values = _find_tagged_values(download_result.stdout, "DB:")
+    dataset_values = _find_tagged_values(download_result.stdout, "DATASET:")
+    db_path = db_values[-1] if db_values else None
+    dataset_path = dataset_values[-1] if dataset_values else None
+    if not db_path or not dataset_path:
+        # Fail here with a clear message instead of handing None to
+        # subprocess.run below.
+        raise RuntimeError(
+            "Could not parse calibration input paths from download output "
+            f"(DB: {db_path!r}, DATASET: {dataset_path!r})"
+        )
+
+    script_path = (
+        "policyengine_us_data/datasets/cps/"
+        "local_area_calibration/fit_calibration_weights.py"
+    )
+    result = subprocess.run(
+        [
+            "uv", "run", "python", script_path,
+            "--device", "cuda",
+            "--epochs", str(epochs),
+            "--db-path", db_path,
+            "--dataset-path", dataset_path,
+        ],
+        capture_output=True,
+        text=True,
+        env=os.environ.copy(),
+    )
+    print(result.stdout)
+    if result.stderr:
+        print("STDERR:", result.stderr)
+    if result.returncode != 0:
+        raise RuntimeError(f"Script failed with code {result.returncode}")
+
+    # The first line mentioning OUTPUT_PATH: names the weights file.
+    output_paths = _find_tagged_values(
+        result.stdout, "OUTPUT_PATH:", prefix_only=False
+    )
+    if not output_paths or not output_paths[0]:
+        raise RuntimeError(
+            "Fitting script did not report an OUTPUT_PATH: line on stdout"
+        )
+    output_path = output_paths[0]
+
+    with open(output_path, 'rb') as f:
+        return f.read()
+
+
+@app.local_entrypoint()
+def main(
+    branch: str = "main",
+    epochs: int = 200,
+    output: str = "calibration_weights.npy"
+):
+    """Run ``fit_weights`` remotely and save the returned bytes to ``output``.
+
+    Args:
+        branch: Git branch passed through to ``fit_weights``.
+        epochs: Epoch count passed through to ``fit_weights``.
+        output: Local file path the weights bytes are written to.
+    """
+    weights_bytes = fit_weights.remote(branch=branch, epochs=epochs)
+    with open(output, 'wb') as f:
+        f.write(weights_bytes)
+    print(f"Weights saved to: {output}")
diff --git a/...ngine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md b/...ngine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md
@@ -0,0 +1,241 @@
+# Adding Calibration Targets to Local Area Geo-Stacking
+
+This document summarizes key learnings from adding `health_insurance_premiums_without_medicare_part_b` as a calibration target. Use this as a reference when adding future targets.
+
+## Key Discovery: No Code Changes Needed for Most Targets
+
+The `sparse_matrix_builder.py` is **already entity-agnostic**. PolicyEngine's `map_to="household"` parameter automatically handles aggregation from any entity level (person, tax_unit, spm_unit) to household level.
+
+```python
+# These lines in sparse_matrix_builder.py (lines 220-222) handle ALL entity types:
+target_values = state_sim.calculate(
+    target["variable"], map_to="household"
+).values
+```
+
+**Verified behavior:**
+- Person-level variables (like health_insurance_premiums): automatically summed to household
+- SPM-unit variables (like snap): automatically aggregated to household
+- Household variables: returned as-is
+
+## Architecture Overview
+
+### File Locations
+
+```
+policyengine_us_data/
+├── storage/
+│   ├── calibration/
+│   │   └── policy_data.db          # Target database (SQLite)
+│   └── stratified_extended_cps_2023.h5  # Base dataset for calibration
+└── datasets/cps/local_area_calibration/
+    ├── sparse_matrix_builder.py    # Builds X_sparse matrix (GENERIC)
+    ├── matrix_tracer.py            # Diagnostics for debugging matrices
+    ├── calibration_utils.py        # Helper functions
+    └── build_calibration_matrix.py # Runner script
+```
+
+### Database Schema (policy_data.db)
+
+```sql
+-- Core tables
+targets(target_id, variable, period, stratum_id, value, active, ...)
+strata(stratum_id, definition_hash, stratum_group_id, ...)
+stratum_constraints(stratum_id, constraint_variable, operation, value, ...)
+```
+
+**Key stratum_group_ids:**
+- Group 1: National hardcoded targets (20 variables including health insurance, medicaid, snap national, etc.)
+- Group 4: SNAP state/CD targets (538 targets: 51 state snap + 487 household_count)
+
+### Target Filter Logic
+
+The `build_matrix()` method uses **OR logic** for filters:
+
+```python
+# Gets SNAP targets OR health insurance target
+target_filter={
+    "stratum_group_ids": [4],  # All SNAP targets
+    "variables": ["health_insurance_premiums_without_medicare_part_b"],  # Specific variable
+}
+```
+
+## How to Add a New Target
+
+### Step 1: Check if Target Exists in Database
+
+```python
+import sqlite3
+from policyengine_us_data.storage import STORAGE_FOLDER
+
+conn = sqlite3.connect(STORAGE_FOLDER / "calibration" / "policy_data.db")
+cursor = conn.cursor()
+
+# Find your target
+cursor.execute("""
+    SELECT t.target_id, t.variable, t.value, t.period, t.stratum_id,
+           s.stratum_group_id
+    FROM targets t
+    JOIN strata s ON t.stratum_id = s.stratum_id
+    WHERE t.variable = 'your_variable_name'
+""")
+print(cursor.fetchall())
+
+# Check constraints for that stratum
+cursor.execute("""
+    SELECT * FROM stratum_constraints WHERE stratum_id = <stratum_id>
+""")
+print(cursor.fetchall())
+```
+
+### Step 2: Determine Entity Type
+
+```python
+from policyengine_us import Microsimulation
+
+sim = Microsimulation()
+var = sim.tax_benefit_system.variables['your_variable_name']
+print(f"Entity: {var.entity.key}")  # person, household, tax_unit, spm_unit, etc.
+```
+
+### Step 3: Verify Aggregation Works
+
+```python
+# For non-household variables, verify totals are preserved
+person_total = sim.calculate('your_variable', 2023, map_to='person').values.sum()
+household_total = sim.calculate('your_variable', 2023, map_to='household').values.sum()
+print(f"Match: {np.isclose(person_total, household_total, rtol=1e-6)}")
+```
+
+### Step 4: Update the Runner Script
+
+Edit `build_calibration_matrix.py` to include your new target:
+
+```python
+targets_df, X_sparse, household_id_mapping = builder.build_matrix(
+    sim,
+    target_filter={
+        "stratum_group_ids": [4],  # SNAP
+        "variables": [
+            "health_insurance_premiums_without_medicare_part_b",
+            "your_new_variable",  # Add here
+        ],
+    },
+)
+```
+
+### Step 5: Run and Verify
+
+```bash
+cd policyengine_us_data/datasets/cps/local_area_calibration
+python build_calibration_matrix.py
+```
+
+## When Code Changes ARE Needed
+
+The current implementation may need modification for:
+
+1. **Count variables with special semantics**: Variables ending in `_count` might need `.nunique()` instead of `.sum()` for aggregation. The legacy "junkyard" implementation (see "The Junkyard Reference" below) handles this, but our current builder doesn't.
+
+2. **Variables with state-specific calculations**: SNAP and Medicaid are already handled (state_fips is set before calculation). Other state-dependent variables should work the same way.
+
+3. **Constraint evaluation at non-household level**: Currently all constraints are evaluated at household level after aggregation. If you need person-level constraint evaluation (e.g., "only count persons with income > X"), the junkyard has this pattern but our builder doesn't.
+
+## The Junkyard Reference
+
+Location: `~/devl/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/metrics_matrix_geo_stacking_sparse.py`
+
+This 2,400+ line file has extensive logic we intentionally avoided:
+- Hard-coded variable names and stratum_group_ids
+- Complex entity relationship tracking
+- Person-level constraint evaluation with `.any()` aggregation
+
+**Key pattern from junkyard (if ever needed):**
+```python
+# Dynamic entity detection
+target_entity = sim.tax_benefit_system.variables[target_variable].entity.key
+
+# Entity relationship DataFrame
+entity_rel = pd.DataFrame({
+    "person_id": sim.calculate("person_id", map_to="person").values,
+    "household_id": sim.calculate("household_id", map_to="person").values,
+    "tax_unit_id": sim.calculate("tax_unit_id", map_to="person").values,
+    # ... other entities
+})
+
+# For counts: use .nunique() on entity IDs
+# For amounts: use .sum() on values
+```
+
+## Matrix Structure
+
+The sparse matrix X has shape `(n_targets, n_households × n_cds)`:
+
+```
+Columns: [CD1_hh0, CD1_hh1, ..., CD1_hhN, CD2_hh0, ..., CDM_hhN]
+Rows: One per target (geographic_id + variable combination)
+
+Column index formula: col_idx = cd_idx * n_households + hh_idx
+```
+
+Use `MatrixTracer` for debugging:
+```python
+from matrix_tracer import MatrixTracer
+
+tracer = MatrixTracer(targets_df, X_sparse, household_id_mapping, cds_to_calibrate, sim)
+tracer.print_matrix_structure()
+tracer.get_column_info(100)  # Info about column 100
+tracer.get_row_info(0)       # Info about row 0 (first target)
+```
+
+## Environment Setup
+
+```bash
+# Use the sep environment for this repo
+source ~/envs/sep/bin/activate
+
+# Run from the local_area_calibration directory
+cd ~/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration
+
+# Run tests
+pytest ../../tests/test_sparse_matrix_builder.py -v
+```
+
+## Common Queries
+
+### List all target variables
+```sql
+SELECT DISTINCT variable FROM targets;
+```
+
+### List all constraint variables
+```sql
+SELECT DISTINCT constraint_variable FROM stratum_constraints;
+```
+
+### Find targets by geographic level
+```sql
+-- National targets (no geographic constraints)
+SELECT t.* FROM targets t
+JOIN strata s ON t.stratum_id = s.stratum_id
+WHERE t.stratum_id NOT IN (
+    SELECT stratum_id FROM stratum_constraints
+    WHERE constraint_variable IN ('state_fips', 'congressional_district_geoid')
+);
+
+-- State-level targets
+SELECT t.* FROM targets t
+WHERE t.stratum_id IN (
+    SELECT stratum_id FROM stratum_constraints
+    WHERE constraint_variable = 'state_fips'
+);
+```
+
+## Summary
+
+For most new targets:
+1. Verify target exists in `policy_data.db`
+2. Add variable name to the target filter in `build_calibration_matrix.py`
+3. Run and verify with `MatrixTracer`
+
+No code changes to `sparse_matrix_builder.py` needed unless you have special aggregation or constraint requirements.