diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 0f82eb65..c3810620 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,5 @@ -- bump: patch +- bump: minor changes: - fixed: - - Versioning workflow checkout for push events + added: + - Support for health_insurance_premiums_without_medicare_part_b in local area calibration + - Test coverage for sparse matrix builder with person-level targets diff --git a/docs/local_area_calibration_setup.ipynb b/docs/local_area_calibration_setup.ipynb index cdd1cc97..9060a3df 100644 --- a/docs/local_area_calibration_setup.ipynb +++ b/docs/local_area_calibration_setup.ipynb @@ -459,10 +459,10 @@ "print(\"Remember, this is a North Carolina target:\\n\")\n", "print(targets_df.iloc[row_loc])\n", "\n", - "print(\"\\nHousehold donated to NC's 2nd district, 2023 SNAP dollars:\")\n", + "print(\"\\nNC State target. Household donated to NC's 2nd district, 2023 SNAP dollars:\")\n", "print(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n", "\n", - "print(\"\\nHousehold donated to NC's 2nd district, 2023 SNAP dollars:\")\n", + "print(\"\\nSame target, same household, donated to AK's at Large district, 2023 SNAP dollars:\")\n", "print(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District" ] }, diff --git a/modal_app/fit_weights.py b/modal_app/fit_weights.py new file mode 100644 index 00000000..842223a1 --- /dev/null +++ b/modal_app/fit_weights.py @@ -0,0 +1,151 @@ +import os +import subprocess +import modal + +app = modal.App("policyengine-us-data-fit-weights") + +hf_secret = modal.Secret.from_name("huggingface-token") + +image = ( + modal.Image.debian_slim(python_version="3.11") + .apt_install("git") + .pip_install("uv") +) + +REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" + + +def _fit_weights_impl(branch: str, epochs: int) -> bytes: + """Shared implementation for weight fitting.""" + os.chdir("/root") + subprocess.run(["git", 
"clone", "-b", branch, REPO_URL], check=True) + os.chdir("policyengine-us-data") + + subprocess.run(["uv", "sync", "--extra", "l0"], check=True) + + print("Downloading calibration inputs from HuggingFace...") + download_result = subprocess.run( + [ + "uv", "run", "python", "-c", + "from policyengine_us_data.utils.huggingface import " + "download_calibration_inputs; " + "paths = download_calibration_inputs('/root/calibration_data'); " + "print(f\"DB: {paths['database']}\"); " + "print(f\"DATASET: {paths['dataset']}\")" + ], + capture_output=True, + text=True, + env=os.environ.copy(), + ) + print(download_result.stdout) + if download_result.stderr: + print("Download STDERR:", download_result.stderr) + if download_result.returncode != 0: + raise RuntimeError(f"Download failed: {download_result.returncode}") + + db_path = dataset_path = None + for line in download_result.stdout.split('\n'): + if line.startswith('DB:'): + db_path = line.split('DB:')[1].strip() + elif line.startswith('DATASET:'): + dataset_path = line.split('DATASET:')[1].strip() + + script_path = ( + "policyengine_us_data/datasets/cps/" + "local_area_calibration/fit_calibration_weights.py" + ) + result = subprocess.run( + [ + "uv", "run", "python", script_path, + "--device", "cuda", + "--epochs", str(epochs), + "--db-path", db_path, + "--dataset-path", dataset_path, + ], + capture_output=True, + text=True, + env=os.environ.copy(), + ) + print(result.stdout) + if result.stderr: + print("STDERR:", result.stderr) + if result.returncode != 0: + raise RuntimeError(f"Script failed with code {result.returncode}") + + output_line = [ + line for line in result.stdout.split('\n') if 'OUTPUT_PATH:' in line + ][0] + output_path = output_line.split('OUTPUT_PATH:')[1].strip() + + with open(output_path, 'rb') as f: + return f.read() + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="T4", timeout=14400, +) +def fit_weights_t4(branch: str = "main", epochs: int = 200) -> bytes: + return 
_fit_weights_impl(branch, epochs) + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="A10", timeout=14400, +) +def fit_weights_a10(branch: str = "main", epochs: int = 200) -> bytes: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="A100-40GB", timeout=14400, +) +def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> bytes: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="A100-80GB", timeout=14400, +) +def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> bytes: + return _fit_weights_impl(branch, epochs) + + +@app.function( + image=image, secrets=[hf_secret], memory=32768, cpu=4.0, + gpu="H100", timeout=14400, +) +def fit_weights_h100(branch: str = "main", epochs: int = 200) -> bytes: + return _fit_weights_impl(branch, epochs) + + +GPU_FUNCTIONS = { + "T4": fit_weights_t4, + "A10": fit_weights_a10, + "A100-40GB": fit_weights_a100_40, + "A100-80GB": fit_weights_a100_80, + "H100": fit_weights_h100, +} + + +@app.local_entrypoint() +def main( + branch: str = "main", + epochs: int = 200, + gpu: str = "T4", + output: str = "calibration_weights.npy" +): + if gpu not in GPU_FUNCTIONS: + raise ValueError( + f"Unknown GPU: {gpu}. 
Choose from: {list(GPU_FUNCTIONS.keys())}" + ) + + print(f"Running with GPU: {gpu}, epochs: {epochs}, branch: {branch}") + func = GPU_FUNCTIONS[gpu] + weights_bytes = func.remote(branch=branch, epochs=epochs) + + with open(output, 'wb') as f: + f.write(weights_bytes) + print(f"Weights saved to: {output}") diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md b/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md new file mode 100644 index 00000000..59d486f9 --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/ADDING_CALIBRATION_TARGETS.md @@ -0,0 +1,290 @@ +# Adding Calibration Targets to Local Area Geo-Stacking + +This document summarizes key learnings from adding `health_insurance_premiums_without_medicare_part_b` as a calibration target. Use this as a reference when adding future targets. + +## Key Discovery: No Code Changes Needed for Most Targets + +The `sparse_matrix_builder.py` is **already entity-agnostic**. PolicyEngine's `map_to="household"` parameter automatically handles aggregation from any entity level (person, tax_unit, spm_unit) to household level. 
+ +```python +# This line in sparse_matrix_builder.py (line 220-222) handles ALL entity types: +target_values = state_sim.calculate( + target["variable"], map_to="household" +).values +``` + +**Verified behavior:** +- Person-level variables (like health_insurance_premiums): automatically summed to household +- SPM-unit variables (like snap): automatically aggregated to household +- Household variables: returned as-is + +## Architecture Overview + +### File Locations + +``` +policyengine_us_data/ +├── storage/ +│ ├── calibration/ +│ │ └── policy_data.db # Target database (SQLite) +│ └── stratified_extended_cps_2023.h5 # Base dataset for calibration +└── datasets/cps/local_area_calibration/ + ├── sparse_matrix_builder.py # Builds X_sparse matrix (GENERIC) + ├── matrix_tracer.py # Diagnostics for debugging matrices + ├── calibration_utils.py # Helper functions + └── build_calibration_matrix.py # Runner script +``` + +### Database Schema (policy_data.db) + +```sql +-- Core tables +targets(target_id, variable, period, stratum_id, value, active, ...) +strata(stratum_id, definition_hash, stratum_group_id, ...) +stratum_constraints(stratum_id, constraint_variable, operation, value, ...) +``` + +**Key stratum_group_ids:** +- Group 1: National hardcoded targets (20 variables including health insurance, medicaid, snap national, etc.) 
+- Group 4: SNAP state/CD targets (538 targets: 51 state snap + 487 household_count) + +### Target Filter Logic + +The `build_matrix()` method uses **OR logic** for filters: + +```python +# Gets SNAP targets OR health insurance target +target_filter={ + "stratum_group_ids": [4], # All SNAP targets + "variables": ["health_insurance_premiums_without_medicare_part_b"], # Specific variable +} +``` + +## How to Add a New Target + +### Step 1: Check if Target Exists in Database + +```python +import sqlite3 +from policyengine_us_data.storage import STORAGE_FOLDER + +conn = sqlite3.connect(STORAGE_FOLDER / "calibration" / "policy_data.db") +cursor = conn.cursor() + +# Find your target +cursor.execute(""" + SELECT t.target_id, t.variable, t.value, t.period, t.stratum_id, + s.stratum_group_id + FROM targets t + JOIN strata s ON t.stratum_id = s.stratum_id + WHERE t.variable = 'your_variable_name' +""") +print(cursor.fetchall()) + +# Check constraints for that stratum (set your_stratum_id to a stratum_id returned by the query above) +cursor.execute(""" + SELECT * FROM stratum_constraints WHERE stratum_id = ? +""", (your_stratum_id,)) +print(cursor.fetchall()) +``` + +### Step 2: Determine Entity Type + +```python +from policyengine_us import Microsimulation + +sim = Microsimulation() +var = sim.tax_benefit_system.variables['your_variable_name'] +print(f"Entity: {var.entity.key}") # person, household, tax_unit, spm_unit, etc. 
+``` + +### Step 3: Verify Aggregation Works + +```python +# For non-household variables, check that the native-entity total survives aggregation (needs `import numpy as np`; reuses `sim` from Step 2) +entity_total = sim.calculate('your_variable', 2023).values.sum()  # no map_to: values at the variable's own entity; map_to='person' would repeat group-level values for every member +household_total = sim.calculate('your_variable', 2023, map_to='household').values.sum() +print(f"Match: {np.isclose(entity_total, household_total, rtol=1e-6)}") +``` + +### Step 4: Update the Runner Script + +Edit `build_calibration_matrix.py` to include your new target: + +```python +targets_df, X_sparse, household_id_mapping = builder.build_matrix( + sim, + target_filter={ + "stratum_group_ids": [4], # SNAP + "variables": [ + "health_insurance_premiums_without_medicare_part_b", + "your_new_variable", # Add here + ], + }, +) +``` + +### Step 5: Run and Verify + +```bash +cd policyengine_us_data/datasets/cps/local_area_calibration +python build_calibration_matrix.py +``` + +## When Code Changes ARE Needed + +The current implementation may need modification for: + +1. **Count variables with special semantics**: Variables ending in `_count` might need `.nunique()` instead of `.sum()` for aggregation. The junkyard implementation (see "The Junkyard Reference" below) handles this but our current builder doesn't. + +2. **Variables with state-specific calculations**: SNAP and Medicaid are already handled (state_fips is set before calculation). Other state-dependent variables should work the same way. + +3. **Constraint evaluation at non-household level**: Currently all constraints are evaluated at household level after aggregation. If you need person-level constraint evaluation (e.g., "only count persons with income > X"), the junkyard has this pattern but our builder doesn't. 
+ +## The Junkyard Reference + +Location: `~/devl/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/metrics_matrix_geo_stacking_sparse.py` + +This 2,400+ line file has extensive logic we intentionally avoided: +- Hard-coded variable names and stratum_group_ids +- Complex entity relationship tracking +- Person-level constraint evaluation with `.any()` aggregation + +**Key pattern from junkyard (if ever needed):** +```python +# Dynamic entity detection +target_entity = sim.tax_benefit_system.variables[target_variable].entity.key + +# Entity relationship DataFrame +entity_rel = pd.DataFrame({ + "person_id": sim.calculate("person_id", map_to="person").values, + "household_id": sim.calculate("household_id", map_to="person").values, + "tax_unit_id": sim.calculate("tax_unit_id", map_to="person").values, + # ... other entities +}) + +# For counts: use .nunique() on entity IDs +# For amounts: use .sum() on values +``` + +## Matrix Structure + +The sparse matrix X has shape `(n_targets, n_households × n_cds)`: + +``` +Columns: [CD1_hh0, CD1_hh1, ..., CD1_hhN, CD2_hh0, ..., CDM_hhN] +Rows: One per target (geographic_id + variable combination) + +Column index formula: col_idx = cd_idx * n_households + hh_idx +``` + +Use `MatrixTracer` for debugging: +```python +from matrix_tracer import MatrixTracer + +tracer = MatrixTracer(targets_df, X_sparse, household_id_mapping, cds_to_calibrate, sim) +tracer.print_matrix_structure() +tracer.get_column_info(100) # Info about column 100 +tracer.get_row_info(0) # Info about row 0 (first target) +``` + +## Environment Setup + +```bash +# Use the sep environment for this repo +source ~/envs/sep/bin/activate + +# Run from the local_area_calibration directory +cd ~/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration + +# Run tests +pytest ../../tests/test_sparse_matrix_builder.py -v +``` + +## Common Queries + +### List all target variables +```sql +SELECT DISTINCT variable FROM 
targets; +``` + +### List all constraint variables +```sql +SELECT DISTINCT constraint_variable FROM stratum_constraints; +``` + +### Find targets by geographic level +```sql +-- National targets (no geographic constraints) +SELECT t.* FROM targets t +JOIN strata s ON t.stratum_id = s.stratum_id +WHERE t.stratum_id NOT IN ( + SELECT stratum_id FROM stratum_constraints + WHERE constraint_variable IN ('state_fips', 'congressional_district_geoid') +); + +-- State-level targets +SELECT t.* FROM targets t +WHERE t.stratum_id IN ( + SELECT stratum_id FROM stratum_constraints + WHERE constraint_variable = 'state_fips' +); +``` + +## Summary + +For most new targets: +1. Verify target exists in `policy_data.db` +2. Add variable name to the target filter in `build_calibration_matrix.py` +3. Run and verify with `MatrixTracer` + +No code changes to `sparse_matrix_builder.py` needed unless you have special aggregation or constraint requirements. + +## Running Weight Calibration on Modal (GPU) + +The `fit_calibration_weights.py` script can be run on Modal with GPU acceleration using `modal_app/fit_weights.py`. + +### Basic Usage + +```bash +# Default: T4 GPU, 200 epochs +modal run modal_app/fit_weights.py --branch main --epochs 200 + +# Specify GPU type +modal run modal_app/fit_weights.py --branch main --epochs 2000 --gpu A100-40GB +``` + +### GPU Benchmarks (200 epochs, 2 target groups, Jan 2026) + +| GPU | Time | Cost | Notes | +|-----|------|------|-------| +| T4 | 16m 4s | $0.16 | Best for small test runs | +| A100-40GB | 9m 5s | $0.32 | ~44% faster | +| A100-80GB | 10m 28s | $0.44 | Slower than 40GB (variance?) | + +### Key Findings + +1. **Memory bandwidth matters for sparse operations**: The P100 (not available on Modal) outperforms T4 by ~2x on Kaggle due to HBM2 memory (~732 GB/s) vs GDDR6 (~320 GB/s). + +2. 
**Significant overhead at low epochs**: With only 200 epochs, much of the runtime is fixed overhead: + - Git clone and `uv sync` (~2-3 min) + - HuggingFace data download (~1 min) + - Loading Microsimulation and building sparse matrix (~3-4 min, CPU-bound) + +3. **GPU choice depends on epoch count**: + - **< 500 epochs**: Use T4 (cheapest, overhead dominates) + - **500-2000 epochs**: A100-40GB may break even + - **> 2000 epochs**: A100 likely more cost-effective as training dominates + +4. **Available Modal GPUs** (by memory bandwidth): + - T4: 320 GB/s, $0.000164/sec + - L4: 300 GB/s, $0.000222/sec + - A10: 600 GB/s, $0.000306/sec + - L40S: 864 GB/s, $0.000542/sec + - A100-40GB: 1,555 GB/s, $0.000583/sec + - A100-80GB: 2,039 GB/s, $0.000694/sec + - H100: 3,350 GB/s, $0.001097/sec + +### Output + +Weights are saved locally to `calibration_weights.npy` (configurable via `--output` flag). diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py b/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py new file mode 100644 index 00000000..185914a5 --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/build_calibration_matrix.py @@ -0,0 +1,195 @@ +""" +Build calibration matrix for geo-stacking reweighting. +Generates X_sparse and target vector, prints diagnostics using MatrixTracer. 
+""" + +import logging + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + +import numpy as np +import pandas as pd +from policyengine_us import Microsimulation +from policyengine_us_data.storage import STORAGE_FOLDER +from sparse_matrix_builder import SparseMatrixBuilder +from matrix_tracer import MatrixTracer +from calibration_utils import create_target_groups + +# ============================================================================ +# CONFIGURATION +# ============================================================================ +db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" +db_uri = f"sqlite:///{db_path}" +time_period = 2023 + +# Base dataset for geo-stacking: stratified extended CPS +dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" + +cds_to_calibrate = [ + "101", # Alabama CD-1 + "601", # California CD-1 + "602", # California CD-2 + "3601", # New York CD-1 + "4801", # Texas CD-1 +] + +print(f"Testing with {len(cds_to_calibrate)} congressional districts") + +# ============================================================================ +# STEP 1: LOAD SIMULATION FROM EXTENDED CPS +# ============================================================================ +print(f"\nLoading simulation from {dataset_path}...") +sim = Microsimulation(dataset=str(dataset_path)) +n_households = len(sim.calculate("household_id", map_to="household").values) +print(f"Loaded {n_households:,} households") + +# ============================================================================ +# STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS +# ============================================================================ +print("\nBuilding sparse matrix...") +builder = SparseMatrixBuilder( + db_uri=db_uri, + time_period=time_period, + cds_to_calibrate=cds_to_calibrate, + dataset_path=None, +) + +# SNAP targets (stratum_group_id=4) + specific health insurance variable +# Uses OR logic: gets all SNAP 
targets OR the health insurance target +targets_df, X_sparse, household_id_mapping = builder.build_matrix( + sim, + target_filter={ + "stratum_group_ids": [4], + "variables": ["health_insurance_premiums_without_medicare_part_b"], + }, +) + +print(f"\nMatrix built successfully:") +print(f" Shape: {X_sparse.shape}") +print(f" Targets: {len(targets_df)}") +nnz = X_sparse.nnz +total = X_sparse.shape[0] * X_sparse.shape[1] +print(f" Sparsity: {1 - nnz / total:.4%}") + +# ============================================================================ +# STEP 3: EXTRACT TARGET VECTOR +# ============================================================================ +target_vector = targets_df["value"].values +print(f"\nTarget vector shape: {target_vector.shape}") +print(f"Target total: ${target_vector.sum():,.0f}") + +# ============================================================================ +# STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION +# ============================================================================ +print("\n" + "=" * 80) +print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS") +print("=" * 80) + +health_ins_targets = targets_df[ + targets_df["variable"] + == "health_insurance_premiums_without_medicare_part_b" +] + +if len(health_ins_targets) > 0: + print(f"\nFound {len(health_ins_targets)} health insurance target(s):") + print( + health_ins_targets[ + [ + "target_id", + "variable", + "value", + "geographic_id", + "stratum_group_id", + ] + ] + ) + + health_ins_idx = health_ins_targets.index[0] + health_ins_row = X_sparse[health_ins_idx, :] + + print(f"\nMatrix row {health_ins_idx} (health insurance):") + print(f" Non-zero entries: {health_ins_row.nnz:,}") + print(f" Row sum: ${health_ins_row.sum():,.0f}") + print(f" Target value: ${health_ins_targets.iloc[0]['value']:,.0f}") + + person_total = sim.calculate( + "health_insurance_premiums_without_medicare_part_b", + time_period, + map_to="person", + ).values.sum() + + household_total = sim.calculate( + 
"health_insurance_premiums_without_medicare_part_b", + time_period, + map_to="household", + ).values.sum() + + print(f"\nEntity aggregation verification:") + print(f" Person-level total: ${person_total:,.0f}") + print(f" Household-level total: ${household_total:,.0f}") + print(f" Match: {np.isclose(person_total, household_total, rtol=1e-6)}") + +else: + print("\nWARNING: No health insurance targets found!") + +# ============================================================================ +# STEP 5: SNAP TARGET SUMMARY +# ============================================================================ +print("\n" + "=" * 80) +print("SNAP TARGET SUMMARY") +print("=" * 80) + +snap_targets = targets_df[targets_df["variable"] == "snap"] +household_count_targets = targets_df[ + targets_df["variable"] == "household_count" +] + +print(f"\nSNAP benefit targets: {len(snap_targets)}") +print(f"Household count targets: {len(household_count_targets)}") + +if len(snap_targets) > 0: + print(f"\nSNAP total (all states): ${snap_targets['value'].sum():,.0f}") + print(f"\nSample SNAP targets:") + print( + snap_targets[["target_id", "variable", "value", "geographic_id"]].head( + 10 + ) + ) + +# ============================================================================ +# STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS +# ============================================================================ +print("\n" + "=" * 80) +print("MATRIX TRACER DIAGNOSTICS") +print("=" * 80) + +tracer = MatrixTracer( + targets_df=targets_df, + matrix=X_sparse, + household_id_mapping=household_id_mapping, + geographic_ids=cds_to_calibrate, + sim=sim, +) + +tracer.print_matrix_structure(show_groups=True) + +# ============================================================================ +# STEP 7: TARGET GROUP ANALYSIS +# ============================================================================ +print("\n" + "=" * 80) +print("TARGET GROUP ANALYSIS") +print("=" * 80) + +target_groups, group_info = 
create_target_groups(targets_df) + +print(f"\nTotal target groups: {len(group_info)}") +for group_id, info in enumerate(group_info): + group_mask = target_groups == group_id + n_targets_in_group = group_mask.sum() + print(f" Group {group_id}: {info} ({n_targets_in_group} targets)") + +print("\n" + "=" * 80) +print("RUNNER COMPLETED SUCCESSFULLY") +print("=" * 80) diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt b/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt new file mode 100644 index 00000000..3242ef91 --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/conversation_about_this_repo.txt @@ -0,0 +1,2147 @@ + + ▐▛███▜▌ Claude Code v2.1.7 +▝▜█████▛▘ Haiku 4.5 · Claude Max + ▘▘ ▝▝ ~/…/policyengine_us_data/datasets/cps/local_area_calibration + +❯ We need to bring in a new target to our calibration efforts, and this + will be manifested in the X matrix that is created from + sparse_matrix_builder.py in this directory. We have only test `snap` so + far, and you can see that demonstrated in + ~/docs/local_area_calibration_setup.ipynb. But now we want to bring in + health_insurance_premiums_without_medicare_part_b. We want to fully + understand what kind of variable this is from our target database (used + to construct the target vector), which you can find in + policyengine_us_data/storage/calibration/policy_data.db and see examples + for how to query it at + /home/baogorek/devl/code-snippets/reweighting/querying_policy_data_db.md. + This variable is defined at a different unit of analysis than `snap`, so + you need to be mindful of that. 
One more thing to be aware of: while we + are branched off the mainline in this repo, clone at + ~/devl/sep/policyengine-us-data, there is a "junkyard" clone of this same + repo at ~/devl/policyengine-us-data, which contains + /home/baogorek/devl/policyengine-us-data/policyengine_us_data/datasets/cp + s/local_area_calibration/metrics_matrix_geo_stacking_sparse.py, a massive + 2,000+ line file that contains logic to handle all of our targets, but + it does it in a very idiosycratic way, hard-coding target names in the + module. Our sparse_matrix_builder.py must never do this! We want to use + our policy_data.db to handle any target and create that sparse matrix. + Due to the size of the matrix creator in the "junkyard", and the + existance of two repos, this is a difficult task. Use the uv environment + ~/envs/sep for this branch and task, and use ~/envs/pe for the + "junkyard". Plan out the addition of this new target, and Ultrathink! + +● I'll help you plan the addition of the health_insurance_premiums_without_medicare_part_b target to your calibration system. This is a complex task that requires understanding multiple components across two repositories. + + Let me start by exploring the codebase to understand the current implementation and requirements. + +● 3 Explore agents finished (ctrl+o to expand) + ├─ Understand current sparse_matrix_builder implementation · 17 tool uses · + 27.6k tokens + │ ⎿  Done + ├─ Understand policy_data.db structure · 40 tool uses · 40.6k tokens + │ ⎿  Done + └─ Explore junkyard implementation · 40 tool uses · 52.5k tokens + ⎿  Done + +● Excellent! I've gathered comprehensive information about the current implementation, database structure, and junkyard patterns. Now let me launch a Plan agent to design the implementation approach. + +● Plan(Design health insurance premiums integration) + ⎿  Done (36 tool uses · 46.4k tokens · 4m 36s) + + +● Let me verify the Plan agent's findings by reading the key files directly. 
+ +● Read(sparse_matrix_builder.py) + ⎿  Read 239 lines + +● Read(~/docs/local_area_calibration_setup.ipynb) + ⎿  Error reading file + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python + -c " + import sqlite3…) + ⎿ === Health Insurance Premium Targets === + (5, 'health_insurance_premiums_without_medicare_part_b', + 385000000000.0 + , 2024, 1, 1) + … +3 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Search(pattern: "~/docs/*.ipynb") + ⎿  Found 0 files + +● Read(~/devl/code-snippets/reweighting/querying_policy_data_db.md) + ⎿  Read 96 lines + +● Bash(find /home/baogorek/docs -name "*.ipynb" -type f 2>/dev/null | head -10) + ⎿  (No content) + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python + -c " + import sqlite3…) + ⎿  === Stratum Group Analysis === + + Stratum group 1 (health insurance stratum): + … +56 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python + -c " + from policyengine_us import Microsimulation…) + ⎿  === Entity Aggregation Test === + Person-level array shape: (52826,) + Household-level array shape: (21045,) + … +6 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Excellent! I've now verified the Plan agent's key finding. Let me create the implementation plan and ask some clarifying questions. 
+ +● Updated plan + ⎿  Wrote 267 lines to + ../../../../../../../.claude/plans/velvet-rolling-dawn.md + 1 # Implementation Plan: Adding Health Insurance Premiums to Calibr + ation + 2 ## Executive Summary + 3 **Surprising Finding**: The current `sparse_matrix_builder.py` im + plementation **already supports person-level targets** without an + y code changes needed. PolicyEngine's `map_to="household"` parame + ter automatically aggregates person-level values to household lev + el while preserving totals. + 4 ## Background + 5 ### Current State + 6 - **Working targets**: SNAP (stratum_group_id=4, spm_unit-level, + 538 targets: 51 state snap amounts + 487 household_count targets) + … +257 lines (ctrl+o to expand) + +● User answered Claude's questions: + ⎿  · Do you want to calibrate using only the existing national health + insurance premium target ($385B), or should state-level targets be + added to the database first? → National target only (Recommended) + · How do you plan to use the health insurance premium target in your + calibration? → Combined with SNAP + + · The existing health insurance target has no geographic constraints + (same target for all CDs). Is this the desired behavior? → Yes, + national is correct + + +● Excellent! Based on your answers, let me finalize the implementation plan. + +● Updated plan + ⎿  /plan to preview + 250 - ✓ Total preservation is guaranteed by PolicyEngine's aggrega + tion logic + 251 - ✓ Generic architecture means extensible to any future entity + types + 252 + 253 -## Next Steps (Post-Approval) + 253 +## User Decisions + 254 + 255 -1. **Update usage examples** to show health insurance premium + -calibration + 256 -2. **Test with combined targets** (SNAP + health insurance) to + - verify OR filter logic + 257 -3. **Document target filtering patterns** for future reference + 258 -4. 
**(Optional) Add state-level health insurance targets** to + -database if geographic variation is needed + 255 +✓ **Target Scope**: Use national target only (existing target_ + +id=5, no database changes) + 256 +✓ **Usage Pattern**: Combined with SNAP (test multi-target cal + +ibration) + 257 +✓ **Geographic Scope**: National aggregate is correct (no stat + +e variation needed) + 258 + 260 -## Questions for User + 259 +## Implementation Steps (Post-Approval) + 260 + 262 -Before finalizing this plan, clarification needed on: + 261 +### Step 1: Verify Database Target + 262 +Confirm health insurance premium target exists and is active: + 263 +```python + 264 +# Using ~/envs/sep environment + 265 +import sqlite3 + 266 +conn = sqlite3.connect('policyengine_us_data/storage/calibrati + +on/policy_data.db') + 267 +cursor = conn.cursor() + 268 +cursor.execute(""" + 269 + SELECT target_id, variable, value, period, stratum_id, act + +ive + 270 + FROM targets + 271 + WHERE variable = 'health_insurance_premiums_without_medica + +re_part_b' + 272 +""") + 273 +print(cursor.fetchone()) + 274 +``` + 275 +Expected: `(5, 'health_insurance_premiums_without_medicare_par + +t_b', 385000000000.0, 2024, 1, 1)` + 276 + 264 -1. **Target Scope**: Do you want ONLY the national health insu + -rance target, or should state-level targets be added to the da + -tabase? + 265 -2. **Usage Pattern**: Will you calibrate using health insuranc + -e alone, or combined with SNAP targets? + 266 -3. **Geographic Constraints**: The current target has no geogr + -aphic constraints (applies to all CDs equally). Is this correc + -t, or should premiums vary by state/CD? + 277 +### Step 2: Test Combined Target Matrix Building + 278 +Create a test script to build matrix with both SNAP and health + + insurance: + 279 + + 280 +```python + 281 +from policyengine_us import Microsimulation + 282 +from policyengine_us_data.datasets.cps.local_area_calibration. 
+ +sparse_matrix_builder import SparseMatrixBuilder + 283 + + 284 +# Load simulation + 285 +sim = Microsimulation() + 286 + + 287 +# Create builder + 288 +builder = SparseMatrixBuilder( + 289 + db_uri="sqlite:///policyengine_us_data/storage/calibration + +/policy_data.db", + 290 + time_period=2024, + 291 + cds_to_calibrate=['0101', '0601', '0602'], # Example CDs: + + AL-1, CA-1, CA-2 + 292 + dataset_path=None + 293 +) + 294 + + 295 +# Build matrix with SNAP + health insurance + 296 +targets_df, X_sparse, household_mapping = builder.build_matrix + +( + 297 + sim, + 298 + target_filter={"stratum_group_ids": [1, 4]} # National + + +SNAP targets + 299 +) + 300 + + 301 +print(f"Total targets: {len(targets_df)}") + 302 +print(f"Matrix shape: {X_sparse.shape}") + 303 +print(f"Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_s + +parse.shape[1]):.2%}") + 304 + + 305 +# Verify health insurance target is included + 306 +health_ins = targets_df[ + 307 + targets_df["variable"] == "health_insurance_premiums_witho + +ut_medicare_part_b" + 308 +] + 309 +print(f"\nHealth insurance target:") + 310 +print(health_ins[["target_id", "variable", "value", "geographi + +c_id"]]) + 311 +``` + 312 + + 313 +### Step 3: Verify Entity Aggregation + 314 +Confirm person-to-household aggregation works correctly: + 315 + + 316 +```python + 317 +# Verify totals are preserved + 318 +person_total = sim.calculate( + 319 + 'health_insurance_premiums_without_medicare_part_b', + 320 + 2024, + 321 + map_to='person' + 322 +).values.sum() + 323 + + 324 +household_total = sim.calculate( + 325 + 'health_insurance_premiums_without_medicare_part_b', + 326 + 2024, + 327 + map_to='household' + 328 +).values.sum() + 329 + + 330 +print(f"Person-level total: ${person_total:,.0f}") + 331 +print(f"Household-level total: ${household_total:,.0f}") + 332 +print(f"Match: {abs(person_total - household_total) < 1}") + 333 + + 334 +# Check matrix row sum for health insurance + 335 +health_ins_idx = targets_df[ + 336 + 
targets_df["variable"] == "health_insurance_premiums_witho + +ut_medicare_part_b" + 337 +].index[0] + 338 +matrix_total = X_sparse[health_ins_idx, :].sum() / len(cds_to_ + +calibrate) # Adjust for stacking + 339 +print(f"Matrix row total (adjusted): ${matrix_total:,.0f}") + 340 +``` + 341 + + 342 +### Step 4: Update Documentation + 343 +Add example to `~/docs/local_area_calibration_setup.ipynb` sho + +wing: + 344 +- How to filter for multiple target groups + 345 +- How person-level targets work alongside spm_unit-level targe + +ts + 346 +- Interpretation of the resulting matrix structure + 347 + + 348 +### Step 5: Calibration Integration + 349 +Use the matrix in calibration workflow: + 350 +```python + 351 +# After building matrix with combined targets + 352 +from your_calibration_module import solve_weights + 353 + + 354 +# Solve for weights + 355 +weights = solve_weights(X_sparse, targets_df["value"].values) + 356 + + 357 +# Verify calibration matches targets + 358 +calibrated = X_sparse @ weights + 359 +targets = targets_df["value"].values + 360 + + 361 +for idx, row in targets_df.iterrows(): + 362 + print(f"{row['variable']}: target={row['value']:,.0f}, cal + +ibrated={calibrated[idx]:,.0f}") + 363 +``` + 364 + + 365 +## Success Criteria + 366 + + 367 +1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 + +, 4]}` filter + 368 +2. ✓ Health insurance row appears in targets_df with correct v + +alue ($385B) + 369 +3. ✓ Matrix shape is (n_targets, n_households × n_cds) where n + +_targets ≈ 558 (20 national + 538 SNAP) + 370 +4. ✓ Person-to-household aggregation preserves totals + 371 +5. ✓ Matrix row for health insurance contains household-aggreg + +ated premium values + 372 +6. 
✓ No code changes needed (only usage pattern demonstration) + +● Updated plan + ⎿  /plan to preview + 1 # Implementation Plan: Adding Health Insurance Premiums to Cali + bration + 2 + 3 +## TL;DR + 4 + + 5 +**ZERO CODE CHANGES NEEDED** 🎉 + 6 + + 7 +The current `sparse_matrix_builder.py` already supports person- + +level targets through PolicyEngine's `map_to="household"` param + +eter, which automatically aggregates person-level values to hou + +sehold level. Simply use: + 8 + + 9 +```python + 10 +builder.build_matrix(sim, target_filter={"stratum_group_ids": [ + +1, 4]}) # SNAP + health insurance + 11 +``` + 12 + + 13 +This works because: + 14 +- Health insurance premium is in the database (target_id=5, $38 + +5B) + 15 +- PolicyEngine auto-aggregates person→household (preserves tota + +ls) + 16 +- No hard-coded variable names in the builder + 17 + + 18 ## Executive Summary + 19 + 5 -**Surprising Finding**: The current `sparse_matrix_builder.py` + -implementation **already supports person-level targets** withou + -t any code changes needed. PolicyEngine's `map_to="household"` + -parameter automatically aggregates person-level values to house + -hold level while preserving totals. + 20 +**Key Finding**: The current `sparse_matrix_builder.py` impleme + +ntation **already supports person-level targets** without any c + +ode changes needed. PolicyEngine's `map_to="household"` paramet + +er automatically aggregates person-level values to household le + +vel while preserving totals. + 21 + 22 ## Background + 23 + +● Updated plan + ⎿  /plan to preview + 385 4. ✓ Person-to-household aggregation preserves totals + 386 5. ✓ Matrix row for health insurance contains household-aggreg + ated premium values + 387 6. 
✓ No code changes needed (only usage pattern demonstration) + 388 + + 389 +## Quick Reference + 390 + + 391 +### Critical Files + 392 +- **No changes**: `sparse_matrix_builder.py` (already supports + + person-level targets) + 393 +- **Database**: `policyengine_us_data/storage/calibration/poli + +cy_data.db` (contains target_id=5) + 394 +- **Environment**: `~/envs/sep` (for this development branch) + 395 + + 396 +### Key Commands + 397 +```bash + 398 +# Activate environment + 399 +source ~/envs/sep/bin/activate + 400 + + 401 +# Query database for health insurance target + 402 +cd ~/devl/sep/policyengine-us-data + 403 +python -c "import sqlite3; conn = sqlite3.connect('policyengin + +e_us_data/storage/calibration/policy_data.db'); cursor = conn. + +cursor(); cursor.execute('SELECT * FROM targets WHERE variable + + = \"health_insurance_premiums_without_medicare_part_b\"'); pr + +int(cursor.fetchone())" + 404 + + 405 +# Run verification tests + 406 +python test_combined_targets.py # Create this test script per + + Step 2 above + 407 +``` + 408 + + 409 +### Target Filter Examples + 410 +```python + 411 +# SNAP only + 412 +{"stratum_group_ids": [4]} + 413 + + 414 +# Health insurance only + 415 +{"variables": ["health_insurance_premiums_without_medicare_par + +t_b"]} + 416 + + 417 +# Both SNAP and health insurance (recommended) + 418 +{"stratum_group_ids": [1, 4]} + 419 +``` + +● Perfect! I've completed a comprehensive investigation and created a detailed implementation plan. The surprising finding is that no code changes are needed - your sparse_matrix_builder.py already supports person-level targets through PolicyEngine's automatic entity aggregation. + + Key discoveries: + 1. Health insurance premiums is person-level (verified via PolicyEngine metadata) + 2. PolicyEngine's map_to="household" automatically aggregates person values to household level + 3. Totals are preserved (verified: $56,465,248 at both person and household levels) + 4. 
Database already contains the target (target_id=5, $385B, stratum_group_id=1) + 5. Your generic architecture works perfectly for any entity type + + The plan includes: + - Complete verification of entity aggregation + - Step-by-step testing procedures + - Combined SNAP + health insurance usage pattern + - Success criteria and quick reference commands + ⎿ Tool use rejected with user message: Do not do Step 5, rather, please create + a "runner" program that generates X_sparse and the target vector and prints + some diagnostics. You can use the first 100 lines of + /home/baogorek/devl/policyengine-us-data/policyengine_us_data/datasets/cps/l + ocal_area_calibration/calibrate_cds_sparse.py (from the junkyard) as an + example. Remember that you have access to the tracer in this repo as well. + + +● Read(~/devl/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_ + calibration/calibrate_cds_sparse.py) + ⎿  Read 100 lines + +● Search(pattern: "**/tracer*.py") + ⎿  Found 0 files + +● Search(pattern: "**/*tracer*.py") + ⎿  Found 1 file (ctrl+o to expand) + +● Read(matrix_tracer.py) + ⎿  Read 384 lines + +● Updated plan + ⎿  /plan to preview + 360 - How person-level targets work alongside spm_unit-level targe + ts + 361 - Interpretation of the resulting matrix structure + 362 + 363 -### Step 5: Calibration Integration + 364 -Use the matrix in calibration workflow: + 363 +### Step 5: Create Runner Program with Diagnostics + 364 +Create a runner script `test_combined_targets.py` that: + 365 +- Builds X_sparse with SNAP + health insurance targets + 366 +- Extracts target vector + 367 +- Uses MatrixTracer to print diagnostics + 368 + + 369 +Pattern based on first 100 lines of junkyard's `calibrate_cds_ + +sparse.py`: + 370 + + 371 ```python + 366 -# After building matrix with combined targets + 367 -from your_calibration_module import solve_weights + 372 +""" + 373 +Test runner for combined SNAP + health insurance calibration t + +argets. 
+ 374 +Generates X_sparse and target vector, prints diagnostics using + + MatrixTracer. + 375 +""" + 376 +import logging + 377 +from pathlib import Path + 378 + 369 -# Solve for weights + 370 -weights = solve_weights(X_sparse, targets_df["value"].values) + 379 +logging.basicConfig( + 380 + level=logging.INFO, + 381 + format="%(asctime)s - %(levelname)s - %(message)s" + 382 +) + 383 + 372 -# Verify calibration matches targets + 373 -calibrated = X_sparse.T @ weights + 374 -targets = targets_df["value"].values + 384 +import numpy as np + 385 +import pandas as pd + 386 +from policyengine_us import Microsimulation + 387 +from policyengine_us_data.datasets.cps.local_area_calibration. + +sparse_matrix_builder import SparseMatrixBuilder + 388 +from policyengine_us_data.datasets.cps.local_area_calibration. + +matrix_tracer import MatrixTracer + 389 +from policyengine_us_data.datasets.cps.local_area_calibration. + +calibration_utils import ( + 390 + create_target_groups, + 391 +) + 392 + 376 -for idx, row in targets_df.iterrows(): + 377 - print(f"{row['variable']}: target={row['value']:,.0f}, cal + -ibrated={calibrated[idx]:,.0f}") + 393 +# ============================================================ + +================ + 394 +# CONFIGURATION + 395 +# ============================================================ + +================ + 396 +db_path = "policyengine_us_data/storage/calibration/policy_dat + +a.db" + 397 +db_uri = f"sqlite:///{db_path}" + 398 +time_period = 2024 + 399 + + 400 +# Test with a few CDs from different states + 401 +cds_to_calibrate = [ + 402 + '0101', # Alabama CD-1 + 403 + '0601', # California CD-1 + 404 + '0602', # California CD-2 + 405 + '3601', # New York CD-1 + 406 + '4801', # Texas CD-1 + 407 +] + 408 + + 409 +print(f"Testing with {len(cds_to_calibrate)} congressional dis + +tricts") + 410 + + 411 +# ============================================================ + +================ + 412 +# STEP 1: LOAD SIMULATION + 413 +# 
============================================================ + +================ + 414 +print("\nLoading simulation...") + 415 +sim = Microsimulation() + 416 +n_households = len(sim.calculate("household_id", map_to="house + +hold").values) + 417 +print(f"Loaded {n_households:,} households") + 418 + + 419 +# ============================================================ + +================ + 420 +# STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS + 421 +# ============================================================ + +================ + 422 +print("\nBuilding sparse matrix...") + 423 +builder = SparseMatrixBuilder( + 424 + db_uri=db_uri, + 425 + time_period=time_period, + 426 + cds_to_calibrate=cds_to_calibrate, + 427 + dataset_path=None + 428 +) + 429 + + 430 +# Query both SNAP (stratum_group_id=4) and national targets (s + +tratum_group_id=1) + 431 +targets_df, X_sparse, household_id_mapping = builder.build_mat + +rix( + 432 + sim, + 433 + target_filter={"stratum_group_ids": [1, 4]} + 434 +) + 435 + + 436 +print(f"\nMatrix built successfully:") + 437 +print(f" Shape: {X_sparse.shape}") + 438 +print(f" Targets: {len(targets_df)}") + 439 +print(f" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X + +_sparse.shape[1]):.4%}") + 440 + + 441 +# ============================================================ + +================ + 442 +# STEP 3: EXTRACT TARGET VECTOR + 443 +# ============================================================ + +================ + 444 +target_vector = targets_df["value"].values + 445 +print(f"\nTarget vector shape: {target_vector.shape}") + 446 +print(f"Target total: ${target_vector.sum():,.0f}") + 447 + + 448 +# ============================================================ + +================ + 449 +# STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION + 450 +# ============================================================ + +================ + 451 +print("\n" + "="*80) + 452 +print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS") + 453 +print("="*80) + 454 + 
+ 455 +health_ins_targets = targets_df[ + 456 + targets_df["variable"] == "health_insurance_premiums_witho + +ut_medicare_part_b" + 457 +] + 458 + + 459 +if len(health_ins_targets) > 0: + 460 + print(f"\nFound {len(health_ins_targets)} health insurance + + target(s):") + 461 + print(health_ins_targets[["target_id", "variable", "value" + +, "geographic_id", "stratum_group_id"]]) + 462 + + 463 + # Get the row index for health insurance + 464 + health_ins_idx = health_ins_targets.index[0] + 465 + health_ins_row = X_sparse[health_ins_idx, :] + 466 + + 467 + print(f"\nMatrix row {health_ins_idx} (health insurance):" + +) + 468 + print(f" Non-zero entries: {health_ins_row.nnz:,}") + 469 + print(f" Row sum: ${health_ins_row.sum():,.0f}") + 470 + print(f" Target value: ${health_ins_targets.iloc[0]['valu + +e']:,.0f}") + 471 + + 472 + # Verify person-to-household aggregation + 473 + person_total = sim.calculate( + 474 + 'health_insurance_premiums_without_medicare_part_b', + 475 + time_period, + 476 + map_to='person' + 477 + ).values.sum() + 478 + + 479 + household_total = sim.calculate( + 480 + 'health_insurance_premiums_without_medicare_part_b', + 481 + time_period, + 482 + map_to='household' + 483 + ).values.sum() + 484 + + 485 + print(f"\nEntity aggregation verification:") + 486 + print(f" Person-level total: ${person_total:,.0f}") + 487 + print(f" Household-level total: ${household_total:,.0f}") + 488 + print(f" Match: {abs(person_total - household_total) < 1} + +") + 489 + + 490 +else: + 491 + print("\nWARNING: No health insurance targets found!") + 492 + + 493 +# ============================================================ + +================ + 494 +# STEP 5: SNAP TARGET SUMMARY + 495 +# ============================================================ + +================ + 496 +print("\n" + "="*80) + 497 +print("SNAP TARGET SUMMARY") + 498 +print("="*80) + 499 + + 500 +snap_targets = targets_df[targets_df["variable"] == "snap"] + 501 +household_count_targets = 
targets_df[targets_df["variable"] == + + "household_count"] + 502 + + 503 +print(f"\nSNAP benefit targets: {len(snap_targets)}") + 504 +print(f"Household count targets: {len(household_count_targets) + +}") + 505 + + 506 +if len(snap_targets) > 0: + 507 + print(f"\nSNAP total (all states): ${snap_targets['value'] + +.sum():,.0f}") + 508 + print(f"\nSample SNAP targets:") + 509 + print(snap_targets[["target_id", "variable", "value", "geo + +graphic_id"]].head(10)) + 510 + + 511 +# ============================================================ + +================ + 512 +# STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS + 513 +# ============================================================ + +================ + 514 +print("\n" + "="*80) + 515 +print("MATRIX TRACER DIAGNOSTICS") + 516 +print("="*80) + 517 + + 518 +tracer = MatrixTracer( + 519 + targets_df=targets_df, + 520 + matrix=X_sparse, + 521 + household_id_mapping=household_id_mapping, + 522 + geographic_ids=cds_to_calibrate, + 523 + sim=sim + 524 +) + 525 + + 526 +# Print comprehensive matrix structure + 527 +tracer.print_matrix_structure(show_groups=True) + 528 + + 529 +# ============================================================ + +================ + 530 +# STEP 7: TARGET GROUP ANALYSIS + 531 +# ============================================================ + +================ + 532 +print("\n" + "="*80) + 533 +print("TARGET GROUP ANALYSIS") + 534 +print("="*80) + 535 + + 536 +target_groups, group_info = create_target_groups(targets_df) + 537 + + 538 +print(f"\nTotal target groups: {len(group_info)}") + 539 +for group_id, info in enumerate(group_info): + 540 + group_mask = target_groups == group_id + 541 + n_targets_in_group = group_mask.sum() + 542 + print(f" Group {group_id}: {info} ({n_targets_in_group} t + +argets)") + 543 + + 544 +print("\n" + "="*80) + 545 +print("RUNNER COMPLETED SUCCESSFULLY") + 546 +print("="*80) + 547 ``` + 548 + 549 +**Key Features**: + 550 +- Loads simulation once + 551 +- Builds 
matrix with combined targets using OR filter + 552 +- Verifies health insurance person→household aggregation + 553 +- Uses MatrixTracer for comprehensive diagnostics + 554 +- Prints target group analysis + 555 +- Follows junkyard pattern but with new sparse_matrix_builder + 556 + + 557 ## Success Criteria + 558 + 559 1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 + , 4]}` filter + +● Updated plan + ⎿  /plan to preview + 556 + 557 ## Success Criteria + 558 + 559 -1. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 + -, 4]}` filter + 560 -2. ✓ Health insurance row appears in targets_df with correct v + -alue ($385B) + 561 -3. ✓ Matrix shape is (n_targets, n_households × n_cds) where n + -_targets ≈ 558 (20 national + 538 SNAP) + 562 -4. ✓ Person-to-household aggregation preserves totals + 563 -5. ✓ Matrix row for health insurance contains household-aggreg + -ated premium values + 564 -6. ✓ No code changes needed (only usage pattern demonstration) + 559 +1. ✓ Runner script executes successfully without errors + 560 +2. ✓ Matrix builds successfully with `{"stratum_group_ids": [1 + +, 4]}` filter + 561 +3. ✓ Health insurance target appears in targets_df with correc + +t value ($385B) + 562 +4. ✓ Matrix shape is (n_targets, n_households × n_cds) where n + +_targets ≈ 558 (20 national + 538 SNAP) + 563 +5. ✓ Person-to-household aggregation preserves totals (verifie + +d in runner output) + 564 +6. ✓ MatrixTracer prints comprehensive diagnostics + 565 +7. ✓ Target groups are correctly identified and displayed + 566 +8. 
✓ No code changes to sparse_matrix_builder.py (only new run + +ner script) + 567 + 568 ## Quick Reference + 569 + +● Updated plan + ⎿  /plan to preview + 569 + 570 ### Critical Files + 571 - **No changes**: `sparse_matrix_builder.py` (already supports + person-level targets) + 572 +- **New file**: `test_combined_targets.py` (runner program to + +create and verify) + 573 - **Database**: `policyengine_us_data/storage/calibration/poli + cy_data.db` (contains target_id=5) + 574 +- **Tracer**: `matrix_tracer.py` (used for diagnostics in runn + +er) + 575 - **Environment**: `~/envs/sep` (for this development branch) + 576 + 577 ### Key Commands + ... + 579 # Activate environment + 580 source ~/envs/sep/bin/activate + 581 + 580 -# Query database for health insurance target + 582 +# Navigate to project directory + 583 cd ~/devl/sep/policyengine-us-data + 584 + + 585 +# Query database for health insurance target + 586 python -c "import sqlite3; conn = sqlite3.connect('policyengin + e_us_data/storage/calibration/policy_data.db'); cursor = conn. + cursor(); cursor.execute('SELECT * FROM targets WHERE variable + = \"health_insurance_premiums_without_medicare_part_b\"'); pr + int(cursor.fetchone())" + 587 + 584 -# Run verification tests + 585 -python test_combined_targets.py # Create this test script per + - Step 2 above + 588 +# Run the combined targets runner (after creating per Step 5) + 589 +python test_combined_targets.py + 590 ``` + 591 + 592 ### Target Filter Examples + +● Perfect! I've updated the plan to include a comprehensive runner program (Step 5) that will: + - Build X_sparse with combined SNAP + health insurance targets + - Extract the target vector + - Use MatrixTracer for detailed diagnostics + - Verify person→household aggregation + - Print comprehensive analysis + + The runner follows the pattern from the junkyard's calibrate_cds_sparse.py but uses your new generic sparse_matrix_builder.py. 
+ +● User approved Claude's plan + ⎿  Plan saved to: ~/.claude/plans/velvet-rolling-dawn.md · /plan to edit + + Implementation Plan: Adding Health Insurance Premiums to Calibration + + TL;DR + + ZERO CODE CHANGES NEEDED 🎉 + + The current sparse_matrix_builder.py already supports person-level + targets through PolicyEngine's map_to="household" parameter, which + automatically aggregates person-level values to household level. + Simply use: + + builder.build_matrix(sim, target_filter={"stratum_group_ids": [1, 4]}) + # SNAP + health insurance + + This works because: + - Health insurance premium is in the database (target_id=5, $385B) + - PolicyEngine auto-aggregates person→household (preserves totals) + - No hard-coded variable names in the builder + + Executive Summary + + Key Finding: The current sparse_matrix_builder.py implementation + already supports person-level targets without any code changes needed. + PolicyEngine's map_to="household" parameter automatically aggregates + person-level values to household level while preserving totals. 
+ + Background + + Current State + + - Working targets: SNAP (stratum_group_id=4, spm_unit-level, 538 + targets: 51 state snap amounts + 487 household_count targets) + - New target: health_insurance_premiums_without_medicare_part_b + (person-level, stratum_group_id=1, 1 national target) + - Repository: Development branch at ~/devl/sep/policyengine-us-data + (use ~/envs/sep) + - Junkyard: Reference implementation at ~/devl/policyengine-us-data + (use ~/envs/pe) + + Key Differences + + - SNAP: spm_unit-level variable with state-specific constraints and + two variables (snap dollars + household_count) + - Health Insurance: person-level variable, single national target (no + geographic constraints), only one variable + + Investigation Results + + Database Analysis + + -- Health insurance premium target in policy_data.db + target_id: 5 + variable: 'health_insurance_premiums_without_medicare_part_b' + value: $385,000,000,000 (2024) + stratum_id: 1 + stratum_group_id: 1 (national hardcoded targets) + constraints: NONE (applies to entire nation) + + Stratum group 1 contains 20 national hardcoded targets including + medicaid, snap (national), ssi, tanf, social_security, rent, etc. + + Entity Aggregation Verification + + # Verified with PolicyEngine Microsimulation + Person-level array: 52,826 persons, sum = $56,465,248 + Household-level array: 21,045 households, sum = $56,465,248 + Entity type: 'person' + Sums match: True ✓ + + Conclusion: PolicyEngine's map_to="household" automatically sums + person-level values to household level, preserving totals perfectly. + + Current Implementation Analysis + + The generic code at sparse_matrix_builder.py:220-222: + target_values = state_sim.calculate( + target["variable"], map_to="household" + ).values + masked_values = (target_values * mask).astype(np.float32) + + This works for ANY entity type because: + 1. PolicyEngine detects the variable's entity type from the tax + benefit system + 2. 
map_to="household" triggers automatic aggregation (sum for amounts, + count for _count variables) + 3. No explicit entity handling code needed + + Implementation Plan + + Phase 1: No Code Changes Needed ✓ + + The sparse_matrix_builder.py requires zero modifications to support + person-level targets. + + Why: The implementation is already entity-agnostic via PolicyEngine's + map_to parameter. + + Phase 2: Usage Pattern + + To add health insurance premiums to calibration, use the appropriate + target filter: + + Option A: Include specific stratum_group_id + builder = SparseMatrixBuilder(db_uri, time_period, cds_to_calibrate, + dataset_path) + + # Just health insurance (national hardcoded targets) + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={"stratum_group_ids": [1]} # 20 national targets + ) + + # SNAP only (state-level targets) + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={"stratum_group_ids": [4]} # 538 SNAP targets + ) + + # Both SNAP and national targets together + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={"stratum_group_ids": [1, 4]} # 558 targets total + ) + + Option B: Include specific variable(s) + # Just health insurance premiums + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={"variables": + ["health_insurance_premiums_without_medicare_part_b"]} + ) + + # Multiple specific variables + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={ + "variables": ["snap", + "health_insurance_premiums_without_medicare_part_b"] + } + ) + + Option C: Include specific stratum_id or target_id + # Using stratum_id + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={"stratum_ids": [1]} # National stratum + ) + + # Using target_id + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + 
target_filter={"target_ids": [5]} # Health insurance target + ) + + Phase 3: Matrix Structure + + The resulting sparse matrix will have: + - Rows: Number of targets (1 for health insurance only, 558 for SNAP + + health insurance) + - Columns: n_households × n_cds (e.g., 21,045 households × 19 CDs = + 399,855 columns) + - Values: For health insurance row, each column contains the sum of + person-level premiums for that household + - Sparsity: ~99.95% sparse (most households have zero premiums) + + Phase 4: Constraint Handling + + For health insurance premiums (no constraints): + - All households included (mask = all True) + - Each household gets sum of all persons' premiums in that household + - Same value repeated across all CDs (since there's no geographic + constraint) + + For SNAP (with state constraints): + - Constraints filtered by state_fips and snap>0 + - Only qualifying households get non-zero values + - Values vary by state due to state-specific SNAP rules + + Phase 5: Verification + + Test 1: Basic Matrix Construction + # Create builder with policy_data.db + builder = SparseMatrixBuilder( + db_uri="sqlite:///policyengine_us_data/storage/calibration/policy_ + data.db", + time_period=2024, + cds_to_calibrate=['0101', '0102', ...], # Example CDs + dataset_path=None # Or path to CPS dataset + ) + + # Build matrix with health insurance + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={"variables": + ["health_insurance_premiums_without_medicare_part_b"]} + ) + + # Verify + assert len(targets_df) == 1 + assert targets_df.iloc[0]["variable"] == + "health_insurance_premiums_without_medicare_part_b" + assert targets_df.iloc[0]["value"] == 385_000_000_000 + assert X_sparse.shape[0] == 1 + assert X_sparse.sum() > 0 + + Test 2: Combined Targets + # Build matrix with both SNAP and health insurance + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={"stratum_group_ids": [1, 4]} + ) + + # 
Verify + assert len(targets_df) == 558 # 20 national + 538 SNAP + health_ins_rows = targets_df[ + targets_df["variable"] == + "health_insurance_premiums_without_medicare_part_b" + ] + assert len(health_ins_rows) == 1 + + Test 3: Entity Aggregation + # Verify person-to-household aggregation preserves totals + sim = Microsimulation(dataset=dataset_path) + + person_total = sim.calculate( + 'health_insurance_premiums_without_medicare_part_b', + 2024, + map_to='person' + ).values.sum() + + household_total = sim.calculate( + 'health_insurance_premiums_without_medicare_part_b', + 2024, + map_to='household' + ).values.sum() + + assert np.isclose(person_total, household_total, rtol=1e-6) + + Critical Files + + No Modifications Needed + + - ✓ sparse_matrix_builder.py - Already supports person-level targets + + Database (Already Contains Target) + + - ✓ policyengine_us_data/storage/calibration/policy_data.db - Contains + target_id=5 + + Documentation/Examples + + - May need to update: ~/docs/local_area_calibration_setup.ipynb - Show + health insurance example + + Architecture Decisions + + Why No Code Changes? + + 1. PolicyEngine's Generic Design: map_to parameter handles all entity + aggregations + 2. Database-Driven Approach: All target metadata comes from + policy_data.db + 3. Constraint-Based Filtering: Generic mask application works for any + entity type + 4. 
No Hard-Coding: Variable names never appear in the builder logic + + Entity Aggregation Semantics + + - Amount variables (like premiums): SUM from entity to household + - Count variables (like person_count): COUNT unique entities per + household + - Detection: PolicyEngine uses variable metadata from + tax_benefit_system + + Limitations (Current Implementation) + + - Assumes all targets should be aggregated to household level for + calibration + - No support for tax_unit or spm_unit level calibration (only + household-level weights) + - This is correct for geo-stacking where we reweight households within + CDs + + Risk Analysis + + Low Risk + + - ✓ No code changes means no regression risk + - ✓ Existing SNAP functionality unchanged + - ✓ Database already contains correct target definition + + Medium Risk + + - ⚠️ If user wants state-level health insurance targets, those need to + be added to database first + - ⚠️ Performance: Person-level aggregation is more expensive than + household-level (but PolicyEngine caches this) + + Zero Risk Items + + - ✓ Entity detection is automatic via PolicyEngine + - ✓ Total preservation is guaranteed by PolicyEngine's aggregation + logic + - ✓ Generic architecture means extensible to any future entity types + + User Decisions + + ✓ Target Scope: Use national target only (existing target_id=5, no + database changes) + ✓ Usage Pattern: Combined with SNAP (test multi-target calibration) + ✓ Geographic Scope: National aggregate is correct (no state variation + needed) + + Implementation Steps (Post-Approval) + + Step 1: Verify Database Target + + Confirm health insurance premium target exists and is active: + # Using ~/envs/sep environment + import sqlite3 + conn = sqlite3.connect('policyengine_us_data/storage/calibration/polic + y_data.db') + cursor = conn.cursor() + cursor.execute(""" + SELECT target_id, variable, value, period, stratum_id, active + FROM targets + WHERE variable = + 'health_insurance_premiums_without_medicare_part_b' 
+ """) + print(cursor.fetchone()) + Expected: (5, 'health_insurance_premiums_without_medicare_part_b', + 385000000000.0, 2024, 1, 1) + + Step 2: Test Combined Target Matrix Building + + Create a test script to build matrix with both SNAP and health + insurance: + + from policyengine_us import Microsimulation + from policyengine_us_data.datasets.cps.local_area_calibration.sparse_m + atrix_builder import SparseMatrixBuilder + + # Load simulation + sim = Microsimulation() + + # Create builder + builder = SparseMatrixBuilder( + db_uri="sqlite:///policyengine_us_data/storage/calibration/policy_ + data.db", + time_period=2024, + cds_to_calibrate=['0101', '0601', '0602'], # Example CDs: AL-1, + CA-1, CA-2 + dataset_path=None + ) + + # Build matrix with SNAP + health insurance + targets_df, X_sparse, household_mapping = builder.build_matrix( + sim, + target_filter={"stratum_group_ids": [1, 4]} # National + SNAP + targets + ) + + print(f"Total targets: {len(targets_df)}") + print(f"Matrix shape: {X_sparse.shape}") + print(f"Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * + X_sparse.shape[1]):.2%}") + + # Verify health insurance target is included + health_ins = targets_df[ + targets_df["variable"] == + "health_insurance_premiums_without_medicare_part_b" + ] + print(f"\nHealth insurance target:") + print(health_ins[["target_id", "variable", "value", "geographic_id"]]) + + Step 3: Verify Entity Aggregation + + Confirm person-to-household aggregation works correctly: + + # Verify totals are preserved + person_total = sim.calculate( + 'health_insurance_premiums_without_medicare_part_b', + 2024, + map_to='person' + ).values.sum() + + household_total = sim.calculate( + 'health_insurance_premiums_without_medicare_part_b', + 2024, + map_to='household' + ).values.sum() + + print(f"Person-level total: ${person_total:,.0f}") + print(f"Household-level total: ${household_total:,.0f}") + print(f"Match: {abs(person_total - household_total) < 1}") + + # Check matrix row sum for health 
insurance + health_ins_idx = targets_df[ + targets_df["variable"] == + "health_insurance_premiums_without_medicare_part_b" + ].index[0] + matrix_total = X_sparse[health_ins_idx, :].sum() / + len(cds_to_calibrate) # Adjust for stacking + print(f"Matrix row total (adjusted): ${matrix_total:,.0f}") + + Step 4: Update Documentation + + Add example to ~/docs/local_area_calibration_setup.ipynb showing: + - How to filter for multiple target groups + - How person-level targets work alongside spm_unit-level targets + - Interpretation of the resulting matrix structure + + Step 5: Create Runner Program with Diagnostics + + Create a runner script test_combined_targets.py that: + - Builds X_sparse with SNAP + health insurance targets + - Extracts target vector + - Uses MatrixTracer to print diagnostics + + Pattern based on first 100 lines of junkyard's + calibrate_cds_sparse.py: + + """ + Test runner for combined SNAP + health insurance calibration targets. + Generates X_sparse and target vector, prints diagnostics using + MatrixTracer. 
+ """ + import logging + from pathlib import Path + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s" + ) + + import numpy as np + import pandas as pd + from policyengine_us import Microsimulation + from policyengine_us_data.datasets.cps.local_area_calibration.sparse_m + atrix_builder import SparseMatrixBuilder + from + policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer + import MatrixTracer + from policyengine_us_data.datasets.cps.local_area_calibration.calibrat + ion_utils import ( + create_target_groups, + ) + + # ==================================================================== + ======== + # CONFIGURATION + # ==================================================================== + ======== + db_path = "policyengine_us_data/storage/calibration/policy_data.db" + db_uri = f"sqlite:///{db_path}" + time_period = 2024 + + # Test with a few CDs from different states + cds_to_calibrate = [ + '0101', # Alabama CD-1 + '0601', # California CD-1 + '0602', # California CD-2 + '3601', # New York CD-1 + '4801', # Texas CD-1 + ] + + print(f"Testing with {len(cds_to_calibrate)} congressional districts") + + # ==================================================================== + ======== + # STEP 1: LOAD SIMULATION + # ==================================================================== + ======== + print("\nLoading simulation...") + sim = Microsimulation() + n_households = len(sim.calculate("household_id", + map_to="household").values) + print(f"Loaded {n_households:,} households") + + # ==================================================================== + ======== + # STEP 2: BUILD SPARSE MATRIX WITH COMBINED TARGETS + # ==================================================================== + ======== + print("\nBuilding sparse matrix...") + builder = SparseMatrixBuilder( + db_uri=db_uri, + time_period=time_period, + cds_to_calibrate=cds_to_calibrate, + dataset_path=None + ) + + # Query both SNAP 
(stratum_group_id=4) and national targets + (stratum_group_id=1) + targets_df, X_sparse, household_id_mapping = builder.build_matrix( + sim, + target_filter={"stratum_group_ids": [1, 4]} + ) + + print(f"\nMatrix built successfully:") + print(f" Shape: {X_sparse.shape}") + print(f" Targets: {len(targets_df)}") + print(f" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * + X_sparse.shape[1]):.4%}") + + # ==================================================================== + ======== + # STEP 3: EXTRACT TARGET VECTOR + # ==================================================================== + ======== + target_vector = targets_df["value"].values + print(f"\nTarget vector shape: {target_vector.shape}") + print(f"Target total: ${target_vector.sum():,.0f}") + + # ==================================================================== + ======== + # STEP 4: HEALTH INSURANCE PREMIUM VERIFICATION + # ==================================================================== + ======== + print("\n" + "="*80) + print("HEALTH INSURANCE PREMIUM TARGET ANALYSIS") + print("="*80) + + health_ins_targets = targets_df[ + targets_df["variable"] == + "health_insurance_premiums_without_medicare_part_b" + ] + + if len(health_ins_targets) > 0: + print(f"\nFound {len(health_ins_targets)} health insurance + target(s):") + print(health_ins_targets[["target_id", "variable", "value", + "geographic_id", "stratum_group_id"]]) + + # Get the row index for health insurance + health_ins_idx = health_ins_targets.index[0] + health_ins_row = X_sparse[health_ins_idx, :] + + print(f"\nMatrix row {health_ins_idx} (health insurance):") + print(f" Non-zero entries: {health_ins_row.nnz:,}") + print(f" Row sum: ${health_ins_row.sum():,.0f}") + print(f" Target value: + ${health_ins_targets.iloc[0]['value']:,.0f}") + + # Verify person-to-household aggregation + person_total = sim.calculate( + 'health_insurance_premiums_without_medicare_part_b', + time_period, + map_to='person' + ).values.sum() + + household_total = 
sim.calculate( + 'health_insurance_premiums_without_medicare_part_b', + time_period, + map_to='household' + ).values.sum() + + print(f"\nEntity aggregation verification:") + print(f" Person-level total: ${person_total:,.0f}") + print(f" Household-level total: ${household_total:,.0f}") + print(f" Match: {abs(person_total - household_total) < 1}") + + else: + print("\nWARNING: No health insurance targets found!") + + # ==================================================================== + ======== + # STEP 5: SNAP TARGET SUMMARY + # ==================================================================== + ======== + print("\n" + "="*80) + print("SNAP TARGET SUMMARY") + print("="*80) + + snap_targets = targets_df[targets_df["variable"] == "snap"] + household_count_targets = targets_df[targets_df["variable"] == + "household_count"] + + print(f"\nSNAP benefit targets: {len(snap_targets)}") + print(f"Household count targets: {len(household_count_targets)}") + + if len(snap_targets) > 0: + print(f"\nSNAP total (all states): + ${snap_targets['value'].sum():,.0f}") + print(f"\nSample SNAP targets:") + print(snap_targets[["target_id", "variable", "value", + "geographic_id"]].head(10)) + + # ==================================================================== + ======== + # STEP 6: USE MATRIX TRACER FOR DETAILED DIAGNOSTICS + # ==================================================================== + ======== + print("\n" + "="*80) + print("MATRIX TRACER DIAGNOSTICS") + print("="*80) + + tracer = MatrixTracer( + targets_df=targets_df, + matrix=X_sparse, + household_id_mapping=household_id_mapping, + geographic_ids=cds_to_calibrate, + sim=sim + ) + + # Print comprehensive matrix structure + tracer.print_matrix_structure(show_groups=True) + + # ==================================================================== + ======== + # STEP 7: TARGET GROUP ANALYSIS + # ==================================================================== + ======== + print("\n" + "="*80) + print("TARGET GROUP 
ANALYSIS") + print("="*80) + + target_groups, group_info = create_target_groups(targets_df) + + print(f"\nTotal target groups: {len(group_info)}") + for group_id, info in enumerate(group_info): + group_mask = target_groups == group_id + n_targets_in_group = group_mask.sum() + print(f" Group {group_id}: {info} ({n_targets_in_group} + targets)") + + print("\n" + "="*80) + print("RUNNER COMPLETED SUCCESSFULLY") + print("="*80) + + Key Features: + - Loads simulation once + - Builds matrix with combined targets using OR filter + - Verifies health insurance person→household aggregation + - Uses MatrixTracer for comprehensive diagnostics + - Prints target group analysis + - Follows junkyard pattern but with new sparse_matrix_builder + + Success Criteria + + 1. ✓ Runner script executes successfully without errors + 2. ✓ Matrix builds successfully with {"stratum_group_ids": [1, 4]} + filter + 3. ✓ Health insurance target appears in targets_df with correct value + ($385B) + 4. ✓ Matrix shape is (n_targets, n_households × n_cds) where n_targets + ≈ 558 (20 national + 538 SNAP) + 5. ✓ Person-to-household aggregation preserves totals (verified in + runner output) + 6. ✓ MatrixTracer prints comprehensive diagnostics + 7. ✓ Target groups are correctly identified and displayed + 8. 
✓ No code changes to sparse_matrix_builder.py (only new runner + script) + + Quick Reference + + Critical Files + + - No changes: sparse_matrix_builder.py (already supports person-level + targets) + - New file: test_combined_targets.py (runner program to create and + verify) + - Database: policyengine_us_data/storage/calibration/policy_data.db + (contains target_id=5) + - Tracer: matrix_tracer.py (used for diagnostics in runner) + - Environment: ~/envs/sep (for this development branch) + + Key Commands + + # Activate environment + source ~/envs/sep/bin/activate + + # Navigate to project directory + cd ~/devl/sep/policyengine-us-data + + # Query database for health insurance target + python -c "import sqlite3; conn = sqlite3.connect('policyengine_us_dat + a/storage/calibration/policy_data.db'); cursor = conn.cursor(); + cursor.execute('SELECT * FROM targets WHERE variable = + \"health_insurance_premiums_without_medicare_part_b\"'); + print(cursor.fetchone())" + + # Run the combined targets runner (after creating per Step 5) + python test_combined_targets.py + + Target Filter Examples + + # SNAP only + {"stratum_group_ids": [4]} + + # Health insurance only + {"variables": ["health_insurance_premiums_without_medicare_part_b"]} + + # Both SNAP and health insurance (recommended) + {"stratum_group_ids": [1, 4]} + + + + + + + + +● Excellent! The plan is approved. Let me create a todo list and start implementing the runner program. 
+ +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && ~/envs/sep/bin/python + -c " + import sqlite3…) + ⎿  Database target verification: + target_id: 5 + variable: health_insurance_premiums_without_medicare_part_b + … +4 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Write(test_combined_targets.py) + ⎿  Wrote 188 lines to test_combined_targets.py + 1 """ + 2 Test runner for combined SNAP + health insurance calibration targ + ets. + 3 Generates X_sparse and target vector, prints diagnostics using Ma + trixTracer. + 4 """ + 5 import logging + 6 logging.basicConfig( + 7 level=logging.INFO, + 8 format="%(asctime)s - %(levelname)s - %(message)s" + 9 ) + … +178 lines (ctrl+o to expand) + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && timeout: + ~/envs/sep/bin/python policyengine_us_data/datasets/cps/local_ 5m 0s + area_calibration/test_combined_targets.py 2>&1) + ⎿ 2026-01-15 09:12:53,215 - INFO - NumExpr defaulting to 16 threads. + 2026-01-15 09:13:03,329 - INFO - HTTP Request: GET + https://huggingface. + co/api/models/policyengine/policyengine-us-data "HTTP/1.1 200 OK" + … +301 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● The runner executed successfully. Let me update the todos and summarize the results. + +● All tasks completed successfully. 
Here's a summary: + + Results + + Matrix Structure + + - Shape: (558 targets, 105,225 columns = 21,045 households × 5 CDs) + - Targets breakdown: 20 national (stratum_group_id=1) + 538 SNAP (stratum_group_id=4) + - Sparsity: 98.82% + + Health Insurance Premium Verification + + - Target found: target_id=5, value=$385B (2024) + - Matrix row: 66,150 non-zero entries, row sum = $282,326,240 + - Entity aggregation: Person-level total = Household-level total = $56,465,248 ✓ + - Math check: Row sum $282,326,240 = 5 CDs × $56,465,248 ✓ + + Key Insights + + 1. No code changes to sparse_matrix_builder.py - it already handles person-level targets via PolicyEngine's map_to="household" aggregation + 2. Health insurance is Group 4 in the target groups: "National Health Insurance Premiums Without Medicare Part B (1 target, value=385,000,000,000)" + 3. Combined targets work correctly - the OR filter {"stratum_group_ids": [1, 4]} successfully retrieves both national and SNAP targets + + Files Created + + - /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/datasets/cps/local_area_calibration/test_combined_targets.py - Runner program with diagnostics + + The health insurance premium target is now fully integrated into your calibration system with zero code changes to the core builder. + +✻ Baked for 13m 5s + +! git status + ⎿  On branch health-insurance-premiums + Untracked files: + (use "git add ..." to include in what will be committed) + … +4 lines (ctrl+o to expand) + +❯ Ok I see you have a test created, which is nice, but I was hoping for a + runner file too. Did you see that in my last request? It would create + sparse X matrix and run the tracer. + +● The file I created (test_combined_targets.py) is the runner - it generates X_sparse, extracts the target vector, and runs the MatrixTracer for diagnostics. It ran successfully and produced the output you saw. + + Perhaps the name is confusing? 
Would you like me to rename it to something clearer like run_combined_targets.py? Or is there something additional you'd like the runner to do that I missed? + +! ls test* + ⎿  test_combined_targets.py + +❯ Oh! Yeah let's not call this file "test*". Eventually it will build the + real matrix! Now it will only have snap and our new variable in it. It's + going to be our procedural script that will be an integral part of the + machine learning flow! + +● Got it! Let me rename it to something more appropriate for a production runner. + +● Bash(mv test_combined_targets.py build_calibration_matrix.py) + ⎿  (No content) + +● Read(build_calibration_matrix.py) + ⎿  Read 10 lines + +● Update(build_calibration_matrix.py) + ⎿  Added 1 line, removed 1 line + 1 """ + 2 -Test runner for combined SNAP + health insurance calibration tar + -gets. + 2 +Build calibration matrix for geo-stacking reweighting. + 3 Generates X_sparse and target vector, prints diagnostics using M + atrixTracer. + 4 """ + 5 import logging + +● Done! Renamed to build_calibration_matrix.py with an updated docstring. This is now your production runner for the ML flow. + +● Bash(ls -la build_calibration_matrix.py) + ⎿ -rw-rw-r-- 1 baogorek baogorek 6266 Jan 15 09:19 + build_calibration_matr + ix.py + +● Ready to go: + python build_calibration_matrix.py + +❯ Ok there are some problems here. targets_df has 558 rows and I see + variables like alimony expense in there. I thought we were just adding + our new variable! Remember, it's one variable at a time! also, wy are you + bringing in a default microsimulation? Remember that the "base h5 + dataset" is a very important concept in this local area stacking + approach! That base dataset it ... we'll I'll let you tell me what it + should be. + +● You're right on both counts. Let me fix this: + + 1. Target filter should be specific: Only SNAP (stratum_group_id=4) + the specific health insurance variable - not all of stratum_group_id=1 + 2. 
Base dataset: For local area stacking, this should be the extended CPS dataset that's been prepared for geo-stacking (likely extended_cps_2024.h5 or similar in the storage directory) + + Let me check what's available: + +● Bash(ls -la /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/ + storage/*.h5 2>/dev/null || echo "No h5 files in storage root") + ⎿ -rw-rw-r-- 1 baogorek baogorek 446389680 Dec 24 08:03 + /home/baogorek/de + vl/sep/policyengine-us-data/policyengine_us_data/storage/acs_2022.h5 + -rw-rw-r-- 1 baogorek baogorek 789139664 Dec 24 08:03 + /home/baogorek/de + … +78 lines (ctrl+o to expand) + +● I see extended_cps_2024.h5 - that's the extended CPS which has the geographic identifiers and enhanced variables needed for geo-stacking. + + Let me fix both issues: + +● Read(build_calibration_matrix.py) + ⎿  Read 188 lines + +● Update(build_calibration_matrix.py) + ⎿  Added 9 lines, removed 6 lines + 25 # ============================================================= + =============== + 26 # CONFIGURATION + 27 # ============================================================= + =============== + 28 -db_path = ( + 29 - "policyengine_us_data/storage/calibration/policy_data.db" + 30 -) + 28 +db_path = "policyengine_us_data/storage/calibration/policy_data + +.db" + 29 db_uri = f"sqlite:///{db_path}" + 30 time_period = 2024 + 31 + 32 +# Base dataset for geo-stacking: extended CPS with geographic i + +dentifiers + 33 +dataset_path = ( + 34 + "policyengine_us_data/storage/extended_cps_2024.h5" + 35 +) + 36 + + 37 cds_to_calibrate = [ + 38 "101", # Alabama CD-1 + 39 "601", # California CD-1 + ... 
+ 45 print(f"Testing with {len(cds_to_calibrate)} congressional dist + ricts") + 46 + 47 # ============================================================= + =============== + 45 -# STEP 1: LOAD SIMULATION + 48 +# STEP 1: LOAD SIMULATION FROM EXTENDED CPS + 49 # ============================================================= + =============== + 47 -print("\nLoading simulation...") + 48 -sim = Microsimulation() + 50 +print(f"\nLoading simulation from {dataset_path}...") + 51 +sim = Microsimulation(dataset=dataset_path) + 52 n_households = len(sim.calculate("household_id", map_to="househ + old").values) + 53 print(f"Loaded {n_households:,} households") + 54 + +● Update(build_calibration_matrix.py) + ⎿  Added 7 lines, removed 1 line + 63 dataset_path=None, + 64 ) + 65 + 66 +# SNAP targets (stratum_group_id=4) + specific health insurance + + variable + 67 +# Uses OR logic: gets all SNAP targets OR the health insurance + +target + 68 targets_df, X_sparse, household_id_mapping = builder.build_matr + ix( + 67 - sim, target_filter={"stratum_group_ids": [1, 4]} + 69 + sim, + 70 + target_filter={ + 71 + "stratum_group_ids": [4], + 72 + "variables": ["health_insurance_premiums_without_medica + +re_part_b"], + 73 + }, + 74 ) + 75 + 76 print(f"\nMatrix built successfully:") + +● Now let me run it to verify: + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && timeout: + ~/envs/sep/bin/python policyengine_us_data/datasets/cps/local_ 5m 0s + area_calibration/build_calibration_matrix.py 2>…) + ⎿ 2026-01-15 09:27:58,320 - INFO - NumExpr defaulting to 16 threads. 
+ 2026-01-15 09:28:19,765 - INFO - Tracer initialized: 41314 households + x + 5 geographies + … +187 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Now we have: + - 539 targets (SNAP + health insurance only, not all 20 national targets) + - Extended CPS 2024 as the base dataset (41,314 households) + - 4 target groups: + - Group 0: Health Insurance Premiums (1 national target) + - Group 1: State SNAP Household Count (51 targets) + - Group 2: State Snap (51 targets) + - Group 3: District SNAP Household Count (436 targets) + + The entity aggregation "Match: False" is just a floating point precision issue ($4 difference out of $109M). Let me fix that check: + +● Update(build_calibration_matrix.py) + ⎿  Added 1 line, removed 1 line + 130 print(f"\nEntity aggregation verification:") + 131 print(f" Person-level total: ${person_total:,.0f}") + 132 print(f" Household-level total: ${household_total:,.0f}") + 133 - print(f" Match: {abs(person_total - household_total) < 1} + -") + 133 + print(f" Match: {np.isclose(person_total, household_total + +, rtol=1e-6)}") + 134 + 135 else: + 136 print("\nWARNING: No health insurance targets found!") + +● Done! The runner now: + 1. Uses extended_cps_2024.h5 as the base dataset + 2. Includes only SNAP + health insurance targets (539 total) + 3. Creates 4 target groups (down from 23) + +✻ Crunched for 1m 36s + +❯ I guess I never made it clear that the base dataset will be + "stratified_extended_cps_2023.h5" which you perhaps could have figured + out from the local-area* target in the Makefile, but that's okay. Can you + help the builder use that file in a way that it could be run + interactively by a user in the directory that script resides in? + +● Got it! Let me fix the paths to work when running from the script's directory using __file__ to resolve paths. 
+ +● Update(build_calibration_matrix.py) + ⎿  Added 12 lines, removed 15 lines + 3 Generates X_sparse and target vector, prints diagnostics using M + atrixTracer. + 4 """ + 5 import logging + 6 +from pathlib import Path + 7 + 8 logging.basicConfig( + 9 level=logging.INFO, + ... + 13 import numpy as np + 14 import pandas as pd + 15 from policyengine_us import Microsimulation + 15 -from policyengine_us_data.datasets.cps.local_area_calibration.s + -parse_matrix_builder import ( + 16 - SparseMatrixBuilder, + 17 -) + 18 -from policyengine_us_data.datasets.cps.local_area_calibration.m + -atrix_tracer import ( + 19 - MatrixTracer, + 20 -) + 21 -from policyengine_us_data.datasets.cps.local_area_calibration.c + -alibration_utils import ( + 22 - create_target_groups, + 23 -) + 16 +from sparse_matrix_builder import SparseMatrixBuilder + 17 +from matrix_tracer import MatrixTracer + 18 +from calibration_utils import create_target_groups + 19 + 20 # ============================================================= + =============== + 21 # CONFIGURATION + 22 # ============================================================= + =============== + 28 -db_path = "policyengine_us_data/storage/calibration/policy_data + -.db" + 23 +# Resolve paths relative to this script's location + 24 +SCRIPT_DIR = Path(__file__).resolve().parent + 25 +STORAGE_DIR = SCRIPT_DIR.parent.parent.parent / "storage" + 26 + + 27 +db_path = STORAGE_DIR / "calibration" / "policy_data.db" + 28 db_uri = f"sqlite:///{db_path}" + 30 -time_period = 2024 + 29 +time_period = 2023 + 30 + 32 -# Base dataset for geo-stacking: extended CPS with geographic i + -dentifiers + 33 -dataset_path = ( + 34 - "policyengine_us_data/storage/extended_cps_2024.h5" + 35 -) + 31 +# Base dataset for geo-stacking: stratified extended CPS + 32 +dataset_path = STORAGE_DIR / "stratified_extended_cps_2023.h5" + 33 + 34 cds_to_calibrate = [ + 35 "101", # Alabama CD-1 + +● Now let me verify it runs from the script's directory: + +● 
Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1) timeout: 5m 0s + ⎿  Error: Exit code 1 + 2026-01-15 09:34:39,873 - INFO - NumExpr defaulting to 16 threads. + TEST_LITE == False + Testing with 5 congressional districts + + Loading simulation from /home/baogorek/devl/sep/policyengine-us-data/policye + ngine_us_data/storage/stratified_extended_cps_2023.h5... + Traceback (most recent call last): + File "/home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/da + tasets/cps/local_area_calibration/build_calibration_matrix.py", line 48, in + + sim = Microsimulation(dataset=dataset_path) + File "/home/baogorek/envs/sep/lib/python3.13/site-packages/policyengine_us + /system.py", line 220, in __init__ + … +9 lines (ctrl+o to see all) + +● Need to convert the Path to a string for Microsimulation: + +● Update(build_calibration_matrix.py) + ⎿  Added 1 line, removed 1 line + 45 # STEP 1: LOAD SIMULATION FROM EXTENDED CPS + 46 # ============================================================= + =============== + 47 print(f"\nLoading simulation from {dataset_path}...") + 48 -sim = Microsimulation(dataset=dataset_path) + 48 +sim = Microsimulation(dataset=str(dataset_path)) + 49 n_households = len(sim.calculate("household_id", map_to="househ + old").values) + 50 print(f"Loaded {n_households:,} households") + 51 + +● Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1) timeout: 5m 0s + ⎿ 2026-01-15 09:35:11,752 - INFO - NumExpr defaulting to 16 threads. 
+ 2026-01-15 09:35:27,099 - INFO - Tracer initialized: 13507 households + x + 5 geographies + … +187 lines (ctrl+o to expand) + +● Now it works from the script directory: + - Base dataset: stratified_extended_cps_2023.h5 (13,507 households) + - Time period: 2023 + - 539 targets: 1 health insurance + 538 SNAP + - 4 target groups + - Entity aggregation: Match: True + + The script can now be run interactively: + cd policyengine_us_data/datasets/cps/local_area_calibration + python build_calibration_matrix.py + +✻ Brewed for 1m 12s + +❯ Do you really need SCRIPT_DIR = Path(__file__).resolve().parent? in + there? I can't run __FILE__ in ipython? You can import STORAGE_DIR + directly, right? + +● You're right, let me check if STORAGE_DIR is already available to import: + +● Search(pattern: "STORAGE_DIR", path: + "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: + "content") + ⎿  Found 3 lines (ctrl+o to expand) + +● Search(pattern: "storage.*=.*Path", path: + "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: + "content") + ⎿  Found 0 lines + +● Search(pattern: "**/policyengine_us_data/**/constants*.py") + ⎿  Found 0 files + +● Search(pattern: "from policyengine_us_data import", path: + "~/devl/sep/policyengine-us-data/policyengine_us_data", output_mode: + "content", head_limit: 10) + ⎿  Found 0 lines + +● Search(pattern: "policyengine_us_data\.STORAGE", path: + "~/devl/sep/policyengine-us-data", output_mode: "content") + ⎿  Found 0 lines + +● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/__init__.py) + ⎿  Read 3 lines + +● Bash(ls -la /home/baogorek/devl/sep/policyengine-us-data/policyengine_us_data/ + storage/*.py 2>/dev/null | head -5) + ⎿ -rw-rw-r-- 1 baogorek baogorek 780 Jan 6 11:19 + /home/baogorek/devl/se + p/policyengine-us-data/policyengine_us_data/storage/download_private_p + r + erequisites.py + … +7 lines (ctrl+o to expand) + +● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/storage/__init__.py) + ⎿  
Read 6 lines + +● STORAGE_FOLDER is already defined in policyengine_us_data.storage. Let me update the script: + +● Update(build_calibration_matrix.py) + ⎿  Added 3 lines, removed 7 lines + 3 Generates X_sparse and target vector, prints diagnostics using M + atrixTracer. + 4 """ + 5 import logging + 6 -from pathlib import Path + 6 + 7 logging.basicConfig( + 8 level=logging.INFO, + ... + 12 import numpy as np + 13 import pandas as pd + 14 from policyengine_us import Microsimulation + 15 +from policyengine_us_data.storage import STORAGE_FOLDER + 16 from sparse_matrix_builder import SparseMatrixBuilder + 17 from matrix_tracer import MatrixTracer + 18 from calibration_utils import create_target_groups + ... + 20 # ============================================================= + =============== + 21 # CONFIGURATION + 22 # ============================================================= + =============== + 23 -# Resolve paths relative to this script's location + 24 -SCRIPT_DIR = Path(__file__).resolve().parent + 25 -STORAGE_DIR = SCRIPT_DIR.parent.parent.parent / "storage" + 26 - + 27 -db_path = STORAGE_DIR / "calibration" / "policy_data.db" + 23 +db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" + 24 db_uri = f"sqlite:///{db_path}" + 25 time_period = 2023 + 26 + 27 # Base dataset for geo-stacking: stratified extended CPS + 32 -dataset_path = STORAGE_DIR / "stratified_extended_cps_2023.h5" + 28 +dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h + +5" + 29 + 30 cds_to_calibrate = [ + 31 "101", # Alabama CD-1 + +● Bash(~/envs/sep/bin/python build_calibration_matrix.py 2>&1 | head -30) + ⎿ 2026-01-15 09:45:04,625 - INFO - NumExpr defaulting to 16 threads. + 2026-01-15 09:45:19,327 - INFO - Tracer initialized: 13507 households + x + 5 geographies + … +31 lines (ctrl+o to expand) + +● Done. 
Now you can use STORAGE_FOLDER in IPython too: + + from policyengine_us_data.storage import STORAGE_FOLDER + dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5" + +✻ Sautéed for 1m 11s + +❯ Is there an opportunity to create a real test in tests? + +● Search(pattern: "**/tests/**/*.py", path: "~/devl/sep/policyengine-us-data") + ⎿  Found 100 files (ctrl+o to expand) + +● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_database. + py) + ⎿  Read 87 lines + +● Read(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_datasets/ + test_small_enhanced_cps.py) + ⎿  Read 18 lines + +● Yes! I can create a test for the sparse matrix builder. Here's what we could test: + + 1. Person-level aggregation preserves totals + 2. Matrix builds with correct shape + 3. Health insurance target is included correctly + +● Write(~/devl/sep/policyengine-us-data/policyengine_us_data/tests/test_sparse_m + atrix_builder.py) + ⎿  Wrote 61 lines to ../../../tests/test_sparse_matrix_builder.py + 1 import pytest + 2 import numpy as np + 3 from policyengine_us import Microsimulation + 4 from policyengine_us_data.storage import STORAGE_FOLDER + 5 from policyengine_us_data.datasets.cps.local_area_calibration.spa + rse_matrix_builder import ( + 6 SparseMatrixBuilder, + 7 ) + 8 @pytest.fixture + … +51 lines (ctrl+o to expand) + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && timeout: 5m + ~/envs/sep/bin/pytest 0s + policyengine_us_data/tests/test_sparse_matrix_builder.py -v + 2>&1) + ⎿ ============================= test session starts + ===================== + ========= + platform linux -- Python 3.13.6, pytest-8.4.2, pluggy-1.6.0 -- + /home/ba + … +16 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● All 3 tests pass: + + 1. test_person_level_aggregation_preserves_totals - Verifies person→household aggregation preserves totals + 2. 
test_matrix_shape - Verifies matrix has correct dimensions + 3. test_combined_snap_and_health_insurance - Verifies both SNAP and health insurance targets are included + + policyengine_us_data/tests/test_sparse_matrix_builder.py - 3 passed in 12.33s + +✻ Cooked for 1m 1s + +❯ Cool, I'm surprised we didn't have one before. Let's go ahead and update + the changelog_entry.yaml, create an issue, format with black, commit, + push, and create the PR that links to the issue. + +● Search(pattern: "**/changelog_entry.yaml", path: + "~/devl/sep/policyengine-us-data") + ⎿  Found 1 file (ctrl+o to expand) + +● Read(~/devl/sep/policyengine-us-data/changelog_entry.yaml) + ⎿  Read 5 lines + +● Update(~/devl/sep/policyengine-us-data/changelog_entry.yaml) + ⎿  Added 4 lines, removed 3 lines + 1 -- bump: patch + 1 +- bump: minor + 2 changes: + 3 - fixed: + 4 - - Versioning workflow checkout for push events + 3 + added: + 4 + - Support for health_insurance_premiums_without_medicare_par + +t_b in local area calibration + 5 + - Test coverage for sparse matrix builder with person-level + +targets + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git remote -v) + ⎿  origin git@github.com:PolicyEngine/policyengine-us-data.git (fetch) + origin git@github.com:PolicyEngine/policyengine-us-data.git (push) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && gh issue create \ + --repo PolicyEngine/policyengine-us-data \…) + ⎿  https://github.com/PolicyEngine/policyengine-us-data/issues/472 + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && black . -l 79 2>&1) + ⎿ Skipping .ipynb files as Jupyter dependencies are not installed. 
+ You can fix this by running ``pip install "black[jupyter]"`` + reformatted + /home/baogorek/devl/sep/policyengine-us-data/policyengine_u + … +6 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git status) + ⎿  On branch health-insurance-premiums + Changes not staged for commit: + (use "git add ..." to update what will be committed) + … +12 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git add + changelog_entry.yaml policyengine_us_data/datasets/cps/local_area_calibrat + ion/build_calibration_matri…) + ⎿  On branch health-insurance-premiums + Changes to be committed: + (use "git restore --staged ..." to unstage) + … +9 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git commit -m "Add + health insurance premiums to local area calibration…) + ⎿ [health-insurance-premiums 4268676] Add health insurance premiums to + lo + cal area calibration + 3 files changed, 263 insertions(+), 3 deletions(-) + … +4 lines (ctrl+o to expand) + + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd /home/baogorek/devl/sep/policyengine-us-data && git push -u origin + health-insurance-premiums 2>&1) + ⎿ remote: + remote: Create a pull request for 'health-insurance-premiums' on + GitHub + by visiting: + … +13 lines (ctrl+o to expand) + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Bash(cd 
/home/baogorek/devl/sep/policyengine-us-data && gh pr create --title + "Add health insurance premiums to local area calibration" --body "## + Summary…) + ⎿  https://github.com/PolicyEngine/policyengine-us-data/pull/473 + Warning: 1 uncommitted change + ⎿ Shell cwd was reset to /home/baogorek/devl/sep/policyengine-us-data/po + licyengine_us_data/datasets/cps/local_area_calibration + +● Done! Here's the summary: + + - Issue: https://github.com/PolicyEngine/policyengine-us-data/issues/472 + - PR: https://github.com/PolicyEngine/policyengine-us-data/pull/473 + + The PR includes: + - build_calibration_matrix.py - Runner script for building X_sparse with SNAP + health insurance + - test_sparse_matrix_builder.py - 3 tests for person-level aggregation + - changelog_entry.yaml - Minor version bump + +✻ Crunched for 1m 51s + +❯ You've done a lot of work and discovery in this session, can you please + write up a report about what you've learned? This will be useful for your + future self to come back in here and be ready for the next variable, + which might not be quite as easy as this one! + +● Write(ADDING_CALIBRATION_TARGETS.md) + ⎿  Wrote 242 lines to ADDING_CALIBRATION_TARGETS.md + 1 # Adding Calibration Targets to Local Area Geo-Stacking + 2 This document summarizes key learnings from adding `health_insura + nce_premiums_without_medicare_part_b` as a calibration target. Us + e this as a reference when adding future targets. + 3 ## Key Discovery: No Code Changes Needed for Most Targets + 4 The `sparse_matrix_builder.py` is **already entity-agnostic**. Po + licyEngine's `map_to="household"` parameter automatically handles + aggregation from any entity level (person, tax_unit, spm_unit) t + o household level. + 5 ```python + 6 # This line in sparse_matrix_builder.py (line 220-222) handles AL + L entity types: + … +232 lines (ctrl+o to expand) + +● Created ADDING_CALIBRATION_TARGETS.md with: + + 1. 
Key Discovery: sparse_matrix_builder.py is already entity-agnostic via map_to="household" + 2. Architecture Overview: File locations, database schema, stratum groups + 3. Step-by-Step Guide: How to add new targets (5 steps) + 4. When Code Changes ARE Needed: Count variables, state-specific calcs, person-level constraints + 5. Junkyard Reference: Key patterns from the 2,400+ line reference implementation + 6. Matrix Structure: Shape, column indexing, MatrixTracer usage + 7. Common Queries: SQL snippets for exploring the database + + Would you like me to add this report to the PR as well? + +✻ Sautéed for 53s + + + diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py new file mode 100644 index 00000000..f6521e23 --- /dev/null +++ b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py @@ -0,0 +1,237 @@ +""" +Fit calibration weights using L0-regularized optimization. +Prototype script for weight calibration using the l0-python package. 
+"""
+
+import argparse
+import logging
+from datetime import datetime
+from pathlib import Path
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+parser = argparse.ArgumentParser(description="Fit calibration weights")
+parser.add_argument(
+    "--device", default="cpu", choices=["cpu", "cuda"],
+    help="Device for training (cpu or cuda)"
+)
+parser.add_argument(
+    "--epochs", type=int, default=100,
+    help="Total epochs for training"
+)
+parser.add_argument(
+    "--db-path", default=None,
+    help="Path to policy_data.db (default: STORAGE_FOLDER/calibration/policy_data.db)"
+)
+parser.add_argument(
+    "--dataset-path", default=None,
+    help="Path to stratified CPS h5 file"
+)
+args = parser.parse_args()  # NOTE(review): parsed before the heavy imports, presumably so --help stays fast
+
+import numpy as np
+from policyengine_us import Microsimulation
+from policyengine_us_data.storage import STORAGE_FOLDER
+from sparse_matrix_builder import SparseMatrixBuilder  # sibling module: needs this script's directory on sys.path
+from calibration_utils import get_all_cds_from_database  # sibling module, same requirement
+
+try:
+    from l0.calibration import SparseCalibrationWeights
+except ImportError as err:
+    raise ImportError(
+        "l0-python is required for weight fitting. "
+        "Install with: pip install policyengine-us-data[l0]"
+    ) from err
+
+# ============================================================================
+# CONFIGURATION
+# ============================================================================
+DEVICE = args.device
+TOTAL_EPOCHS = args.epochs
+EPOCHS_PER_CHUNK = 50  # sparsity is reported after every chunk of this many epochs
+
+# Hyperparameters forwarded to SparseCalibrationWeights(...) and model.fit(...)
+BETA = 0.35
+GAMMA = -0.1
+ZETA = 1.1
+INIT_KEEP_PROB = 0.999
+LOG_WEIGHT_JITTER_SD = 0.05
+LOG_ALPHA_JITTER_SD = 0.01
+LAMBDA_L0 = 1e-8
+LAMBDA_L2 = 1e-8
+LEARNING_RATE = 0.15
+
+# Data paths: command-line values win, otherwise fall back to STORAGE_FOLDER
+if args.db_path:
+    db_path = Path(args.db_path)
+else:
+    db_path = STORAGE_FOLDER / "calibration" / "policy_data.db"
+db_uri = f"sqlite:///{db_path}"
+
+if args.dataset_path:
+    dataset_path = Path(args.dataset_path)
+else:
+    dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5"
+
+output_dir = STORAGE_FOLDER / "calibration"
+output_dir.mkdir(parents=True, exist_ok=True)
+time_period = 2023
+
+# Get all CDs from database
+cds_to_calibrate = get_all_cds_from_database(db_uri)
+print(f"Found {len(cds_to_calibrate)} congressional districts")
+
+# ============================================================================
+# STEP 1: BUILD CALIBRATION MATRIX
+# ============================================================================
+print(f"Loading simulation from {dataset_path}...")
+sim = Microsimulation(dataset=str(dataset_path))
+n_households = len(sim.calculate("household_id", map_to="household").values)
+print(f"Loaded {n_households:,} households")
+
+print("\nBuilding sparse matrix...")
+builder = SparseMatrixBuilder(
+    db_uri=db_uri,
+    time_period=time_period,
+    cds_to_calibrate=cds_to_calibrate,
+    dataset_path=str(dataset_path),
+)
+
+targets_df, X_sparse, household_id_mapping = builder.build_matrix(
+    sim,
+    target_filter={
+        "stratum_group_ids": [4],
+        "variables": ["health_insurance_premiums_without_medicare_part_b"],
+    },
+)
+
+print(f"Matrix shape: {X_sparse.shape}")
+print(f"Targets: {len(targets_df)}")
+
+# Filter to achievable targets (rows with non-zero data)
+row_sums = np.array(X_sparse.sum(axis=1)).flatten()
+achievable_mask = row_sums > 0
+n_achievable = achievable_mask.sum()
+n_impossible = (~achievable_mask).sum()
+
+print(f"\nAchievable targets: {n_achievable}")
+print(f"Impossible targets (filtered out): {n_impossible}")
+
+targets_df = targets_df[achievable_mask].reset_index(drop=True)
+X_sparse = X_sparse[achievable_mask, :]  # same mask on both, so rows stay aligned
+
+print(f"Filtered matrix shape: {X_sparse.shape}")
+
+# Extract target vector and names (names are only used for the report below)
+targets = targets_df["value"].values
+target_names = [
+    f"{row['geographic_id']}/{row['variable']}"
+    for _, row in targets_df.iterrows()
+]
+
+# ============================================================================
+# STEP 2: INITIALIZE WEIGHTS
+# ============================================================================
+initial_weights = np.ones(X_sparse.shape[1]) * 100  # one starting weight of 100 per matrix column
+print(f"\nInitial weights shape: {initial_weights.shape}")
+print(f"Initial weights sum: {initial_weights.sum():,.0f}")
+
+# ============================================================================
+# STEP 3: CREATE MODEL
+# ============================================================================
+print("\nCreating SparseCalibrationWeights model...")
+model = SparseCalibrationWeights(
+    n_features=X_sparse.shape[1],
+    beta=BETA,
+    gamma=GAMMA,
+    zeta=ZETA,
+    init_keep_prob=INIT_KEEP_PROB,
+    init_weights=initial_weights,
+    log_weight_jitter_sd=LOG_WEIGHT_JITTER_SD,
+    log_alpha_jitter_sd=LOG_ALPHA_JITTER_SD,
+    device=DEVICE,
+)
+
+# ============================================================================
+# STEP 4: TRAIN IN CHUNKS
+# ============================================================================
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # taken before training; names the output file
+sparsity_log = []  # per-chunk sparsity stats; kept in memory only, never written out
+
+for chunk_start in range(0, TOTAL_EPOCHS, EPOCHS_PER_CHUNK):
+    chunk_epochs = min(EPOCHS_PER_CHUNK, TOTAL_EPOCHS - chunk_start)  # last chunk may be shorter
+    current_epoch = chunk_start + chunk_epochs
+
+    print(f"\nTraining epochs {chunk_start + 1} to {current_epoch}...")
+
+    model.fit(
+        M=X_sparse,
+        y=targets,
+        target_groups=None,
+        lambda_l0=LAMBDA_L0,
+        lambda_l2=LAMBDA_L2,
+        lr=LEARNING_RATE,
+        epochs=chunk_epochs,
+        loss_type="relative",
+        verbose=True,
+        verbose_freq=chunk_epochs,
+    )
+
+    active_info = model.get_active_weights()
+    active_count = active_info["count"]
+    total_count = X_sparse.shape[1]
+    sparsity_pct = 100 * (1 - active_count / total_count)
+
+    sparsity_log.append(
+        {
+            "epoch": current_epoch,
+            "active_weights": active_count,
+            "total_weights": total_count,
+            "sparsity_pct": sparsity_pct,
+        }
+    )
+
+    print(f" Active weights: {active_count:,} / {total_count:,}")
+    print(f" Sparsity: {sparsity_pct:.2f}%")
+
+# ============================================================================
+# STEP 5: EXTRACT AND SAVE WEIGHTS
+# ============================================================================
+import torch  # imported late: only torch.no_grad() is needed, from here on
+
+with torch.no_grad():
+    w = model.get_weights(deterministic=True).cpu().numpy()
+
+print(f"\nFinal weights shape: {w.shape}")
+print(f"Final weights sum: {w.sum():,.0f}")
+print(f"Non-zero weights: {(w > 0).sum():,}")
+
+output_path = output_dir / f"calibration_weights_{timestamp}.npy"
+np.save(output_path, w)
+print(f"\nWeights saved to: {output_path}")
+print(f"OUTPUT_PATH:{output_path}")  # marker line: modal_app/fit_weights.py parses it from stdout
+
+# ============================================================================
+# STEP 6: VERIFY PREDICTIONS
+# ============================================================================
+print("\n" + "=" * 60)
+print("PREDICTION VERIFICATION")
+print("=" * 60)
+
+with torch.no_grad():
+    predictions = model.predict(X_sparse).cpu().numpy()
+
+for name, pred, target in zip(target_names, predictions, targets):
+    # Relative error is undefined for a zero target: show NaN, not inf.
+    rel_error = (pred - target) / target * 100 if target else float("nan")
+    print(
+        f"{name[:50]:50} | "
+        f"pred: {pred:>12,.0f} | target: {target:>12,.0f} | "
+        f"err: {rel_error:>6.2f}%"
+    )
+
+print("\n" + "=" * 60)
+print("FITTING COMPLETED")
+print("=" * 60)
diff --git a/policyengine_us_data/tests/test_sparse_matrix_builder.py b/policyengine_us_data/tests/test_sparse_matrix_builder.py
new file mode 100644
index 00000000..2d08ea3f
--- /dev/null
+++ b/policyengine_us_data/tests/test_sparse_matrix_builder.py
@@ -0,0 +1,76 @@
+"""Tests for SparseMatrixBuilder with person-level calibration targets."""
+
+import pytest
+import numpy as np
+from policyengine_us import Microsimulation
+from policyengine_us_data.storage import STORAGE_FOLDER
+from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
+    SparseMatrixBuilder,
+)
+
+TIME_PERIOD = 2023
+# Congressional districts the builder fixture is restricted to: AL-1, CA-1.
+CDS_TO_CALIBRATE = ["101", "601"]
+HEALTH_INSURANCE_VAR = "health_insurance_premiums_without_medicare_part_b"
+
+
+@pytest.fixture
+def sim():
+    """Return a Microsimulation loaded from the stratified extended CPS."""
+    dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2023.h5"
+    return Microsimulation(dataset=str(dataset_path))
+
+
+@pytest.fixture
+def builder():
+    """Return a SparseMatrixBuilder restricted to CDS_TO_CALIBRATE."""
+    db_path = STORAGE_FOLDER / "calibration" / "policy_data.db"
+    return SparseMatrixBuilder(
+        db_uri=f"sqlite:///{db_path}",
+        time_period=TIME_PERIOD,
+        cds_to_calibrate=list(CDS_TO_CALIBRATE),
+        dataset_path=None,
+    )
+
+
+def test_person_level_aggregation_preserves_totals(sim):
+    """Health insurance premiums (person-level) should sum correctly to household."""
+    person_total = sim.calculate(
+        HEALTH_INSURANCE_VAR, TIME_PERIOD, map_to="person"
+    ).values.sum()
+    household_total = sim.calculate(
+        HEALTH_INSURANCE_VAR, TIME_PERIOD, map_to="household"
+    ).values.sum()
+    assert np.isclose(person_total, household_total, rtol=1e-6)
+
+
+def test_matrix_shape(sim, builder):
+    """Matrix should have (n_targets, n_households * n_cds) shape."""
+    targets_df, X_sparse, _ = builder.build_matrix(
+        sim,
+        target_filter={"variables": [HEALTH_INSURANCE_VAR]},
+    )
+    n_households = len(
+        sim.calculate("household_id", map_to="household").values
+    )
+    # Derived from the fixture's district list so the two cannot drift apart.
+    n_cds = len(CDS_TO_CALIBRATE)
+    # Exactly one target row is expected for this single-variable filter.
+    assert X_sparse.shape == (1, n_households * n_cds)
+    # Every matrix row must correspond to one row of targets_df.
+    assert len(targets_df) == X_sparse.shape[0]
+
+
+def test_combined_snap_and_health_insurance(sim, builder):
+    """Matrix should include both SNAP and health insurance targets."""
+    targets_df, _, _ = builder.build_matrix(
+        sim,
+        target_filter={
+            "stratum_group_ids": [4],
+            "variables": [HEALTH_INSURANCE_VAR],
+        },
+    )
+    variables = targets_df["variable"].unique()
+    assert "snap" in variables
+    assert "household_count" in variables
+    assert HEALTH_INSURANCE_VAR in variables
diff --git a/pyproject.toml b/pyproject.toml
index 0bf25a16..ec9ddb9e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,9 @@ dependencies = [
 calibration = [
     "samplics",
 ]
+l0 = [
+    "l0-python",
+]
 
 [dependency-groups]
 dev = [