Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ node_modules
!age_state.csv
!agi_state.csv
!soi_targets.csv
!policyengine_us_data/datasets/cps/long_term/social_security_aux.csv
!policyengine_us_data/datasets/cps/long_term/SSPopJul_TR2024.csv
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ which installs the development dependencies in a reference-only manner (so that
to the package code will be reflected immediately); `policyengine-us-data` is a dev package
and not intended for direct access.

## SSA Data Sources

The following SSA data sources are used in this project:

- [Latest Trustees Report (2025)](https://www.ssa.gov/oact/TR/2025/index.html) - Source for `social_security_aux.csv` (extracted via `extract_ssa_costs.py`)
- [Single Year Supplementary Tables (2025)](https://www.ssa.gov/oact/tr/2025/lrIndex.html) - Long-range demographic and economic projections
- [Single Year Age Demographic Projections (2024 - latest published)](https://www.ssa.gov/oact/HistEst/Population/2024/Population2024.html) - Source for `SSPopJul_TR2024.csv` population data

## Building the Paper

### Prerequisites
Expand Down
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- Additional calibration based on SSA Trustees data that extends projections until 2100
369 changes: 369 additions & 0 deletions policyengine_us_data/datasets/cps/long_term/README.md

Large diffs are not rendered by default.

16,161 changes: 16,161 additions & 0 deletions policyengine_us_data/datasets/cps/long_term/SSPopJul_TR2024.csv

Large diffs are not rendered by default.

181 changes: 181 additions & 0 deletions policyengine_us_data/datasets/cps/long_term/calibration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import numpy as np
import pandas as pd


def iterative_proportional_fitting(
    X, y, w_initial, max_iters=100, tol=1e-6, verbose=True
):
    """
    Fast iterative proportional fitting (raking) for reweighting.

    Each iteration multiplies every household weight by the weighted
    geometric mean of the per-feature adjustment factors (target / current
    prediction), using the household's feature shares as exponents.

    Args:
        X: Design matrix (n_households x n_features)
        y: Target vector (n_features,)
        w_initial: Initial weights (n_households,)
        max_iters: Maximum iterations
        tol: Convergence tolerance on the max relative weight change
        verbose: Print progress

    Returns:
        w_new: New weights (n_households,)
        info: Dictionary with convergence info
    """
    w = w_initial.copy()

    # Loop-invariant exponents: each household's feature shares. Households
    # with an all-zero row get all-zero exponents, so their weight is left
    # untouched (factor ** 0 == 1, and numpy evaluates 0.0 ** 0.0 as 1.0),
    # matching the behavior of skipping them entirely.
    X = np.asarray(X, dtype=float)
    row_totals = X.sum(axis=1)
    has_features = row_totals > 0
    exponents = np.zeros_like(X)
    exponents[has_features] = X[has_features] / row_totals[has_features, None]

    iter_num = -1  # remains -1 when max_iters <= 0
    for iter_num in range(max_iters):
        predictions = X.T @ w

        # Per-feature multiplicative correction; epsilon guards a zero
        # prediction from dividing by zero.
        adjustment_factors = y / (predictions + 1e-10)

        # Vectorized weighted geometric mean over the household's features
        # (replaces a per-household Python loop, same result).
        w_new = w * np.prod(adjustment_factors[None, :] ** exponents, axis=1)

        rel_change = np.abs(w_new - w).max() / (np.abs(w).max() + 1e-10)
        w = w_new

        if verbose and (iter_num % 10 == 0 or rel_change < tol):
            predictions_new = X.T @ w
            rel_errors = np.abs(predictions_new - y) / y
            max_rel_error = rel_errors.max()
            print(
                f"Iteration {iter_num:3d}: Max relative error = {max_rel_error:.6f}, Weight change = {rel_change:.6e}"
            )

        if rel_change < tol:
            if verbose:
                print(f"Converged in {iter_num + 1} iterations")
            break

    predictions_final = X.T @ w
    predictions_initial = X.T @ w_initial

    info = {
        "success": True,
        "iterations": iter_num + 1,
        "predictions_initial": predictions_initial,
        "predictions_new": predictions_final,
        "relative_errors_initial": (predictions_initial - y) / y,
        "relative_errors_new": (predictions_final - y) / y,
        "weight_ratio": w / w_initial,
    }

    return w, info


def calibrate_greg(
    calibrator,
    X,
    y_target,
    baseline_weights,
    ss_values=None,
    ss_target=None,
    n_ages=86,
):
    """
    Calibrate weights using GREG method via samplics.

    Args:
        calibrator: SampleWeight instance from samplics
        X: Design matrix (n_households x n_ages)
        y_target: Target age distribution
        baseline_weights: Initial household weights
        ss_values: Optional Social Security values per household
        ss_target: Optional Social Security target total
        n_ages: Number of age groups

    Returns:
        w_new: Calibrated weights
        iterations: Number of iterations (always 1 for GREG)
    """
    # One control total per single-year age group.
    controls = {f"age_{i}": y_target[i] for i in range(n_ages)}

    include_ss = ss_values is not None and ss_target is not None
    if include_ss:
        # Augment the age design matrix with a Social Security column so
        # the calibrated weights also hit the aggregate SS control total.
        aux_vars = pd.DataFrame({f"age_{i}": X[:, i] for i in range(n_ages)})
        aux_vars["ss_total"] = ss_values
        controls["ss_total"] = ss_target
    else:
        aux_vars = X

    calibrated = calibrator.calibrate(
        samp_weight=baseline_weights,
        aux_vars=aux_vars,
        control=controls,
    )

    # GREG is a one-shot (non-iterative) adjustment.
    return calibrated, 1


def calibrate_weights(
    X,
    y_target,
    baseline_weights,
    method="ipf",
    calibrator=None,
    ss_values=None,
    ss_target=None,
    n_ages=86,
    max_iters=100,
    tol=1e-6,
    verbose=False,
):
    """
    Unified interface for weight calibration.

    Args:
        X: Design matrix (n_households x n_features)
        y_target: Target vector
        baseline_weights: Initial weights
        method: 'ipf' or 'greg'
        calibrator: Required if method='greg'
        ss_values: Optional SS values (for GREG with SS)
        ss_target: Optional SS target (for GREG with SS)
        n_ages: Number of age groups
        max_iters: Max iterations for IPF
        tol: Convergence tolerance for IPF
        verbose: Print progress

    Returns:
        w_new: Calibrated weights
        iterations: Number of iterations

    Raises:
        ValueError: If method='greg' and no calibrator is supplied.
    """
    if method == "greg":
        if calibrator is None:
            raise ValueError("calibrator required for GREG method")
        try:
            return calibrate_greg(
                calibrator,
                X,
                y_target,
                baseline_weights,
                ss_values,
                ss_target,
                n_ages,
            )
        except Exception as e:
            # Deliberately broad: samplics can fail on degenerate inputs;
            # fall through to IPF rather than aborting the calibration run.
            if verbose:
                print(f"GREG failed: {e}, falling back to IPF")

    # Single IPF path shared by method='ipf' and the GREG fallback
    # (previously duplicated in both branches).
    w_new, info = iterative_proportional_fitting(
        X, y_target, baseline_weights, max_iters, tol, verbose
    )
    return w_new, info["iterations"]
40 changes: 40 additions & 0 deletions policyengine_us_data/datasets/cps/long_term/extract_ssa_costs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Extract projected OASDI program costs from the SSA 2025 Trustees Report.

Reads table VI.G9 of the single-year supplementary workbook and builds a
``{year: cost}`` mapping of OASDI costs (billions of dollars), printing
diagnostics along the way.
"""
import pandas as pd
import numpy as np

# Read the file
df = pd.read_excel(
    "SingleYearTRTables_TR2025.xlsx", sheet_name="VI.G9", header=None
)

print("DataFrame shape:", df.shape)
print("\nChecking data types around row 66-70:")
for row_idx in range(66, 71):
    year_val = df.iloc[row_idx, 0]
    cost_val = df.iloc[row_idx, 2]
    print(
        f"Row {row_idx}: Year={year_val} (type: {type(year_val)}), Cost={cost_val} (type: {type(cost_val)})"
    )

# Extract OASDI costs more carefully
oasdi_costs_2025_dollars = {}
for row_idx in range(66, min(142, len(df))):
    year_val = df.iloc[row_idx, 0]
    cost_val = df.iloc[row_idx, 2]

    # Skip blank/NaN rows without aborting the scan.
    if pd.isna(year_val) or pd.isna(cost_val):
        continue

    try:
        year = int(year_val)
        cost = float(cost_val)
        oasdi_costs_2025_dollars[year] = cost
        if year <= 2030:
            print(f"Extracted: {year} -> ${cost}B")
    except Exception as e:
        # A non-numeric cell marks the end of the data region.
        print(f"Error at row {row_idx}: {e}")
        break

print(f"\nTotal years extracted: {len(oasdi_costs_2025_dollars)}")

# Show the dictionary
print("\nFirst 10 years:")
for year in sorted(oasdi_costs_2025_dollars)[:10]:
    print(f" {year}: ${oasdi_costs_2025_dollars[year]:.1f}B")
Loading
Loading