Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ node_modules
!age_state.csv
!agi_state.csv
!soi_targets.csv
!policyengine_us_data/datasets/cps/long_term/social_security_aux.csv
!policyengine_us_data/datasets/cps/long_term/SSPopJul_TR2024.csv
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ which installs the development dependencies in a reference-only manner (so that
to the package code will be reflected immediately); `policyengine-us-data` is a dev package
and not intended for direct access.

## SSA Data Sources

The following SSA data sources are used in this project:

- [Latest Trustees Report (2025)](https://www.ssa.gov/oact/TR/2025/index.html) - Source for `social_security_aux.csv` (extracted via `extract_ssa_costs.py`)
- [Single Year Supplementary Tables (2025)](https://www.ssa.gov/oact/tr/2025/lrIndex.html) - Long-range demographic and economic projections
- [Single Year Age Demographic Projections (2024 - latest published)](https://www.ssa.gov/oact/HistEst/Population/2024/Population2024.html) - Source for `SSPopJul_TR2024.csv` population data

## Building the Paper

### Prerequisites
Expand Down
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- Additional calibration based on SSA Trustees data that extends projections until 2100
369 changes: 369 additions & 0 deletions policyengine_us_data/datasets/cps/long_term/README.md

Large diffs are not rendered by default.

16,161 changes: 16,161 additions & 0 deletions policyengine_us_data/datasets/cps/long_term/SSPopJul_TR2024.csv

Large diffs are not rendered by default.

181 changes: 181 additions & 0 deletions policyengine_us_data/datasets/cps/long_term/calibration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import numpy as np
import pandas as pd


def iterative_proportional_fitting(
    X, y, w_initial, max_iters=100, tol=1e-6, verbose=True
):
    """
    Fast iterative proportional fitting (raking) for reweighting.

    Each iteration multiplies every household weight by the weighted
    geometric mean of the per-feature adjustment factors (target / current
    prediction), using the household's feature shares as exponents.

    Args:
        X: Design matrix (n_households x n_features)
        y: Target vector (n_features,)
        w_initial: Initial weights (n_households,)
        max_iters: Maximum iterations
        tol: Convergence tolerance on the max relative weight change
        verbose: Print progress

    Returns:
        w_new: New weights (n_households,)
        info: Dictionary with convergence info
    """
    w = w_initial.copy()

    # Loop-invariant exponents: each household's feature shares. Households
    # with an all-zero row get all-zero exponents, so their weight is left
    # untouched (factor ** 0 == 1, and numpy evaluates 0.0 ** 0.0 as 1.0),
    # matching the behavior of skipping them entirely.
    X = np.asarray(X, dtype=float)
    row_totals = X.sum(axis=1)
    has_features = row_totals > 0
    exponents = np.zeros_like(X)
    exponents[has_features] = X[has_features] / row_totals[has_features, None]

    iter_num = -1  # remains -1 when max_iters <= 0
    for iter_num in range(max_iters):
        predictions = X.T @ w

        # Per-feature multiplicative correction; epsilon guards a zero
        # prediction from dividing by zero.
        adjustment_factors = y / (predictions + 1e-10)

        # Vectorized weighted geometric mean over the household's features
        # (replaces a per-household Python loop, same result).
        w_new = w * np.prod(adjustment_factors[None, :] ** exponents, axis=1)

        rel_change = np.abs(w_new - w).max() / (np.abs(w).max() + 1e-10)
        w = w_new

        if verbose and (iter_num % 10 == 0 or rel_change < tol):
            predictions_new = X.T @ w
            rel_errors = np.abs(predictions_new - y) / y
            max_rel_error = rel_errors.max()
            print(
                f"Iteration {iter_num:3d}: Max relative error = {max_rel_error:.6f}, Weight change = {rel_change:.6e}"
            )

        if rel_change < tol:
            if verbose:
                print(f"Converged in {iter_num + 1} iterations")
            break

    predictions_final = X.T @ w
    predictions_initial = X.T @ w_initial

    info = {
        "success": True,
        "iterations": iter_num + 1,
        "predictions_initial": predictions_initial,
        "predictions_new": predictions_final,
        "relative_errors_initial": (predictions_initial - y) / y,
        "relative_errors_new": (predictions_final - y) / y,
        "weight_ratio": w / w_initial,
    }

    return w, info


def calibrate_greg(
    calibrator,
    X,
    y_target,
    baseline_weights,
    ss_values=None,
    ss_target=None,
    n_ages=86,
):
    """
    Calibrate weights using GREG method via samplics.

    Args:
        calibrator: SampleWeight instance from samplics
        X: Design matrix (n_households x n_ages)
        y_target: Target age distribution
        baseline_weights: Initial household weights
        ss_values: Optional Social Security values per household
        ss_target: Optional Social Security target total
        n_ages: Number of age groups

    Returns:
        w_new: Calibrated weights
        iterations: Number of iterations (always 1 for GREG)
    """
    # One control total per single-year age group.
    controls = {f"age_{i}": y_target[i] for i in range(n_ages)}

    include_ss = ss_values is not None and ss_target is not None
    if include_ss:
        # Augment the age design matrix with a Social Security column so
        # the calibrated weights also hit the aggregate SS control total.
        aux_vars = pd.DataFrame({f"age_{i}": X[:, i] for i in range(n_ages)})
        aux_vars["ss_total"] = ss_values
        controls["ss_total"] = ss_target
    else:
        aux_vars = X

    calibrated = calibrator.calibrate(
        samp_weight=baseline_weights,
        aux_vars=aux_vars,
        control=controls,
    )

    # GREG is a one-shot (non-iterative) adjustment.
    return calibrated, 1


def calibrate_weights(
    X,
    y_target,
    baseline_weights,
    method="ipf",
    calibrator=None,
    ss_values=None,
    ss_target=None,
    n_ages=86,
    max_iters=100,
    tol=1e-6,
    verbose=False,
):
    """
    Unified interface for weight calibration.

    Args:
        X: Design matrix (n_households x n_features)
        y_target: Target vector
        baseline_weights: Initial weights
        method: 'ipf' or 'greg'
        calibrator: Required if method='greg'
        ss_values: Optional SS values (for GREG with SS)
        ss_target: Optional SS target (for GREG with SS)
        n_ages: Number of age groups
        max_iters: Max iterations for IPF
        tol: Convergence tolerance for IPF
        verbose: Print progress

    Returns:
        w_new: Calibrated weights
        iterations: Number of iterations

    Raises:
        ValueError: If method='greg' and no calibrator is supplied.
    """
    if method == "greg":
        if calibrator is None:
            raise ValueError("calibrator required for GREG method")
        try:
            return calibrate_greg(
                calibrator,
                X,
                y_target,
                baseline_weights,
                ss_values,
                ss_target,
                n_ages,
            )
        except Exception as e:
            # Deliberately broad: samplics can fail on degenerate inputs;
            # fall through to IPF rather than aborting the calibration run.
            if verbose:
                print(f"GREG failed: {e}, falling back to IPF")

    # Single IPF path shared by method='ipf' and the GREG fallback
    # (previously duplicated in both branches).
    w_new, info = iterative_proportional_fitting(
        X, y_target, baseline_weights, max_iters, tol, verbose
    )
    return w_new, info["iterations"]
40 changes: 40 additions & 0 deletions policyengine_us_data/datasets/cps/long_term/extract_ssa_costs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Extract projected OASDI program costs from the SSA 2025 Trustees Report.

Reads table VI.G9 of the single-year supplementary workbook and builds a
``{year: cost}`` mapping of OASDI costs (billions of dollars), printing
diagnostics along the way.
"""
import pandas as pd
import numpy as np

# Read the file
df = pd.read_excel(
    "SingleYearTRTables_TR2025.xlsx", sheet_name="VI.G9", header=None
)

print("DataFrame shape:", df.shape)
print("\nChecking data types around row 66-70:")
for row_idx in range(66, 71):
    year_val = df.iloc[row_idx, 0]
    cost_val = df.iloc[row_idx, 2]
    print(
        f"Row {row_idx}: Year={year_val} (type: {type(year_val)}), Cost={cost_val} (type: {type(cost_val)})"
    )

# Extract OASDI costs more carefully
oasdi_costs_2025_dollars = {}
for row_idx in range(66, min(142, len(df))):
    year_val = df.iloc[row_idx, 0]
    cost_val = df.iloc[row_idx, 2]

    # Skip blank/NaN rows without aborting the scan.
    if pd.isna(year_val) or pd.isna(cost_val):
        continue

    try:
        year = int(year_val)
        cost = float(cost_val)
        oasdi_costs_2025_dollars[year] = cost
        if year <= 2030:
            print(f"Extracted: {year} -> ${cost}B")
    except Exception as e:
        # A non-numeric cell marks the end of the data region.
        print(f"Error at row {row_idx}: {e}")
        break

print(f"\nTotal years extracted: {len(oasdi_costs_2025_dollars)}")

# Show the dictionary
print("\nFirst 10 years:")
for year in sorted(oasdi_costs_2025_dollars)[:10]:
    print(f" {year}: ${oasdi_costs_2025_dollars[year]:.1f}B")
Loading
Loading