Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
- bump: minor
  changes:
    added:
      - SPM threshold calculation using policyengine/spm-calculator package
      - New utility module (policyengine_us_data/utils/spm.py) for SPM calculations
    changed:
      - CPS datasets now calculate SPM thresholds using spm-calculator with Census-provided geographic adjustments
      - ACS datasets now calculate SPM thresholds using spm-calculator with national-level thresholds
51 changes: 48 additions & 3 deletions policyengine_us_data/datasets/acs/acs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def generate(self) -> None:
self.add_id_variables(acs, person, household)
self.add_person_variables(acs, person, household)
self.add_household_variables(acs, household)
self.add_spm_variables(acs, person, household, self.time_period)

acs.close()
raw_data.close()
Expand Down Expand Up @@ -93,9 +94,53 @@ def add_person_variables(
)

@staticmethod
def add_spm_variables(acs: h5py.File, spm_unit: DataFrame) -> None:
acs["spm_unit_net_income_reported"] = spm_unit.SPM_RESOURCES
acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD
def add_spm_variables(
    acs: h5py.File,
    person: DataFrame,
    household: DataFrame,
    time_period: int,
) -> None:
    """Compute a national SPM threshold per household and store it in ``acs``.

    Each household is treated as one SPM unit. Its members are split into
    adults (``AGEP`` >= 18) and children (``AGEP`` < 18), its ``TEN`` value
    is mapped to an SPM tenure code, and the resulting threshold array is
    written to ``acs["spm_unit_spm_threshold"]`` in the row order of
    ``household``.

    Args:
        acs: Output file receiving the ``spm_unit_spm_threshold`` array.
        person: Person-level table; must provide ``AGEP`` and
            ``household_id``.
        household: Household-level table; must provide ``household_id``
            and ``TEN``.
        time_period: Year passed through to the threshold calculation.
    """
    from policyengine_us_data.utils.spm import (
        calculate_spm_thresholds_national,
        map_tenure_acs_to_spm,
    )

    # Flag every person as adult or child on a copy so the caller's frame
    # is left untouched.
    flagged_people = person.copy()
    flagged_people["is_adult"] = flagged_people["AGEP"] >= 18
    flagged_people["is_child"] = flagged_people["AGEP"] < 18

    # Summing the boolean flags yields head counts per household.
    counts_by_household = flagged_people.groupby("household_id").agg(
        num_adults=("is_adult", "sum"),
        num_children=("is_child", "sum"),
    )

    # Re-order the counts so they line up row-for-row with ``household``.
    households_by_id = household.set_index("household_id")
    aligned_counts = counts_by_household.loc[households_by_id.index]

    # ACS has no Census-provided geographic adjustment, so national-level
    # thresholds are used.
    acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_national(
        num_adults=aligned_counts["num_adults"].values,
        num_children=aligned_counts["num_children"].values,
        tenure_codes=map_tenure_acs_to_spm(households_by_id["TEN"].values),
        year=time_period,
    )

@staticmethod
def add_household_variables(acs: h5py.File, household: DataFrame) -> None:
Expand Down
23 changes: 20 additions & 3 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def generate(self):
undocumented_students_target=0.21 * 1.9e6,
)
logging.info("Adding family variables")
add_spm_variables(cps, spm_unit)
add_spm_variables(cps, spm_unit, self.time_period)
logging.info("Adding household variables")
add_household_variables(cps, household)
logging.info("Adding rent")
Expand Down Expand Up @@ -602,7 +602,15 @@ def add_personal_income_variables(
cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob


def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None:
def add_spm_variables(
cps: h5py.File,
spm_unit: DataFrame,
time_period: int,
) -> None:
from policyengine_us_data.utils.spm import (
calculate_spm_thresholds_with_geoadj,
)

SPM_RENAMES = dict(
spm_unit_total_income_reported="SPM_TOTVAL",
snap_reported="SPM_SNAPSUB",
Expand All @@ -616,7 +624,6 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None:
# State tax includes refundable credits.
spm_unit_state_tax_reported="SPM_STTAX",
spm_unit_capped_work_childcare_expenses="SPM_CAPWKCCXPNS",
spm_unit_spm_threshold="SPM_POVTHRESHOLD",
spm_unit_net_income_reported="SPM_RESOURCES",
spm_unit_pre_subsidy_childcare_expenses="SPM_CHILDCAREXPNS",
)
Expand All @@ -625,6 +632,16 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None:
if asec_variable in spm_unit.columns:
cps[openfisca_variable] = spm_unit[asec_variable]

# Calculate SPM thresholds using spm-calculator with Census-provided
# geographic adjustment factors (SPM_GEOADJ)
cps["spm_unit_spm_threshold"] = calculate_spm_thresholds_with_geoadj(
num_adults=spm_unit["SPM_NUMADULTS"].values,
num_children=spm_unit["SPM_NUMKIDS"].values,
tenure_codes=spm_unit["SPM_TENMORTSTATUS"].values,
geoadj=spm_unit["SPM_GEOADJ"].values,
year=time_period,
)

cps["reduced_price_school_meals_reported"] = (
cps["free_school_meals_reported"] * 0
)
Expand Down
122 changes: 122 additions & 0 deletions policyengine_us_data/utils/spm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""SPM threshold calculation utilities using the spm-calculator package."""

import numpy as np
from spm_calculator import SPMCalculator, spm_equivalence_scale


# Lookup from Census CPS SPM_TENMORTSTATUS integer codes to the tenure
# labels used by spm-calculator. Code meanings are based on the IPUMS
# SPMMORT documentation.
TENURE_CODE_MAP = dict(
    [
        (1, "owner_with_mortgage"),  # 1 = Owner with mortgage
        (2, "owner_without_mortgage"),  # 2 = Owner without mortgage
        (3, "renter"),  # 3 = Renter
    ]
)


def calculate_spm_thresholds_with_geoadj(
    num_adults: np.ndarray,
    num_children: np.ndarray,
    tenure_codes: np.ndarray,
    geoadj: np.ndarray,
    year: int,
) -> np.ndarray:
    """
    Calculate SPM thresholds using Census-provided geographic adjustments.

    Each threshold is ``base * equivalence_scale * geoadj``, where ``base``
    is the spm-calculator base threshold for the unit's tenure and
    ``equivalence_scale`` comes from ``spm_equivalence_scale``. Using the
    SPM_GEOADJ values already computed by the Census Bureau avoids the
    need for a Census API key.

    Args:
        num_adults: Array of number of adults (18+) in each SPM unit.
        num_children: Array of number of children (<18) in each SPM unit.
        tenure_codes: Array of Census tenure/mortgage status codes.
            1 = owner with mortgage, 2 = owner without mortgage, 3 = renter.
            Any other code is treated as renter.
        geoadj: Array of Census SPM_GEOADJ geographic adjustment factors.
        year: The year for which to calculate thresholds.

    Returns:
        Array of SPM threshold values, one per SPM unit.

    Raises:
        ValueError: If the input arrays do not all have the same length.
    """
    n = len(num_adults)

    # Fail loudly on misaligned inputs instead of silently ignoring the
    # tail of a longer array or hitting an IndexError part-way through.
    other_lengths = {
        "num_children": len(num_children),
        "tenure_codes": len(tenure_codes),
        "geoadj": len(geoadj),
    }
    mismatched = {
        name: size for name, size in other_lengths.items() if size != n
    }
    if mismatched:
        raise ValueError(
            f"All inputs must have the same length as num_adults ({n}); "
            f"got {mismatched}"
        )

    calc = SPMCalculator(year=year)
    base_thresholds = calc.get_base_thresholds()

    # Only a handful of distinct (adults, children) compositions occur, so
    # each equivalence scale is computed once and reused.
    # NOTE(review): assumes spm_equivalence_scale is deterministic for a
    # given (adults, children) pair - confirm against spm-calculator.
    scale_by_composition = {}
    thresholds = np.zeros(n)

    units = zip(num_adults, num_children, tenure_codes, geoadj)
    for i, (adults, children, tenure, adjustment) in enumerate(units):
        tenure_str = TENURE_CODE_MAP.get(int(tenure), "renter")
        composition = (int(adults), int(children))
        equiv_scale = scale_by_composition.get(composition)
        if equiv_scale is None:
            equiv_scale = spm_equivalence_scale(*composition)
            scale_by_composition[composition] = equiv_scale
        thresholds[i] = base_thresholds[tenure_str] * equiv_scale * adjustment

    return thresholds


def calculate_spm_thresholds_national(
    num_adults: np.ndarray,
    num_children: np.ndarray,
    tenure_codes: np.ndarray,
    year: int,
) -> np.ndarray:
    """
    Calculate SPM thresholds using national-level thresholds (no geoadj).

    This is used for datasets like ACS that don't have pre-computed
    geographic adjustment factors. It delegates to
    ``calculate_spm_thresholds_with_geoadj`` with an adjustment factor of
    1.0 for every unit, so both functions share one implementation of the
    tenure lookup and equivalence-scale logic.

    Args:
        num_adults: Array of number of adults (18+) in each SPM unit.
        num_children: Array of number of children (<18) in each SPM unit.
        tenure_codes: Array of Census tenure/mortgage status codes.
            1 = owner with mortgage, 2 = owner without mortgage, 3 = renter.
        year: The year for which to calculate thresholds.

    Returns:
        Array of SPM threshold values using national averages.
    """
    # Multiplying by exactly 1.0 leaves every threshold unchanged, so the
    # result equals base * equivalence_scale for each unit.
    no_adjustment = np.ones(len(num_adults))
    return calculate_spm_thresholds_with_geoadj(
        num_adults=num_adults,
        num_children=num_children,
        tenure_codes=tenure_codes,
        geoadj=no_adjustment,
        year=year,
    )


def map_tenure_acs_to_spm(tenure_type: np.ndarray) -> np.ndarray:
    """
    Translate ACS TEN tenure values into Census SPM tenure codes.

    The recognised ACS codes carry over unchanged:
        1 = Owned with mortgage/loan -> 1 (owner_with_mortgage)
        2 = Owned free and clear     -> 2 (owner_without_mortgage)
        3 = Rented                   -> 3 (renter)
    Every other value is replaced by 3 (renter).

    Args:
        tenure_type: Array of ACS TEN values.

    Returns:
        Array of the same length holding tenure codes in the Census SPM
        format.
    """
    renter_code = 3  # fallback for values outside the recognised set
    is_recognised = np.isin(tenure_type, [1, 2, renter_code])
    return np.where(is_recognised, tenure_type, renter_code)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ dependencies = [
"sqlalchemy>=2.0.41",
"sqlmodel>=0.0.24",
"xlrd>=2.0.2",
"spm-calculator @ git+https://github.com/PolicyEngine/spm-calculator.git",
]

[project.optional-dependencies]
Expand Down
Loading