@staticmethod
def add_spm_variables(
    acs: h5py.File,
    person: DataFrame,
    household: DataFrame,
    time_period: int,
) -> None:
    """Compute and store the SPM poverty threshold for each ACS household.

    Each household is treated as a single SPM unit. Household members are
    counted as adults (``AGEP >= 18``) or children (``AGEP < 18``), the
    household's ``TEN`` value is mapped to an SPM tenure code, and the
    national-level threshold is written to ``acs["spm_unit_spm_threshold"]``
    in the row order of ``household``.

    Args:
        acs: Open output file that receives ``spm_unit_spm_threshold``.
        person: Person records; must provide ``household_id`` and ``AGEP``.
        household: Household records; must provide ``household_id`` and
            ``TEN``.
        time_period: Year forwarded to the threshold calculation.

    Raises:
        KeyError: If a ``household_id`` in ``household`` has no rows in
            ``person`` (the ``.loc`` lookups below reject missing labels).
    """
    from policyengine_us_data.utils.spm import (
        calculate_spm_thresholds_national,
        map_tenure_acs_to_spm,
    )

    # In ACS, SPM unit = household.
    # Group two boolean Series by household id instead of copying the whole
    # person frame just to attach helper columns. Rows with a missing age
    # satisfy neither comparison and are counted in neither group.
    person_household_ids = person["household_id"].values
    ages = person["AGEP"]
    adults_per_household = (ages >= 18).groupby(person_household_ids).sum()
    children_per_household = (ages < 18).groupby(person_household_ids).sum()

    # Align the per-household counts with the row order of `household`.
    household_ids = household["household_id"].values
    num_adults = adults_per_household.loc[household_ids].values
    num_children = children_per_household.loc[household_ids].values

    # Map ACS tenure to SPM tenure codes.
    tenure_codes = map_tenure_acs_to_spm(household["TEN"].values)

    # Calculate SPM thresholds using national-level values
    # (ACS doesn't have Census-provided geographic adjustments).
    acs["spm_unit_spm_threshold"] = calculate_spm_thresholds_national(
        num_adults=num_adults,
        num_children=num_children,
        tenure_codes=tenure_codes,
        year=time_period,
    )
# NOTE: these definitions rely on the module-level imports of
# policyengine_us_data/utils/spm.py: `import numpy as np` and
# `from spm_calculator import SPMCalculator, spm_equivalence_scale`.

# Census CPS SPM_TENMORTSTATUS codes to spm-calculator tenure mapping
# Based on IPUMS SPMMORT documentation:
#   1 = Owner with mortgage
#   2 = Owner without mortgage
#   3 = Renter
TENURE_CODE_MAP = {
    1: "owner_with_mortgage",
    2: "owner_without_mortgage",
    3: "renter",
}

# Tenure assumed for any code that is not a key of TENURE_CODE_MAP.
_DEFAULT_TENURE = "renter"
# Census SPM code for renters, used as the fallback tenure code.
_RENTER_CODE = 3


def calculate_spm_thresholds_with_geoadj(
    num_adults: np.ndarray,
    num_children: np.ndarray,
    tenure_codes: np.ndarray,
    geoadj: np.ndarray,
    year: int,
) -> np.ndarray:
    """
    Calculate SPM thresholds using Census-provided geographic adjustments.

    This function uses the SPM_GEOADJ values already computed by the Census
    Bureau, combined with spm-calculator's base thresholds and equivalence
    scale formula. This avoids the need for a Census API key.

    The result is the national-level threshold of each unit (see
    ``calculate_spm_thresholds_national``) multiplied by that unit's
    ``geoadj`` factor.

    Args:
        num_adults: Array of number of adults (18+) in each SPM unit.
        num_children: Array of number of children (<18) in each SPM unit.
        tenure_codes: Array of Census tenure/mortgage status codes.
            1 = owner with mortgage, 2 = owner without mortgage, 3 = renter.
        geoadj: Array of Census SPM_GEOADJ geographic adjustment factors.
        year: The year for which to calculate thresholds.

    Returns:
        Array of SPM threshold values.
    """
    national = calculate_spm_thresholds_national(
        num_adults=num_adults,
        num_children=num_children,
        tenure_codes=tenure_codes,
        year=year,
    )
    return national * np.asarray(geoadj, dtype=float)


def calculate_spm_thresholds_national(
    num_adults: np.ndarray,
    num_children: np.ndarray,
    tenure_codes: np.ndarray,
    year: int,
) -> np.ndarray:
    """
    Calculate SPM thresholds using national-level thresholds (no geoadj).

    This is used for datasets like ACS that don't have pre-computed
    geographic adjustment factors. Each unit's threshold is the base
    threshold for its tenure multiplied by the equivalence scale for its
    (adults, children) composition.

    Args:
        num_adults: Array of number of adults (18+) in each SPM unit.
        num_children: Array of number of children (<18) in each SPM unit.
        tenure_codes: Array of Census tenure/mortgage status codes.
            1 = owner with mortgage, 2 = owner without mortgage, 3 = renter.
            Any other non-missing code is treated as renter.
        year: The year for which to calculate thresholds.

    Returns:
        Array of SPM threshold values using national averages.

    Raises:
        ValueError: If a tenure code or a person count is NaN (it cannot be
            converted to an integer).
    """
    calc = SPMCalculator(year=year)
    base_thresholds = calc.get_base_thresholds()

    adults = np.asarray(num_adults, dtype=float)
    children = np.asarray(num_children, dtype=float)
    codes = np.asarray(tenure_codes, dtype=float)

    n = len(adults)
    if n == 0:
        return np.zeros(0)

    # Look up the base threshold once per distinct tenure code rather than
    # once per row. int() truncates like the scalar conversion would and
    # raises on NaN, so a missing code can never be silently left unfilled.
    base = np.full(n, np.nan)
    for code in np.unique(codes):
        tenure = TENURE_CODE_MAP.get(int(code), _DEFAULT_TENURE)
        base[codes == code] = base_thresholds[tenure]

    # Only a handful of distinct (adults, children) combinations exist, so
    # evaluate the equivalence scale once per combination and broadcast it
    # back to every row through the inverse index.
    pairs = np.column_stack((adults, children))
    unique_pairs, inverse = np.unique(pairs, axis=0, return_inverse=True)
    pair_scales = np.array(
        [
            spm_equivalence_scale(int(n_adults), int(n_children))
            for n_adults, n_children in unique_pairs
        ],
        dtype=float,
    )
    # reshape(-1): some NumPy releases return a 2-D inverse when axis is set.
    equiv_scale = pair_scales[inverse.reshape(-1)]

    # No geographic adjustment for national-level thresholds.
    return base * equiv_scale


def map_tenure_acs_to_spm(tenure_type: np.ndarray) -> np.ndarray:
    """
    Map ACS tenure type values to spm-calculator tenure codes.

    Args:
        tenure_type: Array (or array-like) of ACS TEN values.
            1 = Owned with mortgage/loan
            2 = Owned free and clear
            3 = Rented
            Any other value, including a missing one (NaN), is mapped to
            3 (renter).

    Returns:
        Integer array of tenure codes matching Census SPM format.
    """
    # ACS TEN codes map directly to Census SPM codes:
    #   ACS 1 (owned with mortgage) -> Census 1 (owner_with_mortgage)
    #   ACS 2 (owned outright) -> Census 2 (owner_without_mortgage)
    #   ACS 3 (rented) -> Census 3 (renter)
    codes = np.asarray(tenure_type)
    recognised = np.isin(codes, list(TENURE_CODE_MAP))
    # Every unrecognised entry (NaN included) has been replaced by the renter
    # code at this point, so the integer cast is always well defined.
    return np.where(recognised, codes, _RENTER_CODE).astype(int)