diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29b..b79528b3 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: minor
+  changes:
+    added:
+      - SSN card type implementation for CPS dataset.
+      - Calibration of undocumented population to 11 million based on Pew Research data.
diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py
index 896140af..fe25691b 100644
--- a/policyengine_us_data/datasets/cps/census_cps.py
+++ b/policyengine_us_data/datasets/cps/census_cps.py
@@ -299,5 +299,6 @@ class CensusCPS_2018(CensusCPS):
         "POTC_VAL",
         "PMED_VAL",
         "PEMCPREM",
+        "PRCITSHP",
         "NOW_GRP",
     ]
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
index 666151ac..1ce98852 100644
--- a/policyengine_us_data/datasets/cps/cps.py
+++ b/policyengine_us_data/datasets/cps/cps.py
@@ -53,10 +53,11 @@ def generate(self):
         add_personal_variables(cps, person)
         add_personal_income_variables(cps, person, self.raw_cps.time_period)
         add_previous_year_income(self, cps)
+        add_ssn_card_type(cps, person)
         add_spm_variables(cps, spm_unit)
         add_household_variables(cps, household)
-        add_tips(self, cps)
         add_rent(self, cps, person, household)
+        add_tips(self, cps)
 
         raw_data.close()
         self.save_dataset(cps)
@@ -655,6 +656,65 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
     ].values
 
 
+def add_ssn_card_type(
+    cps: h5py.File,
+    person: pd.DataFrame,
+    share_code_3: float = 0.3,
+    seed: int = 42,
+) -> None:
+    """
+    Assign an SSN card type to each person and store it in the dataset.
+
+    Citizens and working or studying noncitizens are classified
+    directly from the CPS columns. The remaining noncitizens are split
+    between codes 0 and 3 by a seeded random draw, so repeated runs
+    with the same arguments give the same result.
+
+    Codes:
+    - 0 ("NONE"): Remaining noncitizens not drawn into code 3, and any
+      record whose PRCITSHP is outside 1-5.
+    - 1 ("CITIZEN"): PRCITSHP 1-4.
+    - 2 ("NON_CITIZEN_VALID_EAD"): PRCITSHP 5 with positive WSAL_VAL or
+      SEMP_VAL (worker), or with A_HSCOL == 2 (student).
+    - 3 ("OTHER_NON_CITIZEN"): Randomly drawn share of the remaining
+      noncitizens.
+
+    Args:
+        cps: Dataset being built; "ssn_card_type" is written to it as
+            an array of byte strings, one per row of `person`.
+        person: Person records with PRCITSHP, WSAL_VAL, SEMP_VAL and
+            A_HSCOL columns.
+        share_code_3: Probability that a remaining noncitizen receives
+            code 3 instead of staying at code 0.
+        seed: Seed for the random draw.
+    """
+    citizenship = np.asarray(person.PRCITSHP)
+    is_noncitizen = citizenship == 5
+    is_worker = np.asarray((person.WSAL_VAL > 0) | (person.SEMP_VAL > 0))
+    is_student = np.asarray(person.A_HSCOL == 2)
+
+    ssn_card_type = np.zeros(len(person), dtype=int)
+    ssn_card_type[np.isin(citizenship, [1, 2, 3, 4])] = 1
+    ssn_card_type[is_noncitizen & (is_worker | is_student)] = 2
+
+    # Split the remaining noncitizens between codes 0 and 3. The fixed
+    # default seed keeps the generated dataset reproducible.
+    rng = np.random.default_rng(seed=seed)
+    refine_indices = np.flatnonzero(is_noncitizen & (ssn_card_type == 0))
+    assign_code_3 = rng.random(len(refine_indices)) < share_code_3
+    ssn_card_type[refine_indices[assign_code_3]] = 3
+
+    code_to_str = {
+        0: "NONE",
+        1: "CITIZEN",
+        2: "NON_CITIZEN_VALID_EAD",
+        3: "OTHER_NON_CITIZEN",
+    }
+    cps["ssn_card_type"] = (
+        pd.Series(ssn_card_type).map(code_to_str).astype("S").values
+    )
+
+
 def add_tips(self, cps: h5py.File):
     self.save_dataset(cps)
     from policyengine_us import Microsimulation
diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
index 2d22fcea..e8195142 100644
--- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
+++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -90,3 +90,25 @@ def apply(self):
             f"{deduction} tax expenditure {tax_expenditure/1e9:.1f}bn differs from target {target/1e9:.1f}bn by {pct_error:.2%}"
         )
         assert pct_error < TOLERANCE, deduction
+
+
+def test_ssn_card_type_none_target():
+    """Check the "NONE" SSN card type count is close to its target."""
+    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
+    from policyengine_us import Microsimulation
+
+    TARGET_COUNT = 11e6
+    TOLERANCE = 0.2  # Allow ±20% error
+
+    sim = Microsimulation(dataset=EnhancedCPS_2024)
+
+    # Calculate the number of individuals with ssn_card_type == "NONE"
+    ssn_type_none_mask = sim.calculate("ssn_card_type") == "NONE"
+    count = ssn_type_none_mask.sum()
+
+    pct_error = abs((count - TARGET_COUNT) / TARGET_COUNT)
+
+    print(
+        f'SSN card type "NONE" count: {count:.0f}, target: {TARGET_COUNT:.0f}, error: {pct_error:.2%}'
+    )
+    assert pct_error < TOLERANCE
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
index 76884a03..8a522161 100644
--- a/policyengine_us_data/utils/loss.py
+++ b/policyengine_us_data/utils/loss.py
@@ -370,6 +370,20 @@ def build_loss_matrix(dataset: type, time_period):
     if any(pd.isna(targets_array)):
         raise ValueError("Some targets are missing from the targets array")
 
+    # SSN card type calibration. Each entry maps a card type to the
+    # targeted number of people holding it.
+    ssn_card_type_targets = {
+        # https://www.pewresearch.org/race-and-ethnicity/2018/11/27/u-s-unauthorized-immigrant-total-dips-to-lowest-level-in-a-decade/
+        "NONE": 11e6,
+    }
+    ssn_card_type = sim.calculate("ssn_card_type").values
+    for card_type_str, target_count in ssn_card_type_targets.items():
+        label = f"ssa/ssn_card_type_{card_type_str.lower()}_count"
+        loss_matrix[label] = sim.map_result(
+            ssn_card_type == card_type_str, "person", "household"
+        )
+        targets_array.append(target_count)
+
     return loss_matrix, np.array(targets_array)
 
 