Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- bump: minor
changes:
added:
- SSN card type implementation for CPS dataset.
- Calibration of undocumented population to 11 million based on Pew Research data.
1 change: 1 addition & 0 deletions policyengine_us_data/datasets/cps/census_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,5 +299,6 @@ class CensusCPS_2018(CensusCPS):
"POTC_VAL",
"PMED_VAL",
"PEMCPREM",
"PRCITSHP",
"NOW_GRP",
]
46 changes: 45 additions & 1 deletion policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,11 @@ def generate(self):
add_personal_variables(cps, person)
add_personal_income_variables(cps, person, self.raw_cps.time_period)
add_previous_year_income(self, cps)
add_ssn_card_type(cps, person)
add_spm_variables(cps, spm_unit)
add_household_variables(cps, household)
add_tips(self, cps)
add_rent(self, cps, person, household)
add_tips(self, cps)

raw_data.close()
self.save_dataset(cps)
Expand Down Expand Up @@ -655,6 +656,49 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
].values


def add_ssn_card_type(
    cps: h5py.File,
    person: pd.DataFrame,
    *,
    share_code_3: float = 0.3,
    seed: int = 42,
) -> None:
    """Assign an SSN card type label to every person record.

    Each row of ``person`` receives exactly one label, stored as a byte
    string in ``cps["ssn_card_type"]``:

    - ``CITIZEN`` (code 1): ``PRCITSHP`` is 1, 2, 3 or 4.
    - ``NON_CITIZEN_VALID_EAD`` (code 2): ``PRCITSHP == 5`` and the person
      has positive ``WSAL_VAL`` or ``SEMP_VAL`` (treated as a worker) or
      ``A_HSCOL == 2`` (treated as a student).
    - ``OTHER_NON_CITIZEN`` (code 3): a randomly drawn ``share_code_3``
      fraction of the remaining ``PRCITSHP == 5`` records.
    - ``NONE`` (code 0): every record not covered above, including any
      record whose ``PRCITSHP`` is outside 1-5.

    Codes 1 and 2 are rule-based. Code 3 is assigned by a pseudo-random
    draw; because the generator is seeded, repeated calls on the same
    input give the same result.

    Args:
        cps: Mapping-like output container; only item assignment is used.
        person: Person-level frame with ``PRCITSHP``, ``WSAL_VAL``,
            ``SEMP_VAL`` and ``A_HSCOL`` columns.
        share_code_3: Probability that a leftover noncitizen record is
            labelled ``OTHER_NON_CITIZEN`` instead of ``NONE``. The default
            is the share previously hard-coded here (described in the
            original code as the IRS/SSA target share of
            SSA-benefit-only cards).
        seed: Seed for the random generator used in the refinement step.

    Raises:
        ValueError: If ``share_code_3`` is not within ``[0, 1]``.
    """
    if not 0.0 <= share_code_3 <= 1.0:
        raise ValueError(
            f"share_code_3 must be within [0, 1], got {share_code_3!r}"
        )

    # Everyone starts as code 0 and is upgraded by the rules below.
    ssn_card_type = np.full(len(person), 0)

    # Code 1: citizens.
    ssn_card_type[np.isin(person.PRCITSHP, [1, 2, 3, 4])] = 1

    # Code 2: noncitizens who are working or studying. Masks are converted
    # to plain arrays so all indexing below is positional, independent of
    # the DataFrame's index labels.
    noncitizen_mask = (person.PRCITSHP == 5).to_numpy()
    is_worker = ((person.WSAL_VAL > 0) | (person.SEMP_VAL > 0)).to_numpy()
    is_student = (person.A_HSCOL == 2).to_numpy()
    ssn_card_type[noncitizen_mask & (is_worker | is_student)] = 2

    # Code 3: split the remaining noncitizens between code 0 and code 3.
    # Drawing zero samples is valid, so no emptiness guard is required.
    rng = np.random.default_rng(seed=seed)
    refine_indices = np.flatnonzero((ssn_card_type == 0) & noncitizen_mask)
    draw = rng.random(len(refine_indices))
    ssn_card_type[refine_indices[draw < share_code_3]] = 3

    code_to_str = {
        0: "NONE",
        1: "CITIZEN",
        2: "NON_CITIZEN_VALID_EAD",
        3: "OTHER_NON_CITIZEN",
    }
    # Stored as fixed-width byte strings.
    cps["ssn_card_type"] = (
        pd.Series(ssn_card_type).map(code_to_str).astype("S").values
    )


def add_tips(self, cps: h5py.File):
self.save_dataset(cps)
from policyengine_us import Microsimulation
Expand Down
22 changes: 22 additions & 0 deletions policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,25 @@ def apply(self):
f"{deduction} tax expenditure {tax_expenditure/1e9:.1f}bn differs from target {target/1e9:.1f}bn by {pct_error:.2%}"
)
assert pct_error < TOLERANCE, deduction


def test_ssn_card_type_none_target():
    """Check the ``ssn_card_type == "NONE"`` count against its target.

    Builds a ``Microsimulation`` on ``EnhancedCPS_2024``, sums the mask of
    people whose ``ssn_card_type`` equals ``"NONE"`` and requires the
    relative deviation from ``TARGET_COUNT`` to stay below ``TOLERANCE``.
    """
    # Imported lazily so the packages are only loaded when this test runs.
    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
    from policyengine_us import Microsimulation

    TARGET_COUNT = 11e6
    TOLERANCE = 0.2  # Allow ±20% error

    sim = Microsimulation(dataset=EnhancedCPS_2024)

    # Number of individuals with ssn_card_type == "NONE".
    # NOTE(review): presumably .sum() here is survey-weighted — confirm.
    ssn_type_none_mask = sim.calculate("ssn_card_type") == "NONE"
    count = ssn_type_none_mask.sum()

    pct_error = abs((count - TARGET_COUNT) / TARGET_COUNT)

    message = f'SSN card type "NONE" count: {count:.0f}, target: {TARGET_COUNT:.0f}, error: {pct_error:.2%}'
    print(message)
    # Reuse the printed summary so a failure reports the actual numbers.
    assert pct_error < TOLERANCE, message
17 changes: 17 additions & 0 deletions policyengine_us_data/utils/loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,23 @@ def build_loss_matrix(dataset: type, time_period):
if any(pd.isna(targets_array)):
raise ValueError("Some targets are missing from the targets array")

# SSN Card Type calibration
for card_type_str in ["NONE"]: # SSN card types as strings
ssn_type_mask = sim.calculate("ssn_card_type").values == card_type_str

# Overall count by SSN card type
label = f"ssa/ssn_card_type_{card_type_str.lower()}_count"
loss_matrix[label] = sim.map_result(
ssn_type_mask, "person", "household"
)

# Target value - replace with actual target values from SSA/IRS data
if card_type_str == "NONE":
# https://www.pewresearch.org/race-and-ethnicity/2018/11/27/u-s-unauthorized-immigrant-total-dips-to-lowest-level-in-a-decade/
target_count = 11e6

targets_array.append(target_count)

return loss_matrix, np.array(targets_array)


Expand Down
Loading