PolicyEngine · nikhilwoodruff · May 14, 2025 · May 12, 2025 · May 12, 2025 · May 12, 2025
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: minor
+  changes:
+    added:
+      - SSN card type implementation for CPS dataset.
+      - Calibration of undocumented population to 11 million based on Pew Research data.
diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py
@@ -299,5 +299,6 @@ class CensusCPS_2018(CensusCPS):
     "POTC_VAL",
     "PMED_VAL",
     "PEMCPREM",
+    "PRCITSHP",
     "NOW_GRP",
 ]
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -53,10 +53,11 @@ def generate(self):
         add_personal_variables(cps, person)
         add_personal_income_variables(cps, person, self.raw_cps.time_period)
         add_previous_year_income(self, cps)
+        add_ssn_card_type(cps, person)
         add_spm_variables(cps, spm_unit)
         add_household_variables(cps, household)
-        add_tips(self, cps)
         add_rent(self, cps, person, household)
+        add_tips(self, cps)
 
         raw_data.close()
         self.save_dataset(cps)
@@ -655,6 +656,49 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
     ].values
 
 
+def add_ssn_card_type(cps: h5py.File, person: pd.DataFrame) -> None:
+    """
+    Assign each person an SSN card type code and store its label in cps["ssn_card_type"].
+    - 1 CITIZEN: PRCITSHP 1–4
+    - 2 NON_CITIZEN_VALID_EAD: PRCITSHP 5 and (WSAL_VAL > 0 or SEMP_VAL > 0 or A_HSCOL == 2)
+    - 3 OTHER_NON_CITIZEN: a random ~30% of the remaining PRCITSHP 5 rows (fixed seed, so reproducible)
+    - 0 NONE: every row not assigned 1, 2 or 3
+    """
+    ssn_card_type = np.full(len(person), 0)  # everyone starts at code 0
+
+    # Code 1: Citizens
+    ssn_card_type[np.isin(person.PRCITSHP, [1, 2, 3, 4])] = 1
+
+    # Code 2: Noncitizens (PRCITSHP == 5) who are working or studying
+    noncitizen_mask = person.PRCITSHP == 5
+    is_worker = (person.WSAL_VAL > 0) | (person.SEMP_VAL > 0)  # any positive WSAL_VAL or SEMP_VAL
+    is_student = person.A_HSCOL == 2  # student (per the original note; A_HSCOL coding not shown here)
+    ead_like_mask = noncitizen_mask & (is_worker | is_student)
+    ssn_card_type[ead_like_mask] = 2
+
+    # Code 3: noncitizens still at 0 are moved to 3 with probability share_code_3; the rest stay 0
+    share_code_3 = 0.3  # IRS/SSA target share of SSA-benefit-only cards
+    rng = np.random.default_rng(seed=42)  # fixed seed keeps the assignment identical across runs
+    to_refine = (ssn_card_type == 0) & noncitizen_mask
+    refine_indices = np.where(to_refine)[0]  # positional indices of the rows to draw for
+
+    if len(refine_indices) > 0:
+        draw = rng.random(len(refine_indices))  # one uniform [0, 1) draw per candidate row
+        assign_code_3 = draw < share_code_3
+        ssn_card_type[refine_indices[assign_code_3]] = 3
+
+    code_to_str = {
+        0: "NONE",
+        1: "CITIZEN",
+        2: "NON_CITIZEN_VALID_EAD",
+        3: "OTHER_NON_CITIZEN",
+    }
+    ssn_card_type_str = (
+        pd.Series(ssn_card_type).map(code_to_str).astype("S").values
+    )
+    cps["ssn_card_type"] = ssn_card_type_str  # labels are written as byte strings ("S" dtype)
+
+
 def add_tips(self, cps: h5py.File):
     self.save_dataset(cps)
     from policyengine_us import Microsimulation

diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -90,3 +90,25 @@ def apply(self):
             f"{deduction} tax expenditure {tax_expenditure/1e9:.1f}bn differs from target {target/1e9:.1f}bn by {pct_error:.2%}"
         )
         assert pct_error < TOLERANCE, deduction
+
+
+def test_ssn_card_type_none_target():
+    """Check that the count of people with ssn_card_type "NONE" is within 20% of 11 million."""
+    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
+    from policyengine_us import Microsimulation
+
+    TARGET_COUNT = 11e6  # same value as the "NONE" calibration target in utils/loss.py
+    TOLERANCE = 0.2  # Allow ±20% error
+
+    sim = Microsimulation(dataset=EnhancedCPS_2024)
+
+    # Count individuals with ssn_card_type == "NONE" (presumably a weighted sum — TODO confirm)
+    ssn_type_none_mask = sim.calculate("ssn_card_type") == "NONE"
+    count = ssn_type_none_mask.sum()
+
+    pct_error = abs((count - TARGET_COUNT) / TARGET_COUNT)
+
+    print(
+        f'SSN card type "NONE" count: {count:.0f}, target: {TARGET_COUNT:.0f}, error: {pct_error:.2%}'
+    )
+    assert pct_error < TOLERANCE
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -370,6 +370,23 @@ def build_loss_matrix(dataset: type, time_period):
     if any(pd.isna(targets_array)):
         raise ValueError("Some targets are missing from the targets array")
 
+    # SSN Card Type calibration
+    for card_type_str in ["NONE"]:  # SSN card types as strings
+        ssn_type_mask = sim.calculate("ssn_card_type").values == card_type_str
+
+        # Overall count by SSN card type
+        label = f"ssa/ssn_card_type_{card_type_str.lower()}_count"
+        loss_matrix[label] = sim.map_result(
+            ssn_type_mask, "person", "household"
+        )
+
+        # Target value - replace with actual target values from SSA/IRS data
+        if card_type_str == "NONE":
+            # https://www.pewresearch.org/race-and-ethnicity/2018/11/27/u-s-unauthorized-immigrant-total-dips-to-lowest-level-in-a-decade/
+            target_count = 11e6
+
+        targets_array.append(target_count)
+
     return loss_matrix, np.array(targets_array)