Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
4c77463
add conditions
vahid-ahmadi May 23, 2025
e143d61
add changelog
vahid-ahmadi May 23, 2025
e947c58
Merge branch 'main' into ssn-impute-new
vahid-ahmadi May 26, 2025
0441bad
add new calibration
vahid-ahmadi May 27, 2025
641e67e
format
vahid-ahmadi May 27, 2025
ec48a43
edit tests
vahid-ahmadi May 27, 2025
a7804ad
edit shape
vahid-ahmadi May 27, 2025
daa254c
edit condition
vahid-ahmadi May 27, 2025
d5fe162
delete prints
vahid-ahmadi May 27, 2025
ae424f5
edit worker student condition
vahid-ahmadi May 28, 2025
78ae416
edit conditions
vahid-ahmadi May 28, 2025
2a32696
edit condition 12
vahid-ahmadi May 28, 2025
8fdaed3
edit cond12
vahid-ahmadi May 28, 2025
e2de6f8
move code 2 part
vahid-ahmadi May 28, 2025
868d84c
put targets in cps.py
vahid-ahmadi Jun 2, 2025
eefcf07
edit workers and students logic
vahid-ahmadi Jun 2, 2025
82123a4
remove CTC Reform Impact Calibration
vahid-ahmadi Jun 2, 2025
5d05bb8
relax CTC reform impact
vahid-ahmadi Jun 2, 2025
7ca653a
format
vahid-ahmadi Jun 2, 2025
79b06d7
add years targets
vahid-ahmadi Jun 2, 2025
5cfff0e
change tolerance error
vahid-ahmadi Jun 2, 2025
2eaa0c6
edit logics and print pop
vahid-ahmadi Jun 9, 2025
365d0fc
edit functions
vahid-ahmadi Jun 9, 2025
fb78b59
add documentation
vahid-ahmadi Jun 16, 2025
a6dced5
add documentation
vahid-ahmadi Jun 18, 2025
c38cef6
Merge branch 'main' into ssn-impute-new
vahid-ahmadi Jun 18, 2025
e64df1a
Fix merge conflict in test_enhanced_cps.py
vahid-ahmadi Jun 19, 2025
c6253c4
Fix Unicode encoding error in documentation update
vahid-ahmadi Jun 19, 2025
ce739be
add ctc reform into doc
vahid-ahmadi Jun 20, 2025
b090af7
family adjustment probabilistic
vahid-ahmadi Jun 20, 2025
2131da5
update documentation
vahid-ahmadi Jun 20, 2025
79205c3
edit doc
vahid-ahmadi Jun 23, 2025
b2f97f5
package versions
vahid-ahmadi Jun 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- bump: minor
changes:
added:
- SSN card type imputation algorithm.
- Family correlation adjustment to align parent-child SSN status.
19 changes: 19 additions & 0 deletions policyengine_us_data/datasets/cps/census_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,5 +302,24 @@ class CensusCPS_2018(CensusCPS):
"PRCITSHP",
"NOW_GRP",
"POCCU2",
"PEINUSYR",
"MCARE",
"PEN_SC1",
"PEN_SC2",
"RESNSS1",
"RESNSS2",
"IHSFLG",
"CAID",
"CHAMPVA",
"PEIO1COW",
"A_MJOCC",
"SS_YN",
"PEAFEVER",
"SSI_YN",
"RESNSSI1",
"RESNSSI2",
"PENATVTY",
"PEIOOCC",
"MIL",
"A_HRS1",
]
321 changes: 295 additions & 26 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def generate(self):
logging.info("Adding previous year income variables")
add_previous_year_income(self, cps)
logging.info("Adding SSN card type")
add_ssn_card_type(cps, person)
add_ssn_card_type(cps, person, spm_unit)
logging.info("Adding family variables")
add_spm_variables(cps, spm_unit)
logging.info("Adding household variables")
Expand Down Expand Up @@ -838,42 +838,311 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
].values


def add_ssn_card_type(cps: h5py.File, person: pd.DataFrame) -> None:
def add_ssn_card_type(
cps: h5py.File, person: pd.DataFrame, spm_unit: pd.DataFrame
) -> None:
"""
Deterministically assign SSA card type based on PRCITSHP and student/employment status.
Code:
- 1: Citizen (PRCITSHP 1–4)
- 2: Foreign-born, noncitizen but likely on valid EAD (student or worker)
- 0: Other noncitizens (to refine or default)
Assign SSN card type using PRCITSHP, employment status, and ASEC-UA conditions.
Codes:
- 0: "NONE" - Likely undocumented immigrants
- 1: "CITIZEN" - US citizens (born or naturalized)
- 2: "NON_CITIZEN_VALID_EAD" - Non-citizens with work/study authorization
- 3: "OTHER_NON_CITIZEN" - Non-citizens with indicators of legal status
"""
# Initialize all persons as code 0
ssn_card_type = np.full(len(person), 0)

# Code 1: Citizens
ssn_card_type[np.isin(person.PRCITSHP, [1, 2, 3, 4])] = 1
# ============================================================================
# PRIMARY CLASSIFICATIONS
# ============================================================================

# Code 1: All US Citizens (naturalized and born)
citizens_mask = np.isin(person.PRCITSHP, [1, 2, 3, 4])
ssn_card_type[citizens_mask] = 1
noncitizens = person.PRCITSHP == 5

# ============================================================================
# ASEC UNDOCUMENTED ALGORITHM CONDITIONS (13 of 14)
# Remove individuals with indicators of legal status from code 0 pool
# ============================================================================

# paper source: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4662801
# Helper mask: Only apply conditions to non-citizens without clear authorization
potentially_undocumented = ~np.isin(ssn_card_type, [1, 2])

# CONDITION 1: Pre-1982 Arrivals (IRCA Amnesty Eligible)
# PEINUSYR values indicating arrival before 1982:
# 01 = Before 1950
# 02 = 1950–1959
# 03 = 1960–1964
# 04 = 1965–1969
# 05 = 1970–1974
# 06 = 1975–1979
# 07 = 1980–1981
arrived_before_1982 = np.isin(person.PEINUSYR, [1, 2, 3, 4, 5, 6, 7])

# CONDITION 2: Eligible Naturalized Citizens
is_naturalized = person.PRCITSHP == 4
is_adult = person.A_AGE >= 18
# 5+ years in US (codes 8-26: 1982-2019)
has_five_plus_years = np.isin(person.PEINUSYR, list(range(8, 27)))
# 3+ years in US + married (codes 8-27: 1982-2021)
has_three_plus_years = np.isin(person.PEINUSYR, list(range(8, 28)))
is_married = person.A_MARITL.isin([1, 2]) & (person.A_SPOUSE > 0)
eligible_naturalized = (
is_naturalized
& is_adult
& (has_five_plus_years | (has_three_plus_years & is_married))
)

# CONDITION 3: Medicare Recipients
has_medicare = person.MCARE == 1

# CONDITION 4: Federal Retirement Benefits
has_federal_pension = np.isin(person.PEN_SC1, [3]) | np.isin(
person.PEN_SC2, [3]
) # Federal government pension

# CONDITION 5: Social Security Disability
has_ss_disability = np.isin(person.RESNSS1, [2]) | np.isin(
person.RESNSS2, [2]
) # Disabled (adult or child)

# CONDITION 6: Indian Health Service Coverage
has_ihs = person.IHSFLG == 1

# CONDITION 7: Medicaid Recipients (simplified - no state adjustments)
has_medicaid = person.CAID == 1

# CONDITION 8: CHAMPVA Recipients
has_champva = person.CHAMPVA == 1

# CONDITION 9: Military Health Insurance
has_military_insurance = person.MIL == 1

# CONDITION 10: Government Employees
is_government_worker = np.isin(
person.PEIO1COW, [1, 2, 3]
) # Fed/state/local gov
is_military_occupation = person.A_MJOCC == 11 # Military occupation
is_government_employee = is_government_worker | is_military_occupation

# CONDITION 11: Social Security Recipients
has_social_security = person.SS_YN == 1

# CONDITION 12: Housing Assistance
spm_housing_map = dict(zip(spm_unit.SPM_ID, spm_unit.SPM_CAPHOUSESUB))
has_housing_assistance = person.SPM_ID.map(spm_housing_map).fillna(0) > 0

# CONDITION 13: Veterans/Military Personnel
is_veteran = person.PEAFEVER == 1
is_current_military = person.A_MJOCC == 11
is_military_connected = is_veteran | is_current_military

# CONDITION 14: SSI Recipients (simplified - assumes all SSI is for recipient)
has_ssi = person.SSI_YN == 1

# ============================================================================
# CONSOLIDATED ASSIGNMENT OF ASSUMED DOCUMENTED STATUS
# ============================================================================

# Combine all conditions that indicate legal status
assumed_documented = (
arrived_before_1982
| eligible_naturalized
| has_medicare
| has_federal_pension
| has_ss_disability
| has_ihs
| has_medicaid
| has_champva
| has_military_insurance
| is_government_employee
| has_social_security
| has_housing_assistance
| is_military_connected
| has_ssi
)

# Apply single assignment for all conditions
ssn_card_type[potentially_undocumented & assumed_documented] = 3

# ============================================================================
# CODE 2 NON-CITIZEN WITH WORK/STUDY AUTHORIZATION
# ============================================================================

# Code 2: Non-citizens with work/study authorization (likely valid EAD)
worker_mask = (
(ssn_card_type != 3)
& noncitizens
& ((person.WSAL_VAL > 0) | (person.SEMP_VAL > 0))
)
student_mask = (ssn_card_type != 3) & noncitizens & (person.A_HSCOL == 2)

np.random.seed(0)
# In 2024, the foreign born accounted for 19.2 percent of the U.S. civilian labor force.
# https://www.bls.gov/news.release/forbrn.nr0.htm
# In Jan 2024, the total U.S. civilian labor force was reported as 167.1 million people.
# https://fred.stlouisfed.org/series/CLF16OV
# The number of unauthorized immigrant workers is 8.3 million.
# https://www.pewresearch.org/short-reads/2024/07/22/what-we-know-about-unauthorized-immigrants-living-in-the-us/
# Share of foreign-born workers who are unauthorized to work: 8.3 / (0.192 * 167.1), about 0.26,
# so about 74% of foreign-born workers are treated as authorized (the 0.74 used below).
worker_ids = person[worker_mask].index
n_worker_ead = int(0.74 * len(worker_ids))
selected_workers = np.random.choice(
worker_ids, size=n_worker_ead, replace=False
)

# Undocumented students account for approximately 21 percent of the total 1.9 million immigrant students,
# so about 79% of noncitizen students are treated as authorized (the 0.79 used below).
# https://www.higheredimmigrationportal.org/research/immigrant-origin-students-in-u-s-higher-education-updated-august-2024/
student_ids = person[student_mask].index
n_student_ead = int(0.79 * len(student_ids))
selected_students = np.random.choice(
student_ids, size=n_student_ead, replace=False
)

# Assign code 2
ssn_card_type[selected_workers] = 2
ssn_card_type[selected_students] = 2

final_counts = pd.Series(ssn_card_type).value_counts().sort_index()

# ============================================================================
# FAMILY CORRELATION ADJUSTMENT
# ============================================================================

# Identify parent-child relationships using household and family data
correlation_probability = 0.8
# Only applies to families with codes 0 or 3 (not citizens or valid EAD holders)
rng_family = np.random.default_rng(seed=123)

# Create a DataFrame for easier family processing
family_df = pd.DataFrame(
{
"person_id": person.PH_SEQ * 100 + person.P_SEQ,
"household_id": person.PH_SEQ,
"family_id": person.PH_SEQ * 10 + person.PF_SEQ,
"age": person.A_AGE,
"parent1_line": person.PEPAR1, # Line number of first parent
"parent2_line": person.PEPAR2, # Line number of second parent
"line_number": person.A_LINENO,
"ssn_code": ssn_card_type,
}
)

# Identify children (those with parent pointers)
children = family_df[
(family_df.parent1_line > 0) | (family_df.parent2_line > 0)
]

# Code 2: Noncitizens (PRCITSHP == 5) who are working or studying
noncitizen_mask = person.PRCITSHP == 5
is_worker = (person.WSAL_VAL > 0) | (person.SEMP_VAL > 0) # worker
is_student = person.A_HSCOL == 2 # student
ead_like_mask = noncitizen_mask & (is_worker | is_student)
ssn_card_type[ead_like_mask] = 2
families_adjusted = 0

# Step 3: Refine remaining 0s into 0 or 3
share_code_3 = 0.3 # IRS/SSA target share of SSA-benefit-only cards
for _, child in children.iterrows():
# Only process if child is eligible (codes 0 or 3)
if child.ssn_code not in [0, 3]:
continue

# Find parents in the same household
household_members = family_df[
family_df.household_id == child.household_id
]

parents = household_members[
(household_members.line_number == child.parent1_line)
| (household_members.line_number == child.parent2_line)
]

if len(parents) > 0:
# Only consider parents who are eligible (codes 0 or 3)
eligible_parents = parents[parents.ssn_code.isin([0, 3])]

# Skip if no eligible parents
if len(eligible_parents) == 0:
continue

child_has_code_0 = child.ssn_code == 0
parents_have_code_0 = (eligible_parents.ssn_code == 0).any()

# Check if alignment is needed (80% probability)
if child_has_code_0 != parents_have_code_0:
if rng_family.random() < correlation_probability:
child_idx = np.where(
family_df.person_id == child.person_id
)[0][0]

if parents_have_code_0 and not child_has_code_0:
# Change child to code 0 if parent has code 0
if (
ssn_card_type[child_idx] == 3
): # Only change if currently code 3
ssn_card_type[child_idx] = 0
families_adjusted += 1
elif child_has_code_0 and not parents_have_code_0:
# Change child to code 3 if parent doesn't have code 0
ssn_card_type[child_idx] = 3
families_adjusted += 1

# Calculate actual correlation (only among eligible families)
children_with_parents = []
for _, child in children.iterrows():
# Only consider eligible children
if child.ssn_code not in [0, 3]:
continue

household_members = family_df[
family_df.household_id == child.household_id
]
parents = household_members[
(household_members.line_number == child.parent1_line)
| (household_members.line_number == child.parent2_line)
]

if len(parents) > 0:
# Only consider eligible parents
eligible_parents = parents[parents.ssn_code.isin([0, 3])]

if len(eligible_parents) > 0:
child_code_0 = child.ssn_code == 0
parent_code_0 = (eligible_parents.ssn_code == 0).any()
children_with_parents.append((child_code_0, parent_code_0))

if children_with_parents:
matches = sum(
1
for child_code, parent_code in children_with_parents
if child_code == parent_code
)
correlation = matches / len(children_with_parents)
else:
pass

# ============================================================================
# RANDOM REFINEMENT OF REMAINING CODE 0s
# ============================================================================

# Apply random assignment to remaining code 0 non-citizens
# 30% assigned to code 3, 70% remain as code 0 (likely undocumented)
share_code_3 = 0.3
rng = np.random.default_rng(seed=42)
to_refine = (ssn_card_type == 0) & noncitizen_mask
refine_indices = np.where(to_refine)[0]

remaining_zeros = (ssn_card_type == 0) & (~citizens_mask)
refine_indices = np.where(remaining_zeros)[0]

if len(refine_indices) > 0:
draw = rng.random(len(refine_indices))
assign_code_3 = draw < share_code_3
ssn_card_type[refine_indices[assign_code_3]] = 3
random_draw = rng.random(len(refine_indices))
assign_to_code_3 = random_draw < share_code_3
random_count = assign_to_code_3.sum()
ssn_card_type[refine_indices[assign_to_code_3]] = 3

# ============================================================================
# CONVERT TO STRING LABELS AND STORE
# ============================================================================

code_to_str = {
0: "NONE",
1: "CITIZEN",
2: "NON_CITIZEN_VALID_EAD",
3: "OTHER_NON_CITIZEN",
0: "NONE", # Likely undocumented immigrants
1: "CITIZEN", # US citizens
2: "NON_CITIZEN_VALID_EAD", # Non-citizens with work/study authorization
3: "OTHER_NON_CITIZEN", # Non-citizens with indicators of legal status
}
ssn_card_type_str = (
pd.Series(ssn_card_type).map(code_to_str).astype("S").values
Expand Down
Loading
Loading