Skip to content

Commit 42c3a18

Browse files
committed
add parent and child corr
1 parent 9cbb58f commit 42c3a18

File tree

1 file changed

+131
-0
lines changed
  • policyengine_us_data/datasets/cps

1 file changed

+131
-0
lines changed

policyengine_us_data/datasets/cps/cps.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,6 +1012,137 @@ def add_ssn_card_type(cps: h5py.File, person: pd.DataFrame) -> None:
10121012
f"Condition 14 (SSI recipients): {condition_14_count:,} people moved to code 3"
10131013
)
10141014

1015+
# ============================================================================
1016+
# DISTRIBUTION AFTER ASEC CONDITIONS
1017+
# ============================================================================
1018+
1019+
final_counts = pd.Series(ssn_card_type).value_counts().sort_index()
1020+
print(f"\nDistribution after ASEC conditions:")
1021+
print(f"Code 0 (NONE - likely undocumented): {final_counts.get(0, 0):,}")
1022+
print(f"Code 1 (CITIZEN): {final_counts.get(1, 0):,}")
1023+
print(f"Code 2 (NON_CITIZEN_VALID_EAD): {final_counts.get(2, 0):,}")
1024+
print(f"Code 3 (OTHER_NON_CITIZEN): {final_counts.get(3, 0):,}")
1025+
1026+
# ============================================================================
1027+
# FAMILY CORRELATION ADJUSTMENT
1028+
# ============================================================================
1029+
1030+
print(f"\n--- Family Correlation Adjustment ---")
1031+
1032+
# Settings for aligning children's codes with their parents': the probability of aligning a mismatched child, and a fixed-seed RNG for reproducibility
1033+
correlation_probability = 0.8
1034+
rng_family = np.random.default_rng(seed=123)
1035+
1036+
# Create a DataFrame for easier family processing
1037+
family_df = pd.DataFrame(
1038+
{
1039+
"person_id": person.PH_SEQ * 100 + person.P_SEQ,
1040+
"household_id": person.PH_SEQ,
1041+
"family_id": person.PH_SEQ * 10 + person.PF_SEQ,
1042+
"age": person.A_AGE,
1043+
"parent1_line": person.PEPAR1, # Line number of first parent
1044+
"parent2_line": person.PEPAR2, # Line number of second parent
1045+
"line_number": person.A_LINENO,
1046+
"ssn_code": ssn_card_type,
1047+
}
1048+
)
1049+
1050+
# Identify children (those with parent pointers)
1051+
children = family_df[
1052+
(family_df.parent1_line > 0) | (family_df.parent2_line > 0)
1053+
]
1054+
1055+
families_adjusted = 0
1056+
1057+
for _, child in children.iterrows():
1058+
# Only process if child is eligible (codes 0 or 3)
1059+
if child.ssn_code not in [0, 3]:
1060+
continue
1061+
1062+
# Find parents in the same household
1063+
household_members = family_df[
1064+
family_df.household_id == child.household_id
1065+
]
1066+
1067+
parents = household_members[
1068+
(household_members.line_number == child.parent1_line)
1069+
| (household_members.line_number == child.parent2_line)
1070+
]
1071+
1072+
if len(parents) > 0:
1073+
# Only consider parents who are eligible (codes 0 or 3)
1074+
eligible_parents = parents[parents.ssn_code.isin([0, 3])]
1075+
1076+
# Skip if no eligible parents
1077+
if len(eligible_parents) == 0:
1078+
continue
1079+
1080+
child_has_code_0 = child.ssn_code == 0
1081+
parents_have_code_0 = (eligible_parents.ssn_code == 0).any()
1082+
1083+
# Align only when child and eligible parents disagree on code 0, and then only with probability correlation_probability (0.8)
1084+
if child_has_code_0 != parents_have_code_0:
1085+
if rng_family.random() < correlation_probability:
1086+
child_idx = np.where(
1087+
family_df.person_id == child.person_id
1088+
)[0][0]
1089+
1090+
if parents_have_code_0 and not child_has_code_0:
1091+
# Change child to code 0 if parent has code 0
1092+
if (
1093+
ssn_card_type[child_idx] == 3
1094+
): # Only change if currently code 3
1095+
ssn_card_type[child_idx] = 0
1096+
families_adjusted += 1
1097+
elif child_has_code_0 and not parents_have_code_0:
1098+
# Change child to code 3 if parent doesn't have code 0
1099+
ssn_card_type[child_idx] = 3
1100+
families_adjusted += 1
1101+
1102+
print(
1103+
f"Family correlation adjustments: {families_adjusted:,} people affected"
1104+
)
1105+
1106+
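# Calculate the share of eligible parent-child pairs whose code-0 status matches (only among eligible families)
# NOTE(review): this reads ssn_code from children/family_df, which were built before the adjustment loop wrote to ssn_card_type - confirm the reported rate reflects the adjusted codes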
1107+
children_with_parents = []
1108+
for _, child in children.iterrows():
1109+
# Only consider eligible children
1110+
if child.ssn_code not in [0, 3]:
1111+
continue
1112+
1113+
household_members = family_df[
1114+
family_df.household_id == child.household_id
1115+
]
1116+
parents = household_members[
1117+
(household_members.line_number == child.parent1_line)
1118+
| (household_members.line_number == child.parent2_line)
1119+
]
1120+
1121+
if len(parents) > 0:
1122+
# Only consider eligible parents
1123+
eligible_parents = parents[parents.ssn_code.isin([0, 3])]
1124+
1125+
if len(eligible_parents) > 0:
1126+
child_code_0 = child.ssn_code == 0
1127+
parent_code_0 = (eligible_parents.ssn_code == 0).any()
1128+
children_with_parents.append((child_code_0, parent_code_0))
1129+
1130+
if children_with_parents:
1131+
matches = sum(
1132+
1
1133+
for child_code, parent_code in children_with_parents
1134+
if child_code == parent_code
1135+
)
1136+
correlation = matches / len(children_with_parents)
1137+
print(f"Achieved parent-child code 0 correlation: {correlation:.1%}")
1138+
print(
1139+
f"Eligible parent-child pairs analyzed: {len(children_with_parents):,}"
1140+
)
1141+
else:
1142+
print(
1143+
f"No eligible parent-child relationships found for correlation calculation"
1144+
)
1145+
10151146
# ============================================================================
10161147
# RANDOM REFINEMENT OF REMAINING CODE 0s
10171148
# ============================================================================

0 commit comments

Comments
 (0)