Skip to content

Commit b202c52

Browse files
Impute tips (#220)
* Add initial notebook * Add update to notebook * Add to data creation * Add tips * Update python version * Use hf file * Don't retrain * Model, not dataset * Don't download the full CSV * Add missing import * Add test * Remove notebook * Add tip income * Add calibration of tip income * Format * Address comments
1 parent 0f33a2f commit b202c52

File tree

11 files changed

+212
-3
lines changed

11 files changed

+212
-3
lines changed

.github/workflows/code_changes.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
- name: Set up Python
3232
uses: actions/setup-python@v2
3333
with:
34-
python-version: '3.10'
34+
python-version: '3.11'
3535

3636
- name: Install package
3737
run: uv pip install -e .[dev] --system

.github/workflows/pr_code_changes.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
- name: Set up Python
3131
uses: actions/setup-python@v2
3232
with:
33-
python-version: '3.10'
33+
python-version: '3.11'
3434

3535
- name: Install package
3636
run: uv pip install -e .[dev] --system

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@
1010
!spm_threshold_agi.csv
1111
**/_build
1212
!population_by_state.csv
13+
**/*.pkl

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- bump: minor
2+
changes:
3+
added:
4+
- Tip income.

policyengine_us_data/datasets/cps/cps.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def generate(self):
5454
add_previous_year_income(self, cps)
5555
add_spm_variables(cps, spm_unit)
5656
add_household_variables(cps, household)
57+
add_tips(self, cps)
5758
add_rent(self, cps, person, household)
5859

5960
raw_data.close()
@@ -648,6 +649,52 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
648649
].values
649650

650651

652+
def add_tips(self, cps: h5py.File):
    """Impute tip income for every record and save it with the dataset.

    Steps, in order:
      1. Save ``cps`` so a ``Microsimulation`` can be built on this dataset.
      2. Calculate the identifier, income, age and weight variables for 2025.
      3. Count household members under 18 and under 6.
      4. Predict ``tip_income`` with the model from ``get_tip_model`` and
         save the resulting table.

    Args:
        cps: The dataset contents built so far; passed to ``save_dataset``.
    """
    self.save_dataset(cps)
    from policyengine_us import Microsimulation

    needed_variables = [
        "person_id",
        "household_id",
        "employment_income",
        "age",
        "household_weight",
    ]
    simulation = Microsimulation(dataset=self)
    frame = simulation.calculate_dataframe(needed_variables, 2025)

    # Flag each record by age band first, then count flags per household.
    for flag_column, age_limit in (("is_under_18", 18), ("is_under_6", 6)):
        frame[flag_column] = frame.age < age_limit
    for count_column, flag_column in (
        ("count_under_18", "is_under_18"),
        ("count_under_6", "is_under_6"),
    ):
        per_household = frame.groupby("household_id")[flag_column].sum()
        # Broadcast each household's total back onto its member rows.
        frame[count_column] = per_household.loc[
            frame.household_id.values
        ].values
    frame = pd.DataFrame(frame)

    # Impute tips

    from policyengine_us_data.datasets.sipp import get_tip_model

    tip_model = get_tip_model()
    predictions = tip_model.predict(X_test=frame, mean_quantile=0.5)
    frame["tip_income"] = predictions[0.5].tip_income.values

    # NOTE(review): this saves the derived table, not the `cps` object that
    # was passed in -- confirm save_dataset merges into the stored dataset
    # rather than replacing it.
    self.save_dataset(frame)
696+
697+
651698
class CPS_2019(CPS):
652699
name = "cps_2019"
653700
label = "CPS 2019"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .sipp import train_tip_model, get_tip_model
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
import pandas as pd
2+
from microdf import MicroDataFrame
3+
import numpy as np
4+
from policyengine_us import Microsimulation
5+
from microimpute.models import QRF
6+
from policyengine_us_data.storage import STORAGE_FOLDER
7+
import pickle
8+
from huggingface_hub import hf_hub_download
9+
10+
11+
def train_tip_model(
    download_full_sipp: bool = False, sample_size: int = 100_000
):
    """Fit a quantile random forest that predicts annual tip income.

    Downloads a SIPP person-month extract from the Hugging Face hub into
    ``STORAGE_FOLDER``, derives annualised tip and income columns plus
    per-household child counts, draws a weighted bootstrap sample and fits a
    ``QRF`` on it.

    Args:
        download_full_sipp: When True, download and read the full
            pipe-delimited ``pu2023.csv``; otherwise use the pre-trimmed
            ``pu2023_slim.csv``.
        sample_size: Number of rows drawn (with replacement, weighted by
            ``household_weight``) for the training sample.

    Returns:
        The value returned by ``QRF.fit``.
    """
    if download_full_sipp:
        hf_hub_download(
            repo_id="PolicyEngine/policyengine-us-data",
            filename="pu2023.csv",
            repo_type="model",
            local_dir=STORAGE_FOLDER,
        )
        base_cols = [
            "SSUID",
            "PNUM",
            "MONTHCODE",
            "ERESIDENCEID",
            "ERELRPE",
            "SPANEL",
            "SWAVE",
            "WPFINWGT",
            "ESEX",
            "TAGE",
            "TAGE_EHC",
            "ERACE",
            "EORIGIN",
            "EEDUC",
            "EDEPCLM",
            "EMS",
            "EFSTATUS",
            "TJB1_TXAMT",
            "TJB1_MSUM",
            "TJB1_OCC",
            "TJB1_IND",
            "AJB1_TXAMT",
            "TPTOTINC",
        ]
        # Replicate every job-1 column for jobs 2 through 7. Built as a
        # separate list so the list being iterated is never mutated.
        extra_job_cols = [
            col.replace("JB1", f"JB{i}")
            for col in base_cols
            if "JB1" in col
            for i in range(2, 8)
        ]
        cols = base_cols + extra_job_cols

        df = pd.read_csv(
            STORAGE_FOLDER / "pu2023.csv",
            delimiter="|",
            usecols=cols,
        )
    else:
        hf_hub_download(
            repo_id="PolicyEngine/policyengine-us-data",
            filename="pu2023_slim.csv",
            repo_type="model",
            local_dir=STORAGE_FOLDER,
        )
        df = pd.read_csv(
            STORAGE_FOLDER / "pu2023_slim.csv",
        )

    # Sum tip columns (AJB*_TXAMT + TJB*_TXAMT) across all jobs, then
    # annualise the monthly figure.
    # NOTE(review): the "TXAMT" pattern also matches the AJB*_TXAMT columns;
    # confirm these hold dollar amounts rather than allocation flags.
    tip_columns = df.columns[df.columns.str.contains("TXAMT")]
    df["tip_income"] = df[tip_columns].fillna(0).sum(axis=1) * 12
    # NOTE(review): TPTOTINC is used as the employment income proxy --
    # confirm it is not a broader total-income measure.
    df["employment_income"] = df.TPTOTINC * 12

    # Only month-12 rows are flagged, so each person counts at most once
    # per household even though the file has one row per person-month.
    df["is_under_18"] = (df.TAGE < 18) & (df.MONTHCODE == 12)
    df["is_under_6"] = (df.TAGE < 6) & (df.MONTHCODE == 12)
    # transform("sum") broadcasts each household total back onto its rows.
    df["count_under_18"] = df.groupby("SSUID")["is_under_18"].transform("sum")
    df["count_under_6"] = df.groupby("SSUID")["is_under_6"].transform("sum")
    df["household_weight"] = df.WPFINWGT
    df["household_id"] = df.SSUID
    df["age"] = df.TAGE

    sipp = df[
        [
            "household_id",
            "employment_income",
            "tip_income",
            "count_under_18",
            "count_under_6",
            "age",
            "household_weight",
        ]
    ]

    # Drop any row with a missing value in the selected columns.
    sipp = sipp.dropna()

    # Weighted bootstrap: the fit below is unweighted, so the survey
    # weights are applied through the sampling probabilities instead.
    sipp = sipp.loc[
        np.random.choice(
            sipp.index,
            size=sample_size,
            replace=True,
            p=sipp.household_weight / sipp.household_weight.sum(),
        )
    ]

    model = QRF()

    model = model.fit(
        X_train=sipp,
        predictors=[
            "employment_income",
            "age",
            "count_under_18",
            "count_under_6",
        ],
        imputed_variables=["tip_income"],
    )

    return model
123+
124+
125+
def get_tip_model() -> QRF:
    """Return the tip-income model, training it only when no cache exists.

    The model is cached as ``tips.pkl`` inside ``STORAGE_FOLDER``. If that
    file exists it is unpickled and returned; otherwise a model is trained
    with ``train_tip_model``, pickled to that path, and returned.
    """
    model_path = STORAGE_FOLDER / "tips.pkl"

    if model_path.exists():
        # Cache hit: reuse the previously trained model.
        with open(model_path, "rb") as cache_file:
            return pickle.load(cache_file)

    # Cache miss: train once and persist for later calls.
    trained = train_tip_model()
    with open(model_path, "wb") as cache_file:
        pickle.dump(trained, cache_file)
    return trained

policyengine_us_data/storage/download_public_prerequisites.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from policyengine_us_data.utils.github import download
22
from pathlib import Path
33
from policyengine_us_data.storage import STORAGE_FOLDER
4+
from huggingface_hub import hf_hub_download
5+
46

57
download(
68
"PolicyEngine",

policyengine_us_data/tests/test_datasets/test_enhanced_cps.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,17 @@ def test_ecps_has_mortgage_interest():
3838
assert sim.calculate("deductible_mortgage_interest").sum() > 1
3939

4040

41+
def test_ecps_has_tips():
    """Check that the enhanced CPS carries a plausible tip income total."""
    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
    from policyengine_us import Microsimulation

    # Ensure we impute more than $50 billion in tip income.
    # We currently target $38 billion * 1.4 = $53.2 billion.
    TIP_INCOME_MINIMUM = 50e9

    simulation = Microsimulation(dataset=EnhancedCPS_2024)
    total_tip_income = simulation.calculate("tip_income").sum()
    assert total_tip_income > TIP_INCOME_MINIMUM
50+
51+
4152
def test_ecps_replicates_jct_tax_expenditures():
4253
from policyengine_us import Microsimulation
4354
from policyengine_core.reforms import Reform

policyengine_us_data/utils/loss.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,11 @@ def build_loss_matrix(dataset: type, time_period):
256256
# Rough estimate, not CPS derived
257257
"real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections
258258
"rent": 735e9, # ACS total uprated by CPI
259+
# Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics
260+
# shows $38,316,190,000 in Box 7: Social security tips (2018)
261+
# Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC
262+
# Assume 40% through 2024
263+
"tip_income": 38e9 * 1.4,
259264
}
260265

261266
for variable_name, target in HARD_CODED_TOTALS.items():

0 commit comments

Comments
 (0)