Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/code_changes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.10'
python-version: '3.11'

- name: Install package
run: uv pip install -e .[dev] --system
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_code_changes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.10'
python-version: '3.11'

- name: Install package
run: uv pip install -e .[dev] --system
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
!spm_threshold_agi.csv
**/_build
!population_by_state.csv
**/*.pkl
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- Tip income.
47 changes: 47 additions & 0 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def generate(self):
add_previous_year_income(self, cps)
add_spm_variables(cps, spm_unit)
add_household_variables(cps, household)
add_tips(self, cps)
add_rent(self, cps, person, household)

raw_data.close()
Expand Down Expand Up @@ -648,6 +649,52 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
].values


def add_tips(self, cps: h5py.File) -> None:
    """Impute tip income for every person and store it on the dataset.

    The current contents of ``cps`` are saved so that a ``Microsimulation``
    can be built on this dataset, predictor variables are calculated for
    2025, and the tip model returned by ``get_tip_model`` predicts the 0.5
    quantile of ``tip_income`` for each row. The prediction is written into
    ``cps`` under the ``"tip_income"`` key and the dataset is saved again.

    Args:
        cps: The in-progress CPS variable store. It is updated in place with
            a ``"tip_income"`` entry.
    """
    # The simulation below reads this dataset from storage, so persist the
    # variables built so far before constructing it.
    self.save_dataset(cps)
    from policyengine_us import Microsimulation

    sim = Microsimulation(dataset=self)
    # Keep the simulation output under its own name. Rebinding ``cps`` here
    # would hide the caller's dataset, so the imputed column would land on a
    # throwaway frame and the final save would persist only these predictors.
    predictors = sim.calculate_dataframe(
        [
            "person_id",
            "household_id",
            "employment_income",
            "age",
            "household_weight",
        ],
        2025,
    )

    predictors["is_under_18"] = predictors.age < 18
    predictors["is_under_6"] = predictors.age < 6
    # Per-household child counts, broadcast back onto each row.
    # NOTE(review): these sums run before the conversion to a plain
    # DataFrame below; if calculate_dataframe returns a weighted frame the
    # counts may be weighted -- confirm.
    predictors["count_under_18"] = (
        predictors.groupby("household_id")["is_under_18"]
        .sum()
        .loc[predictors.household_id.values]
        .values
    )
    predictors["count_under_6"] = (
        predictors.groupby("household_id")["is_under_6"]
        .sum()
        .loc[predictors.household_id.values]
        .values
    )
    predictors = pd.DataFrame(predictors)

    # Impute tips

    from policyengine_us_data.datasets.sipp import get_tip_model

    model = get_tip_model()

    # Write the prediction into the caller's dataset so later steps and the
    # final saved file both carry it.
    cps["tip_income"] = model.predict(
        X_test=predictors,
        mean_quantile=0.5,
    )[0.5].tip_income.values

    self.save_dataset(cps)


class CPS_2019(CPS):
name = "cps_2019"
label = "CPS 2019"
Expand Down
1 change: 1 addition & 0 deletions policyengine_us_data/datasets/sipp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .sipp import train_tip_model, get_tip_model
137 changes: 137 additions & 0 deletions policyengine_us_data/datasets/sipp/sipp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import pandas as pd
from microdf import MicroDataFrame
import numpy as np
from policyengine_us import Microsimulation
from microimpute.models import QRF
from policyengine_us_data.storage import STORAGE_FOLDER
import pickle
from huggingface_hub import hf_hub_download


def train_tip_model(download_full_sipp: bool = False):
    """Fit a quantile random forest that predicts tip income from SIPP data.

    The SIPP extract is downloaded from the ``PolicyEngine/policyengine-us-data``
    Hugging Face repository into ``STORAGE_FOLDER``, reshaped into the
    predictor columns used for imputation, resampled with replacement in
    proportion to ``household_weight``, and passed to ``QRF.fit``.

    Args:
        download_full_sipp: When true, download the full pipe-delimited
            ``pu2023.csv`` and read only the needed columns. When false (the
            default), download the pre-trimmed ``pu2023_slim.csv``.

    Returns:
        The object returned by ``QRF().fit`` for the ``tip_income`` target.
    """
    if download_full_sipp:
        hf_hub_download(
            repo_id="PolicyEngine/policyengine-us-data",
            filename="pu2023.csv",
            repo_type="model",
            local_dir=STORAGE_FOLDER,
        )
        base_cols = [
            "SSUID",
            "PNUM",
            "MONTHCODE",
            "ERESIDENCEID",
            "ERELRPE",
            "SPANEL",
            "SWAVE",
            "WPFINWGT",
            "ESEX",
            "TAGE",
            "TAGE_EHC",
            "ERACE",
            "EORIGIN",
            "EEDUC",
            "EDEPCLM",
            "EMS",
            "EFSTATUS",
            "TJB1_TXAMT",
            "TJB1_MSUM",
            "TJB1_OCC",
            "TJB1_IND",
            "AJB1_TXAMT",
            "TPTOTINC",
        ]
        # Each job-1 column has counterparts for jobs 2-7. Build them in a
        # separate list instead of appending to the list being iterated.
        other_job_cols = [
            col.replace("JB1", f"JB{i}")
            for col in base_cols
            if "JB1" in col
            for i in range(2, 8)
        ]

        df = pd.read_csv(
            STORAGE_FOLDER / "pu2023.csv",
            delimiter="|",
            usecols=base_cols + other_job_cols,
        )

    else:
        hf_hub_download(
            repo_id="PolicyEngine/policyengine-us-data",
            filename="pu2023_slim.csv",
            repo_type="model",
            local_dir=STORAGE_FOLDER,
        )
        df = pd.read_csv(
            STORAGE_FOLDER / "pu2023_slim.csv",
        )

    # Sum every column whose name contains "TXAMT" across all jobs, treating
    # missing values as zero, then scale by 12.
    # NOTE(review): this also picks up the AJB*_TXAMT columns; confirm they
    # hold amounts rather than status flags before relying on the total.
    df["tip_income"] = (
        df[df.columns[df.columns.str.contains("TXAMT")]].fillna(0).sum(axis=1)
        * 12
    )
    # NOTE(review): TPTOTINC is used as the employment income proxy and
    # scaled by 12 (presumably monthly to annual) -- confirm.
    df["employment_income"] = df.TPTOTINC * 12
    # Children are flagged only on MONTHCODE == 12 rows, so the per-SSUID sums
    # below count those rows alone.
    df["is_under_18"] = (df.TAGE < 18) & (df.MONTHCODE == 12)
    df["is_under_6"] = (df.TAGE < 6) & (df.MONTHCODE == 12)
    df["count_under_18"] = (
        df.groupby("SSUID")["is_under_18"].sum().loc[df.SSUID.values].values
    )
    df["count_under_6"] = (
        df.groupby("SSUID")["is_under_6"].sum().loc[df.SSUID.values].values
    )
    df["household_weight"] = df.WPFINWGT
    df["household_id"] = df.SSUID
    df["age"] = df.TAGE

    sipp = df[
        [
            "household_id",
            "employment_income",
            "tip_income",
            "count_under_18",
            "count_under_6",
            "age",
            "household_weight",
        ]
    ]

    # Drop rows with any missing value before sampling.
    sipp = sipp.dropna()

    # Draw a weighted bootstrap sample so the training rows reflect the
    # survey weights.
    sipp = sipp.loc[
        np.random.choice(
            sipp.index,
            size=100_000,
            replace=True,
            p=sipp.household_weight / sipp.household_weight.sum(),
        )
    ]

    model = QRF()

    model = model.fit(
        X_train=sipp,
        predictors=[
            "employment_income",
            "age",
            "count_under_18",
            "count_under_6",
        ],
        imputed_variables=["tip_income"],
    )

    return model


def get_tip_model() -> QRF:
    """Return the tip-income model, training and caching it on first use.

    The model is cached as ``tips.pkl`` inside ``STORAGE_FOLDER``. If that
    file exists it is unpickled and returned; otherwise ``train_tip_model``
    is called, its result is pickled to that path, and the result is
    returned.
    """
    cache_file = STORAGE_FOLDER / "tips.pkl"

    # Cache hit: reuse the previously trained model.
    if cache_file.exists():
        with open(cache_file, "rb") as handle:
            return pickle.load(handle)

    # Cache miss: train once and store the result for later calls.
    trained = train_tip_model()
    with open(cache_file, "wb") as handle:
        pickle.dump(trained, handle)
    return trained
2 changes: 2 additions & 0 deletions policyengine_us_data/storage/download_public_prerequisites.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from policyengine_us_data.utils.github import download
from pathlib import Path
from policyengine_us_data.storage import STORAGE_FOLDER
from huggingface_hub import hf_hub_download


download(
"PolicyEngine",
Expand Down
11 changes: 11 additions & 0 deletions policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ def test_ecps_has_mortgage_interest():
assert sim.calculate("deductible_mortgage_interest").sum() > 1


def test_ecps_has_tips():
    """Check that the enhanced CPS carries a plausible tip income total."""
    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
    from policyengine_us import Microsimulation

    # Floor for the aggregate: at least $50 billion of tip income must be
    # imputed. The calibration target is $38 billion * 1.4 = $53.2 billion.
    minimum_total = 50e9

    simulation = Microsimulation(dataset=EnhancedCPS_2024)
    total_tips = simulation.calculate("tip_income").sum()
    assert total_tips > minimum_total


def test_ecps_replicates_jct_tax_expenditures():
from policyengine_us import Microsimulation
from policyengine_core.reforms import Reform
Expand Down
5 changes: 5 additions & 0 deletions policyengine_us_data/utils/loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,11 @@ def build_loss_matrix(dataset: type, time_period):
# Rough estimate, not CPS derived
"real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections
"rent": 735e9, # ACS total uprated by CPI
# Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics
# shows $38,316,190,000 in Box 7: Social security tips (2018)
# Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC
# Assume 40% growth from 2018 through 2024, hence the 1.4 multiplier below.
"tip_income": 38e9 * 1.4,
}

for variable_name, target in HARD_CODED_TOTALS.items():
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@ authors = [
{name = "PolicyEngine", email = "[email protected]"},
]
license = {file = "LICENSE"}
requires-python = ">=3.10, <3.13.0"
requires-python = ">=3.11, <3.13.0"
dependencies = [
"policyengine_us>=1.197.0",
"policyengine_core>=3.14.1",
"requests",
"tqdm",
"microdf_python>=0.4.3",
"microimpute",
]

[project.optional-dependencies]
Expand Down
Loading