diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 6e4484d8..7e6f8384 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -31,7 +31,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.10' + python-version: '3.11' - name: Install package run: uv pip install -e .[dev] --system diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index f1c9ebd8..bf9b49cf 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -30,7 +30,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.10' + python-version: '3.11' - name: Install package run: uv pip install -e .[dev] --system diff --git a/.gitignore b/.gitignore index c2b68b7b..a63f8215 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ !spm_threshold_agi.csv **/_build !population_by_state.csv +**/*.pkl diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..868039ab 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Tip income. 
def add_tips(self, cps: h5py.File) -> None:
    """Impute tip income and store it on the CPS dataset as ``tip_income``.

    The dataset built so far is saved so that a ``Microsimulation`` can be
    run on it, a table of predictors is calculated for the 2025 period, and
    the SIPP-trained tip model predicts the median (0.5 quantile) tip income
    for every row of that table.

    Args:
        cps: The in-progress CPS dataset. It is updated in place with a
            ``tip_income`` entry and then saved again.

    Note:
        Assumes ``calculate_dataframe`` returns one row per person, in the
        same order as the person-level arrays already in ``cps`` — TODO
        confirm against policyengine_core.
    """
    # Persist what has been built so far; the simulation reads it from disk.
    self.save_dataset(cps)
    from policyengine_us import Microsimulation

    sim = Microsimulation(dataset=self)
    # Keep the predictors under their own name. Rebinding ``cps`` here would
    # hide the dataset, so the final save would write this feature table
    # instead and the caller's dataset would never receive ``tip_income``.
    features = sim.calculate_dataframe(
        [
            "person_id",
            "household_id",
            "employment_income",
            "age",
            "household_weight",
        ],
        2025,
    )

    features["is_under_18"] = features.age < 18
    features["is_under_6"] = features.age < 6
    # Count children per household, then broadcast the count back onto every
    # row belonging to that household.
    features["count_under_18"] = (
        features.groupby("household_id")["is_under_18"]
        .sum()
        .loc[features.household_id.values]
        .values
    )
    features["count_under_6"] = (
        features.groupby("household_id")["is_under_6"]
        .sum()
        .loc[features.household_id.values]
        .values
    )
    # Hand the model a plain pandas frame.
    features = pd.DataFrame(features)

    # Impute tips

    from policyengine_us_data.datasets.sipp import get_tip_model

    model = get_tip_model()

    predictions = model.predict(
        X_test=features,
        mean_quantile=0.5,
    )
    # Write the imputed values into the dataset itself, then save it.
    cps["tip_income"] = predictions[0.5].tip_income.values

    self.save_dataset(cps)
import pickle

import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from microdf import MicroDataFrame
from microimpute.models import QRF
from policyengine_us import Microsimulation
from policyengine_us_data.storage import STORAGE_FOLDER

_SIPP_REPO_ID = "PolicyEngine/policyengine-us-data"

# Columns read from the full SIPP person-month file. Only the first job
# (JB1) is listed; the JB2..JB7 variants are derived from it.
_SIPP_BASE_COLUMNS = [
    "SSUID",
    "PNUM",
    "MONTHCODE",
    "ERESIDENCEID",
    "ERELRPE",
    "SPANEL",
    "SWAVE",
    "WPFINWGT",
    "ESEX",
    "TAGE",
    "TAGE_EHC",
    "ERACE",
    "EORIGIN",
    "EEDUC",
    "EDEPCLM",
    "EMS",
    "EFSTATUS",
    "TJB1_TXAMT",
    "TJB1_MSUM",
    "TJB1_OCC",
    "TJB1_IND",
    "AJB1_TXAMT",
    "TPTOTINC",
]

_PREDICTORS = [
    "employment_income",
    "age",
    "count_under_18",
    "count_under_6",
]


def _expand_job_columns(columns: list[str]) -> list[str]:
    """Return ``columns`` followed by the JB2..JB7 variant of each JB1 column."""
    extra = [
        col.replace("JB1", f"JB{i}")
        for col in columns
        if "JB1" in col
        for i in range(2, 8)
    ]
    # Build a new list instead of appending to the one being iterated.
    return list(columns) + extra


def _load_sipp(download_full_sipp: bool) -> pd.DataFrame:
    """Download the requested SIPP extract into STORAGE_FOLDER and read it."""
    filename = "pu2023.csv" if download_full_sipp else "pu2023_slim.csv"
    hf_hub_download(
        repo_id=_SIPP_REPO_ID,
        filename=filename,
        repo_type="model",
        local_dir=STORAGE_FOLDER,
    )
    if download_full_sipp:
        # The full file is pipe-delimited; read only the needed columns.
        return pd.read_csv(
            STORAGE_FOLDER / filename,
            delimiter="|",
            usecols=_expand_job_columns(_SIPP_BASE_COLUMNS),
        )
    return pd.read_csv(STORAGE_FOLDER / filename)


def train_tip_model(
    *,
    download_full_sipp: bool = False,
    sample_size: int = 100_000,
    seed: int | None = None,
) -> QRF:
    """Fit a quantile random forest that predicts tip income from SIPP data.

    Args:
        download_full_sipp: Read the full ``pu2023.csv`` extract instead of
            the slim ``pu2023_slim.csv`` one.
        sample_size: Number of rows drawn, with replacement and with
            probability proportional to ``household_weight``, to form the
            training set.
        seed: Seed for the resampling step only. When ``None`` the global
            NumPy random state is used, as before.

    Returns:
        The fitted ``QRF`` model, with ``tip_income`` as the imputed variable.

    Raises:
        ValueError: If no complete rows remain to sample from.
    """
    df = _load_sipp(download_full_sipp)

    # Sum every *TXAMT column across jobs and annualise the monthly amount.
    # NOTE(review): the pattern also matches AJB*_TXAMT columns when present.
    # In SIPP naming an "A" prefix usually marks an allocation (imputation
    # status) flag rather than a dollar amount — confirm these should be
    # summed.
    tip_columns = df.columns[df.columns.str.contains("TXAMT")]
    df["tip_income"] = df[tip_columns].fillna(0).sum(axis=1) * 12
    # NOTE(review): TPTOTINC is used as the employment income proxy; confirm
    # it matches the `employment_income` concept used at prediction time.
    df["employment_income"] = df.TPTOTINC * 12
    # Children are flagged only on MONTHCODE == 12 rows, presumably so that
    # each child is counted once rather than once per monthly record.
    df["is_under_18"] = (df.TAGE < 18) & (df.MONTHCODE == 12)
    df["is_under_6"] = (df.TAGE < 6) & (df.MONTHCODE == 12)
    # Per-household child counts, broadcast back onto every household row.
    by_household = df.groupby("SSUID")
    df["count_under_18"] = by_household["is_under_18"].transform("sum")
    df["count_under_6"] = by_household["is_under_6"].transform("sum")
    df["household_weight"] = df.WPFINWGT
    df["household_id"] = df.SSUID
    df["age"] = df.TAGE

    sipp = df[
        [
            "household_id",
            "employment_income",
            "tip_income",
            "count_under_18",
            "count_under_6",
            "age",
            "household_weight",
        ]
    ].dropna()

    if sipp.empty:
        raise ValueError("No complete SIPP rows available to train on.")

    # Weighted resampling turns the survey weights into an unweighted
    # training set for the model.
    rng = np.random if seed is None else np.random.RandomState(seed)
    sampled_index = rng.choice(
        sipp.index,
        size=sample_size,
        replace=True,
        p=sipp.household_weight / sipp.household_weight.sum(),
    )
    sipp = sipp.loc[sampled_index]

    return QRF().fit(
        X_train=sipp,
        predictors=_PREDICTORS,
        imputed_variables=["tip_income"],
    )


def get_tip_model() -> QRF:
    """Return the tip model, training and caching it on first use.

    The model is cached as ``tips.pkl`` in STORAGE_FOLDER. If that file
    exists it is unpickled and returned; otherwise a model is trained with
    the default settings, pickled there, and returned.
    """
    model_path = STORAGE_FOLDER / "tips.pkl"

    if model_path.exists():
        # NOTE: unpickling executes arbitrary code. This is only safe while
        # tips.pkl is a locally produced cache file, never an untrusted one.
        with open(model_path, "rb") as f:
            return pickle.load(f)

    model = train_tip_model()
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    return model
def test_ecps_has_tips():
    """Check that the enhanced CPS carries at least $50 billion of tips."""
    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
    from policyengine_us import Microsimulation

    # The calibration target is $38 billion * 1.4 = $53.2 billion, so the
    # imputed total has to clear a $50 billion floor.
    minimum_total_tips = 50e9

    simulation = Microsimulation(dataset=EnhancedCPS_2024)
    total_tips = simulation.calculate("tip_income").sum()
    assert total_tips > minimum_total_tips
license = {file = "LICENSE"} -requires-python = ">=3.10, <3.13.0" +requires-python = ">=3.11, <3.13.0" dependencies = [ "policyengine_us>=1.197.0", "policyengine_core>=3.14.1", "requests", "tqdm", "microdf_python>=0.4.3", + "microimpute", ] [project.optional-dependencies]