Impute tips (#220)

nikhilwoodruff · web-flow · commit b202c5209757 · 2025-05-13T13:47:45.000+01:00
* Add initial notebook

* Add update to notebook

* Add to data creation

* Add tips

* Update python version

* Use hf file

* Don't retrain

* Model, not dataset

* Don't download the full CSV

* Add missing import

* Add test

* Remove notebook

* Add tip income

* Add calibration of tip income

* Format

* Address comments
diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml
@@ -31,7 +31,7 @@ jobs:
           - name: Set up Python
             uses: actions/setup-python@v2
             with:
-                python-version: '3.10'
+                python-version: '3.11'
               
           - name: Install package
             run: uv pip install -e .[dev] --system
diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml
@@ -30,7 +30,7 @@ jobs:
           - name: Set up Python
             uses: actions/setup-python@v2
             with:
-                python-version: '3.10'
+                python-version: '3.11'
               
           - name: Install package
             run: uv pip install -e .[dev] --system
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@
 !spm_threshold_agi.csv
 **/_build
 !population_by_state.csv
+**/*.pkl
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    added:
+    - Tip income.
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -54,6 +54,7 @@ def generate(self):
         add_previous_year_income(self, cps)
         add_spm_variables(cps, spm_unit)
         add_household_variables(cps, household)
+        add_tips(self, cps)
         add_rent(self, cps, person, household)
 
         raw_data.close()
@@ -648,6 +649,52 @@ def add_previous_year_income(self, cps: h5py.File) -> None:
     ].values
 
 
+def add_tips(self, cps: h5py.File):
+    """Impute person-level tip income and record it on the dataset.
+
+    The in-progress dataset is saved first so that a ``Microsimulation`` can
+    be built on it. Predictor variables are then calculated for 2025, the
+    household child counts are derived, and the model returned by
+    ``get_tip_model`` predicts ``tip_income`` at the 0.5 quantile.
+
+    Args:
+        self: The dataset being generated; must provide ``save_dataset``.
+        cps: The variable store being built by ``generate``. It gains a
+            ``tip_income`` entry and is saved again before returning.
+    """
+    self.save_dataset(cps)
+    from policyengine_us import Microsimulation
+
+    sim = Microsimulation(dataset=self)
+    # Keep the predictor frame under its own name. Rebinding ``cps`` here
+    # would hide the caller's object, so the imputed values would never reach
+    # the store that ``generate`` goes on to pass to ``add_rent``.
+    predictors = sim.calculate_dataframe(
+        [
+            "person_id",
+            "household_id",
+            "employment_income",
+            "age",
+            "household_weight",
+        ],
+        2025,
+    )
+
+    # Household-level counts of children, broadcast back to every person row.
+    predictors["is_under_18"] = predictors.age < 18
+    predictors["is_under_6"] = predictors.age < 6
+    predictors["count_under_18"] = (
+        predictors.groupby("household_id")["is_under_18"]
+        .sum()
+        .loc[predictors.household_id.values]
+        .values
+    )
+    predictors["count_under_6"] = (
+        predictors.groupby("household_id")["is_under_6"]
+        .sum()
+        .loc[predictors.household_id.values]
+        .values
+    )
+    # Converted only after the derived columns are added, as before.
+    predictors = pd.DataFrame(predictors)
+
+    # Impute tips
+
+    from policyengine_us_data.datasets.sipp import get_tip_model
+
+    model = get_tip_model()
+
+    # NOTE(review): assumes the rows of ``predictors`` are in the same person
+    # order as the arrays already held in ``cps`` -- confirm.
+    cps["tip_income"] = model.predict(
+        X_test=predictors,
+        mean_quantile=0.5,
+    )[0.5].tip_income.values
+
+    self.save_dataset(cps)
+
+
 class CPS_2019(CPS):
     name = "cps_2019"
     label = "CPS 2019"
diff --git a/policyengine_us_data/datasets/sipp/__init__.py b/policyengine_us_data/datasets/sipp/__init__.py
@@ -0,0 +1 @@
+from .sipp import train_tip_model, get_tip_model
diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py
@@ -0,0 +1,137 @@
+import pandas as pd
+from microdf import MicroDataFrame
+import numpy as np
+from policyengine_us import Microsimulation
+from microimpute.models import QRF
+from policyengine_us_data.storage import STORAGE_FOLDER
+import pickle
+from huggingface_hub import hf_hub_download
+
+
+def train_tip_model():
+    """Fit a quantile random forest that predicts tip income from SIPP data.
+
+    Downloads the SIPP extract from the ``PolicyEngine/policyengine-us-data``
+    Hugging Face repository into ``STORAGE_FOLDER``, derives annualised tip
+    and income columns plus household child counts, draws a weighted
+    resample of 100,000 rows and fits a ``QRF`` on it.
+
+    The resample uses ``np.random.choice`` with no explicit seed, so two
+    calls can return differently fitted models.
+
+    Returns:
+        The value returned by ``QRF.fit`` for the ``tip_income`` target.
+    """
+    DOWNLOAD_FULL_SIPP = False
+
+    if DOWNLOAD_FULL_SIPP:
+        hf_hub_download(
+            repo_id="PolicyEngine/policyengine-us-data",
+            filename="pu2023.csv",
+            repo_type="model",
+            local_dir=STORAGE_FOLDER,
+        )
+        base_cols = [
+            "SSUID",
+            "PNUM",
+            "MONTHCODE",
+            "ERESIDENCEID",
+            "ERELRPE",
+            "SPANEL",
+            "SWAVE",
+            "WPFINWGT",
+            "ESEX",
+            "TAGE",
+            "TAGE_EHC",
+            "ERACE",
+            "EORIGIN",
+            "EEDUC",
+            "EDEPCLM",
+            "EMS",
+            "EFSTATUS",
+            "TJB1_TXAMT",
+            "TJB1_MSUM",
+            "TJB1_OCC",
+            "TJB1_IND",
+            "AJB1_TXAMT",
+            "TPTOTINC",
+        ]
+
+        # Job-level columns are listed for job 1 only; add the matching
+        # columns for jobs 2-7. They are built into a separate list so the
+        # list being iterated is never mutated.
+        extra_job_cols = [
+            col.replace("JB1", f"JB{i}")
+            for col in base_cols
+            if "JB1" in col
+            for i in range(2, 8)
+        ]
+        cols = base_cols + extra_job_cols
+
+        df = pd.read_csv(
+            STORAGE_FOLDER / "pu2023.csv",
+            delimiter="|",
+            usecols=cols,
+        )
+
+    else:
+        hf_hub_download(
+            repo_id="PolicyEngine/policyengine-us-data",
+            filename="pu2023_slim.csv",
+            repo_type="model",
+            local_dir=STORAGE_FOLDER,
+        )
+        df = pd.read_csv(
+            STORAGE_FOLDER / "pu2023_slim.csv",
+        )
+    # Sum every column whose name contains "TXAMT" (AJB*_TXAMT and
+    # TJB*_TXAMT) across all jobs, then annualise the monthly figure.
+    # NOTE(review): SIPP "A"-prefixed variables are usually allocation
+    # (imputation status) flags rather than dollar amounts -- confirm that
+    # AJB*_TXAMT really belongs in this sum.
+    df["tip_income"] = (
+        df[df.columns[df.columns.str.contains("TXAMT")]].fillna(0).sum(axis=1)
+        * 12
+    )
+    # NOTE(review): TPTOTINC looks like total personal income rather than
+    # earnings only -- confirm it is the intended stand-in for
+    # employment_income.
+    df["employment_income"] = df.TPTOTINC * 12
+    # Children are counted on MONTHCODE == 12 rows only, so a person who
+    # appears in several monthly rows is counted once per household.
+    df["is_under_18"] = (df.TAGE < 18) & (df.MONTHCODE == 12)
+    df["is_under_6"] = (df.TAGE < 6) & (df.MONTHCODE == 12)
+    df["count_under_18"] = (
+        df.groupby("SSUID")["is_under_18"].sum().loc[df.SSUID.values].values
+    )
+    df["count_under_6"] = (
+        df.groupby("SSUID")["is_under_6"].sum().loc[df.SSUID.values].values
+    )
+    df["household_weight"] = df.WPFINWGT
+    df["household_id"] = df.SSUID
+    df["age"] = df.TAGE
+
+    sipp = df[
+        [
+            "household_id",
+            "employment_income",
+            "tip_income",
+            "count_under_18",
+            "count_under_6",
+            "age",
+            "household_weight",
+        ]
+    ]
+
+    # Drop rows with any missing value before sampling.
+    sipp = sipp[~sipp.isna().any(axis=1)]
+
+    # Weighted resample with replacement: rows are drawn in proportion to
+    # household_weight, and the model is then fitted on the drawn rows.
+    sipp = sipp.loc[
+        np.random.choice(
+            sipp.index,
+            size=100_000,
+            replace=True,
+            p=sipp.household_weight / sipp.household_weight.sum(),
+        )
+    ]
+
+    model = QRF()
+
+    model = model.fit(
+        X_train=sipp,
+        predictors=[
+            "employment_income",
+            "age",
+            "count_under_18",
+            "count_under_6",
+        ],
+        imputed_variables=["tip_income"],
+    )
+
+    return model
+
+
+def get_tip_model() -> QRF:
+    """Return the tip-income model, training and caching it on first use.
+
+    The model is cached as ``tips.pkl`` inside ``STORAGE_FOLDER``. If that
+    file exists it is unpickled and returned; otherwise ``train_tip_model``
+    is called, its result is pickled to that path, and the freshly trained
+    object is returned.
+    """
+    cache_file = STORAGE_FOLDER / "tips.pkl"
+
+    if cache_file.exists():
+        # NOTE(review): unpickling runs code stored in the file, so this
+        # cache should only ever hold a locally produced model.
+        with open(cache_file, "rb") as handle:
+            return pickle.load(handle)
+
+    trained = train_tip_model()
+    with open(cache_file, "wb") as handle:
+        pickle.dump(trained, handle)
+
+    return trained
diff --git a/policyengine_us_data/storage/download_public_prerequisites.py b/policyengine_us_data/storage/download_public_prerequisites.py
@@ -1,6 +1,8 @@
 from policyengine_us_data.utils.github import download
 from pathlib import Path
 from policyengine_us_data.storage import STORAGE_FOLDER
+from huggingface_hub import hf_hub_download
+
 
 download(
     "PolicyEngine",
diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -38,6 +38,17 @@ def test_ecps_has_mortgage_interest():
     assert sim.calculate("deductible_mortgage_interest").sum() > 1
 
 
+def test_ecps_has_tips():
+    """Check that the enhanced CPS carries more than $50 billion of tips."""
+    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
+    from policyengine_us import Microsimulation
+
+    # The calibration target is $38 billion * 1.4 = $53.2 billion, so the
+    # aggregate must at least clear a $50 billion floor.
+    minimum_total = 50e9
+
+    simulation = Microsimulation(dataset=EnhancedCPS_2024)
+    total_tips = simulation.calculate("tip_income").sum()
+
+    assert total_tips > minimum_total
+
+
 def test_ecps_replicates_jct_tax_expenditures():
     from policyengine_us import Microsimulation
     from policyengine_core.reforms import Reform
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -256,6 +256,11 @@ def build_loss_matrix(dataset: type, time_period):
         # Rough estimate, not CPS derived
         "real_estate_taxes": 500e9,  # Rough estimate between 350bn and 600bn total property tax collections
         "rent": 735e9,  # ACS total uprated by CPI
+        # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics
+        # shows $38,316,190,000 in Box 7: Social security tips (2018)
+        # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC
+        # Assume 40% cumulative growth from 2018 through 2024.
+        "tip_income": 38e9 * 1.4,
     }
 
     for variable_name, target in HARD_CODED_TOTALS.items():
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,13 +15,14 @@ authors = [
     {name = "PolicyEngine", email = "hello@policyengine.org"},
 ]
 license = {file = "LICENSE"}
-requires-python = ">=3.10, <3.13.0"
+requires-python = ">=3.11, <3.13.0"
 dependencies = [
     "policyengine_us>=1.197.0",
     "policyengine_core>=3.14.1",
     "requests",
     "tqdm",
     "microdf_python>=0.4.3",
+    "microimpute",
 ]
 
 [project.optional-dependencies]

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .sipp import train_tip_model, get_tip_model`