Merge pull request #256 from PolicyEngine/qbid-suite

baogorek · web-flow · commit 2092e75aae0a · 2025-06-19T17:34:18.000-04:00
QBID variables added, simulations improved, changelog entry workflow logic
diff --git a/.github/check-changelog-entry.sh b/.github/check-changelog-entry.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+# Fail when changelog_entry.yaml is missing, zero-length, or whitespace-only
+# (`-s` rejects missing/empty files; grep requires one non-blank character).
+if [ ! -s changelog_entry.yaml ] || ! grep -q '[^[:space:]]' changelog_entry.yaml; then
+  echo "changelog_entry.yaml is missing or empty. Please add a changelog entry before merging." >&2
+  exit 1
+fi
diff --git a/.github/workflows/pr_changelog.yaml b/.github/workflows/pr_changelog.yaml
@@ -0,0 +1,14 @@
+name: Changelog entry
+on:
+  pull_request:
+    branches: [main]  # only pull requests targeting main are checked
+# The single job fails when the check script exits non-zero.
+jobs:
+  require-entry:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # full history; NOTE(review): the script only reads the working tree - confirm this is needed
+      - name: Ensure changelog entry exists
+        run: bash .github/check-changelog-entry.sh  # explicit bash: no reliance on the file's executable bit
diff --git a/.gitignore b/.gitignore
@@ -3,8 +3,6 @@
 **/.DS_STORE
 **/*.h5
 **/*.csv
-!uprating_factors.csv
-!uprating_growth_factors.csv
 !healthcare_spending.csv
 !eitc.csv
 !spm_threshold_agi.csv
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,11 @@
+- bump: minor
+  changes:
+    added:
+      - Added automated checks for changelog entry
+      - New "would be qualified income" variables simulated
+      - REIT, PTP, and BDC dividend income variables simulated
+      - UBIA property is being simulated
+      - Farm Operations Income added
+    changed:
+      - W-2 wages from qualified businesses are now simulated using random variables
+      - Qualified business income sources have been redefined based on IRS PUF inputs
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -15,6 +15,7 @@
 from policyengine_us_data.utils import QRF
 import logging
 
+
 test_lite = os.environ.get("TEST_LITE")
 
 
@@ -570,6 +571,21 @@ def add_personal_income_variables(
     cps["other_medical_expenses"] = person.PMED_VAL
     cps["medicare_part_b_premiums"] = person.PEMCPREM
 
+    # Get QBI simulation parameters ---
+    yamlfilename = (
+        files("policyengine_us_data")
+        / "datasets"
+        / "puf"
+        / "qbi_assumptions.yaml"
+    )
+    with open(yamlfilename, "r", encoding="utf-8") as yamlfile:
+        p = yaml.safe_load(yamlfile)
+    assert isinstance(p, dict)
+
+    rng = np.random.default_rng(seed=43)
+    for var, prob in p["qbi_qualification_probabilities"].items():
+        cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob
+
 
 def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None:
     SPM_RENAMES = dict(
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -26,6 +26,8 @@
     "taxable_ira_distributions",
     "self_employment_income",
     "w2_wages_from_qualified_business",
+    "unadjusted_basis_qualified_property",
+    "business_is_sstb",
     "short_term_capital_gains",
     "qualified_dividend_income",
     "charitable_cash_donations",
@@ -69,6 +71,15 @@
     "unreported_payroll_tax",
     "recapture_of_investment_credit",
     "deductible_mortgage_interest",
+    "qualified_reit_and_ptp_income",
+    "qualified_bdc_income",
+    "farm_operations_income",
+    "estate_income_would_be_qualified",
+    "farm_operations_income_would_be_qualified",
+    "farm_rent_income_would_be_qualified",
+    "partnership_s_corp_income_would_be_qualified",
+    "rental_income_would_be_qualified",
+    "self_employment_income_would_be_qualified",
 ]
 
 
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
@@ -1,18 +1,167 @@
+import os
+import yaml
+from importlib.resources import files
+
 from tqdm import tqdm
 import numpy as np
 import pandas as pd
 from microdf import MicroDataFrame
+
 from policyengine_core.data import Dataset
 from policyengine_us_data.storage import STORAGE_FOLDER
 from policyengine_us_data.datasets.puf.uprate_puf import uprate_puf
 from policyengine_us_data.datasets.puf.irs_puf import IRS_PUF_2015
 from policyengine_us_data.utils.uprating import (
     create_policyengine_uprating_factors_table,
 )
-import os
+
 
 rng = np.random.default_rng(seed=64)
 
+# Load QBI simulation parameters (Traversable.open works for non-filesystem installs)
+yamlfilename = (
+    files("policyengine_us_data") / "datasets" / "puf" / "qbi_assumptions.yaml"
+)
+with yamlfilename.open("r", encoding="utf-8") as yamlfile:
+    QBI_PARAMS = yaml.safe_load(yamlfile)
+assert isinstance(QBI_PARAMS, dict)
+
+
+# Helper functions ---
+def sample_bernoulli_lognormal(n, prob, log_mean, log_sigma, rng):
+    """Return n draws that are lognormal with probability ``prob``, else 0.
+
+    ``log_mean``/``log_sigma`` are passed to ``rng.lognormal``; the Bernoulli
+    indicator also comes from ``rng``, so a seeded generator is reproducible.
+    """
+    positive = rng.binomial(1, prob, size=n)
+    amounts = rng.lognormal(mean=log_mean, sigma=log_sigma, size=n)
+    return np.where(positive, amounts, 0.0)
+
+
+def conditionally_sample_lognormal(flag, target_mean, log_sigma, rng):
+    """Draw lognormals with mean ``target_mean`` where ``flag`` is set, else 0.
+
+    Non-positive target means also give 0 (no log of them is taken).
+    """
+    target_mean = np.asarray(target_mean, dtype=float)
+    positive_mean = target_mean > 0
+    # E[lognormal(mu, s)] = exp(mu + s**2 / 2), so shift mu to hit the target.
+    mu = np.log(np.where(positive_mean, target_mean, 1.0)) - (log_sigma**2 / 2)
+    draws = rng.lognormal(mean=mu, sigma=log_sigma)
+    return np.where(np.asarray(flag, dtype=bool) & positive_mean, draws, 0.0)
+
+
+def simulate_w2_and_ubia_from_puf(puf, *, seed=None, diagnostics=True):
+    """
+    Simulate two Section 199A guard-rail quantities for every record
+      - W-2 wages paid by the business
+      - Unadjusted basis immediately after acquisition (UBIA) of property
+
+    Distribution parameters come from the module-level QBI_PARAMS mapping.
+
+    Parameters
+    ----------
+    puf : pandas.DataFrame
+        Must contain "rental_income" and one column per key of
+        QBI_PARAMS["qbi_qualification_probabilities"].
+    seed : int, optional
+        For reproducible random draws.
+    diagnostics : bool, default True
+        Print high-level checks after the simulation runs.
+
+    Returns
+    -------
+    w2_wages : 1-D NumPy array
+    ubia     : 1-D NumPy array
+    """
+    rng = np.random.default_rng(seed)
+
+    # Extract Qualified Business Income simulation parameters
+    qbi_probs = QBI_PARAMS["qbi_qualification_probabilities"]
+    margin_params = QBI_PARAMS["profit_margin_distribution"]
+    logit_params = QBI_PARAMS["has_employees_logit"]
+    rental_labor = QBI_PARAMS["labor_ratio_distribution"]["rental"]
+    non_rental_labor = QBI_PARAMS["labor_ratio_distribution"]["non_rental"]
+    depr_sigma = QBI_PARAMS["depreciation_proxy_sigma"]
+    ubia_params = QBI_PARAMS["ubia_simulation"]
+
+    # Estimate qualified business income: each source weighted by its
+    # probability of qualifying
+    qbi = sum(
+        puf[income_type] * prob for income_type, prob in qbi_probs.items()
+    ).to_numpy()
+    # Losses are floored at zero for the receipts and UBIA targets
+    positive_qbi = np.maximum(qbi, 0)
+
+    # Simulate gross receipts by drawing a profit margin
+    margins = (
+        rng.beta(margin_params["beta_a"], margin_params["beta_b"], qbi.size)
+        * margin_params["scale"]
+        + margin_params["shift"]
+    )
+    revenues = positive_qbi / margins
+
+    # Probability of having employees is logistic in simulated receipts
+    logit = (
+        logit_params["intercept"] + logit_params["slope_per_dollar"] * revenues
+    )
+
+    # Set p = 0 when simulated receipts == 0 (no revenue means no payroll)
+    pr_has_employees = np.where(
+        revenues == 0.0, 0.0, 1.0 / (1.0 + np.exp(-logit))
+    )
+    has_employees = rng.binomial(1, pr_has_employees)
+
+    # Labor share simulation
+    is_rental = puf["rental_income"].to_numpy() > 0
+
+    # Both beta draws are taken for every record; np.where picks one each
+    labor_ratios = np.where(
+        is_rental,
+        rng.beta(rental_labor["beta_a"], rental_labor["beta_b"], qbi.size)
+        * rental_labor["scale"],
+        rng.beta(
+            non_rental_labor["beta_a"], non_rental_labor["beta_b"], qbi.size
+        )
+        * non_rental_labor["scale"],
+    )
+
+    # Payroll = receipts x labor share, zeroed where there are no employees
+    w2_wages = revenues * labor_ratios * has_employees
+
+    # A depreciation stand-in that scales with rents
+    depreciation_proxy = conditionally_sample_lognormal(
+        is_rental,
+        puf["rental_income"],
+        depr_sigma,
+        rng,
+    )
+
+    # UBIA simulation: lognormal, but only for capital-heavy records
+    is_capital_intensive = is_rental | (depreciation_proxy > 0)
+
+    ubia = conditionally_sample_lognormal(
+        is_capital_intensive,
+        ubia_params["multiple_of_qbi"] * positive_qbi,
+        ubia_params["sigma"],
+        rng,
+    )
+
+    if diagnostics:
+        has_qbi = qbi > 0
+        print(f"Share with QBI > 0: {np.mean(has_qbi):6.2%}")
+        # Condition on QBI > 0 so the figure matches its "Among those" label
+        if has_qbi.any():
+            share_wages = np.mean(w2_wages[has_qbi] > 0)
+            print(f"Among those, share with W-2 wages: {share_wages:6.2%}")
+        if np.any(w2_wages > 0):
+            print(f"Mean W-2 (if >0): ${np.mean(w2_wages[w2_wages>0]):,.0f}")
+        if np.any(ubia > 0):
+            print(f"Median UBIA (if >0): ${np.median(ubia[ubia>0]):,.0f}")
+
+    return w2_wages, ubia
+
 
 def impute_pension_contributions_to_puf(puf_df):
     from policyengine_us import Microsimulation
@@ -154,8 +303,8 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     puf["educator_expense"] = puf.E03220
     puf["employment_income"] = puf.E00200
     puf["estate_income"] = puf.E26390 - puf.E26400
+    # Schedule J, separate from QBI
     puf["farm_income"] = puf.T27800
-    puf["farm_rent_income"] = puf.E27200
     puf["health_savings_account_ald"] = puf.E03290
     puf["interest_deduction"] = puf.E19200
     puf["long_term_capital_gains"] = puf.P23250
@@ -170,11 +319,21 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     # that can be deducted under the miscellaneous deduction.
     puf["unreimbursed_business_employee_expenses"] = puf.E20400
     puf["non_qualified_dividend_income"] = puf.E00600 - puf.E00650
-    puf["partnership_s_corp_income"] = puf.E26270
     puf["qualified_dividend_income"] = puf.E00650
     puf["qualified_tuition_expenses"] = puf.E03230
     puf["real_estate_taxes"] = puf.E18500
+    # Schedule E rent and royalty
     puf["rental_income"] = puf.E25850 - puf.E25860
+    # Schedule E active S-Corp income
+    s_corp_income = puf.E26190 - puf.E26180
+    # Schedule E active partnership income
+    partnership_income = puf.E25980 - puf.E25960
+    puf["partnership_s_corp_income"] = s_corp_income + partnership_income
+    # Schedule F active farming operations
+    puf["farm_operations_income"] = puf.E02100
+    # Schedule E farm rental income
+    puf["farm_rent_income"] = puf.E27200
+    # Schedule C Sole Proprietorship
     puf["self_employment_income"] = puf.E00900
     puf["self_employed_health_insurance_ald"] = puf.E03270
     puf["self_employed_pension_contribution_ald"] = puf.E03300
@@ -211,15 +370,38 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     # Ignore f2441 (AMT form attached)
     # Ignore cmbtp (estimate of AMT income not in AGI)
     # Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
-    qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
-    # 10.1% passthrough rate for W2 wages hits the JCT tax expenditure target for QBID
-    # https://gist.github.com/nikhilwoodruff/262c80b8b17935d6fb8544647143b854
-    W2_WAGES_SCALE = 0.101
-    puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE
 
-    # Remove aggregate records
-    puf = puf[puf.MARS != 0]
+    # --- Qualified Business Income Deduction (QBID) simulation ---
+    w2, ubia = simulate_w2_and_ubia_from_puf(puf, seed=42)
+    puf["w2_wages_from_qualified_business"] = w2
+    puf["unadjusted_basis_qualified_property"] = ubia
 
+    puf_qbi_sources_for_sstb = puf[QBI_PARAMS["sstb_prob_map_by_name"].keys()]
+    largest_qbi_source_name = puf_qbi_sources_for_sstb.idxmax(axis=1)
+
+    pr_sstb = largest_qbi_source_name.map(
+        QBI_PARAMS["sstb_prob_map_by_name"]
+    ).fillna(0.0)
+    puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb)
+
+    reit_params = QBI_PARAMS["reit_ptp_income_distribution"]
+    p_reit_ptp = reit_params["probability_of_receiving"]
+    mu_reit_ptp = reit_params["log_normal_mu"]
+    sigma_reit_ptp = reit_params["log_normal_sigma"]
+
+    puf["qualified_reit_and_ptp_income"] = sample_bernoulli_lognormal(
+        len(puf), p_reit_ptp, mu_reit_ptp, sigma_reit_ptp, rng
+    )
+
+    bdc_params = QBI_PARAMS["bdc_income_distribution"]
+    p_bdc = bdc_params["probability_of_receiving"]
+    mu_bdc = bdc_params["log_normal_mu"]
+    sigma_bdc = bdc_params["log_normal_sigma"]
+
+    puf["qualified_bdc_income"] = sample_bernoulli_lognormal(
+        len(puf), p_bdc, mu_bdc, sigma_bdc, rng
+    )
+    # -------- End of Qualified Business Income Deduction (QBID) -------
     puf["filing_status"] = puf.MARS.map(
         {
             1: "SINGLE",
@@ -248,6 +430,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     "educator_expense",
     "employment_income",
     "estate_income",
+    "farm_operations_income",
     "farm_income",
     "farm_rent_income",
     "health_savings_account_ald",
@@ -257,7 +440,6 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     "unreimbursed_business_employee_expenses",
     "non_qualified_dividend_income",
     "non_sch_d_capital_gains",
-    "partnership_s_corp_income",
     "qualified_dividend_income",
     "qualified_tuition_expenses",
     "real_estate_taxes",
@@ -293,7 +475,12 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     "unreported_payroll_tax",
     "pre_tax_contributions",
     "w2_wages_from_qualified_business",
+    "unadjusted_basis_qualified_property",
+    "business_is_sstb",
     "deductible_mortgage_interest",
+    "partnership_s_corp_income",
+    "qualified_reit_and_ptp_income",
+    "qualified_bdc_income",
 ]
 
 
diff --git a/policyengine_us_data/datasets/puf/qbi_assumptions.yaml b/policyengine_us_data/datasets/puf/qbi_assumptions.yaml