PolicyEngine · baogorek · Jun 19, 2025 · May 19, 2025 · May 19, 2025 · May 21, 2025
diff --git a/.github/check-changelog-entry.sh b/.github/check-changelog-entry.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+# Pass only when changelog_entry.yaml exists, is non-empty, and has a non-whitespace character.
+# The success path exits early; anything else falls through to the failure message.
+[ -s changelog_entry.yaml ] && grep -q '[^[:space:]]' changelog_entry.yaml && exit 0
+echo "changelog_entry.yaml is empty. Please add a changelog entry before merging."
+exit 1
diff --git a/.github/workflows/pr_changelog.yaml b/.github/workflows/pr_changelog.yaml
@@ -0,0 +1,14 @@
+name: Changelog entry
+on:
+  pull_request:
+    branches: [main]
+
+jobs:
+  require-entry:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Ensure changelog entry exists
+        run: .github/check-changelog-entry.sh
diff --git a/.gitignore b/.gitignore
@@ -3,8 +3,6 @@
 **/.DS_STORE
 **/*.h5
 **/*.csv
-!uprating_factors.csv
-!uprating_growth_factors.csv
 !healthcare_spending.csv
 !eitc.csv
 !spm_threshold_agi.csv

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,11 @@
+- bump: minor
+  changes:
+    added:
+      - Added automated checks for changelog entry
+      - New "would be qualified income" variables simulated
+      - REIT, PTP, and BDC dividend income variables simulated
+      - UBIA property is being simulated
+      - Farm Operations Income added
+    changed:
+      - W2 Wages from Qualified business is now being simulated with random variables
+      - Qualified business income sources have been redefined based on IRS PUF inputs
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -15,6 +15,7 @@
 from policyengine_us_data.utils import QRF
 import logging
 
+
 test_lite = os.environ.get("TEST_LITE")
 
 
@@ -730,6 +731,20 @@ def add_personal_income_variables(
     cps["other_medical_expenses"] = person.PMED_VAL
     cps["medicare_part_b_premiums"] = person.PEMCPREM
 
+    # Discussion #237, O3 chat: https://chatgpt.com/share/6823cb37-7a28-8001-b2bb-0c0a7f47401c
+    QBI_QUALIFICATION_PROBABILITIES = {
+        "self_employment_income": 0.8,
+        "farm_operations_income": 0.95,
+        "farm_rent_income": 0.5,
+        "rental_income": 0.4,
+        "estate_income": 0.5,
+        "partnership_s_corp_income": 0.85,
+    }
+
+    rng = np.random.default_rng(seed=43)
+    for var, prob in QBI_QUALIFICATION_PROBABILITIES.items():
+        cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob
+
 
 def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None:
     SPM_RENAMES = dict(

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -26,6 +26,8 @@
     "taxable_ira_distributions",
     "self_employment_income",
     "w2_wages_from_qualified_business",
+    "unadjusted_basis_qualified_property",
+    "business_is_sstb",
     "short_term_capital_gains",
     "qualified_dividend_income",
     "charitable_cash_donations",
@@ -69,6 +71,15 @@
     "unreported_payroll_tax",
     "recapture_of_investment_credit",
     "deductible_mortgage_interest",
+    "qualified_reit_and_ptp_income",
+    "qualified_bdc_income",
+    "farm_operations_income",
+    "estate_income_would_be_qualified",
+    "farm_operations_income_would_be_qualified",
+    "farm_rent_income_would_be_qualified",
+    "partnership_s_corp_income_would_be_qualified",
+    "rental_income_would_be_qualified",
+    "self_employment_income_would_be_qualified",
 ]
 
 

diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
@@ -14,6 +14,120 @@
 rng = np.random.default_rng(seed=64)
 
 
+def lognormal_sample(n, prob, mu, sigma):
+    """Return n draws that are 0 with probability 1 - prob, else LogNormal(mu, sigma).
+
+    Draws come from the module-level seeded ``rng`` (not the global, unseeded
+    ``np.random`` state) so that rebuilding the dataset gives identical values.
+    """
+    positive = rng.binomial(1, prob, size=n) == 1
+    amounts = rng.lognormal(mean=mu, sigma=sigma, size=n)
+    return np.where(positive, amounts, 0.0)
+
+
+def simulate_w2_and_ubia_from_puf(puf, *, seed=None, diagnostics=True):
+    """
+    Simulate two Section 199A guard-rail quantities for every record
+      • W-2 wages paid by the business
+      • Unadjusted basis immediately after acquisition (UBIA) of property
+
+    Simulation help from https://chatgpt.com/c/6835f838-a2b0-8006-ba95-c9187f2477ad
+
+    Parameters
+    ----------
+    puf : pandas.DataFrame
+        Must contain self_employment_income, farm_operations_income, farm_rent_income,
+        rental_income, estate_income and partnership_s_corp_income columns.
+    seed : int, optional
+        Seed for this function's own generator; None gives non-reproducible draws.
+    diagnostics : bool, default True
+        Print high-level checks after the simulation runs.
+
+    Returns
+    -------
+    w2_wages : 1-D NumPy array
+    ubia     : 1-D NumPy array
+    """
+
+    # ––––––––––––––––– 0.  Setup –––––––––––––––––––––––––––––––––––––––––––
+    rng = np.random.default_rng(seed)
+
+    # 1. Qualified business income ----------------------------------------------------------------
+    qbi = (
+        puf["self_employment_income"]
+        + puf["farm_operations_income"]
+        + puf["farm_rent_income"]
+        + puf["rental_income"]
+        + puf["estate_income"]
+        + puf["partnership_s_corp_income"]
+    ).to_numpy()
+
+    # Replace NANs with 0 so later math does not propagate missing values
+    qbi = np.nan_to_num(qbi, copy=False)
+
+    # 2. Simulate gross receipts by drawing a profit margin (5 – 25 %, mean 13 %) -----------------
+    margins = rng.beta(2, 3, qbi.size) * (0.25 - 0.05) + 0.05
+    revenues = np.maximum(qbi, 0) / margins  # losses are treated as zero receipts
+
+    # 3. Probability the filer has employees (Census NES: ~14 % of pass-throughs) -----------------
+    logit = -2.2 + 1.2e-6 * revenues
+    pr_has_employees = 1 / (1 + np.exp(-logit))
+    has_employees = rng.binomial(1, pr_has_employees)
+
+    # 4. Draw a labor share; lower for rental/real-estate, higher for operating businesses --------
+    is_rental = puf["rental_income"].to_numpy() > 0
+
+    labor_ratios = np.where(
+        is_rental,
+        rng.beta(1.5, 8, qbi.size) * 0.08,  # at most 8 % of receipts
+        rng.beta(2.0, 2, qbi.size) * 0.25,  # at most 25 % of receipts
+    )
+
+    w2_wages = revenues * labor_ratios * has_employees
+
+    # 5. A simple depreciation proxy (only needed to flag capital-intensive firms) ----------------
+    #    No depreciation column is available; use a rough stand-in that scales with rents.
+    depreciation_proxy = np.where(
+        is_rental,
+        rng.lognormal(
+            mean=np.log(np.abs(puf["rental_income"].to_numpy()) + 1.0),
+            sigma=0.8,
+        ),
+        0.0,
+    )
+
+    # 6. UBIA simulation – log-normal, but only for capital-heavy records -------------------------
+    is_capital_intensive = is_rental | (depreciation_proxy > 0)
+
+    ubia = np.where(
+        is_capital_intensive,
+        rng.lognormal(mean=np.log(4 * np.maximum(qbi, 0) + 1.0), sigma=1.0),
+        0.0,
+    )
+
+    # Cap UBIA at 20x |QBI| so outliers do not dominate QBI limits
+    ubia = np.minimum(ubia, 20 * np.abs(qbi))
+
+    # 7. Quick plausibility checks ----------------------------------------------------------------
+    if diagnostics:
+        has_qbi = qbi > 0
+        share_qbi_pos = np.mean(has_qbi)
+        # Condition on QBI > 0 so the figure matches its "Among those" label.
+        share_wages = np.mean(w2_wages[has_qbi] > 0) if has_qbi.any() else 0.0
+        print(f"• Share with QBI > 0                : {share_qbi_pos:6.2%}")
+        print(f"• Among those, share with W-2 wages : {share_wages:6.2%}")
+        if np.any(w2_wages > 0):
+            print(
+                f"• Mean W-2 (if >0)                 : ${np.mean(w2_wages[w2_wages>0]):,.0f}"
+            )
+        if np.any(ubia > 0):
+            print(
+                f"• Median UBIA (if >0)              : ${np.median(ubia[ubia>0]):,.0f}"
+            )
+
+    return w2_wages, ubia
+
+
 def impute_pension_contributions_to_puf(puf_df):
     from policyengine_us import Microsimulation
     from policyengine_us_data.datasets.cps import CPS_2021
@@ -154,8 +268,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     puf["educator_expense"] = puf.E03220
     puf["employment_income"] = puf.E00200
     puf["estate_income"] = puf.E26390 - puf.E26400
-    puf["farm_income"] = puf.T27800
-    puf["farm_rent_income"] = puf.E27200
+    puf["farm_income"] = puf.T27800  # Schedule J, separate from QBI
     puf["health_savings_account_ald"] = puf.E03290
     puf["interest_deduction"] = puf.E19200
     puf["long_term_capital_gains"] = puf.P23250
@@ -170,12 +283,24 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     # that can be deducted under the miscellaneous deduction.
     puf["unreimbursed_business_employee_expenses"] = puf.E20400
     puf["non_qualified_dividend_income"] = puf.E00600 - puf.E00650
-    puf["partnership_s_corp_income"] = puf.E26270
     puf["qualified_dividend_income"] = puf.E00650
     puf["qualified_tuition_expenses"] = puf.E03230
     puf["real_estate_taxes"] = puf.E18500
-    puf["rental_income"] = puf.E25850 - puf.E25860
-    puf["self_employment_income"] = puf.E00900
+    puf["rental_income"] = (
+        puf.E25850 - puf.E25860
+    )  # Schedule E rent and royalty
+    s_corp_income = puf.E26190 - puf.E26180  # Schedule E active S-Corp income
+    partnership_income = (
+        puf.E25980 - puf.E25960
+    )  # Schedule E active partnership income
+    puf["partnership_s_corp_income"] = s_corp_income + partnership_income
+    puf["farm_operations_income"] = (
+        puf.E02100
+    )  # Schedule F active farming operations
+    puf["farm_rent_income"] = puf.E27200  # Schedule E farm rental income
+    puf["self_employment_income"] = (
+        puf.E00900
+    )  # Schedule C Sole Proprietorship
     puf["self_employed_health_insurance_ald"] = puf.E03270
     puf["self_employed_pension_contribution_ald"] = puf.E03300
     puf["short_term_capital_gains"] = puf.P22250
@@ -211,15 +336,40 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     # Ignore f2441 (AMT form attached)
     # Ignore cmbtp (estimate of AMT income not in AGI)
     # Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
-    qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
-    # 10.1% passthrough rate for W2 wages hits the JCT tax expenditure target for QBID
-    # https://gist.github.com/nikhilwoodruff/262c80b8b17935d6fb8544647143b854
-    W2_WAGES_SCALE = 0.101
-    puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE
 
-    # Remove aggregate records
-    puf = puf[puf.MARS != 0]
+    # --- Qualified Business Income Deduction (QBID) simulation ---
+    w2, ubia = simulate_w2_and_ubia_from_puf(puf, seed=42)
+    puf["w2_wages_from_qualified_business"] = w2
+    puf["unadjusted_basis_qualified_property"] = ubia
+
+    # Discussion #237, O3 chat: https://chatgpt.com/share/6823cb37-7a28-8001-b2bb-0c0a7f47401c
+    sstb_prob_map_by_name = {
+        "E00900": 0.20,
+        "E26270": 0.15,
+        "E26390": 0.10,
+        "E26400": 0.10,
+    }
+
+    puf_qbi_sources_for_sstb = puf[sstb_prob_map_by_name.keys()]
+    largest_qbi_source_name = puf_qbi_sources_for_sstb.idxmax(axis=1)
+
+    pr_sstb = largest_qbi_source_name.map(sstb_prob_map_by_name).fillna(0.0)
+    puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb)
 
+    # REIT and BDC income: chatgpt.com/c/6835f502-5b48-8006-833a-76170a0acd40
+    p_reit_ptp = 0.07  # 7 % with income > 0
+    mu_reit_ptp, sigma_reit_ptp = 8.04, 1.20
+    puf["qualified_reit_and_ptp_income"] = lognormal_sample(
+        len(puf), p_reit_ptp, mu_reit_ptp, sigma_reit_ptp
+    )
+
+    # Business-development-company dividends
+    p_bdc = 0.003  # 0.3 % with income > 0
+    mu_bdc, sigma_bdc = 8.71, 1.00
+    puf["qualified_bdc_income"] = lognormal_sample(
+        len(puf), p_bdc, mu_bdc, sigma_bdc
+    )
+    # -------- End of QBID -------
     puf["filing_status"] = puf.MARS.map(
         {
             1: "SINGLE",
@@ -248,6 +398,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     "educator_expense",
     "employment_income",
     "estate_income",
+    "farm_operations_income",
     "farm_income",
     "farm_rent_income",
     "health_savings_account_ald",
@@ -257,7 +408,6 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     "unreimbursed_business_employee_expenses",
     "non_qualified_dividend_income",
     "non_sch_d_capital_gains",
-    "partnership_s_corp_income",
     "qualified_dividend_income",
     "qualified_tuition_expenses",
     "real_estate_taxes",
@@ -293,7 +443,12 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     "unreported_payroll_tax",
     "pre_tax_contributions",
     "w2_wages_from_qualified_business",
+    "unadjusted_basis_qualified_property",
+    "business_is_sstb",
     "deductible_mortgage_interest",
+    "partnership_s_corp_income",
+    "qualified_reit_and_ptp_income",
+    "qualified_bdc_income",
 ]