diff --git a/.github/check-changelog-entry.sh b/.github/check-changelog-entry.sh new file mode 100755 index 00000000..82dd76ae --- /dev/null +++ b/.github/check-changelog-entry.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Fails if changelog_entry.yaml is empty or contains only whitespace +if [ ! -s changelog_entry.yaml ] || ! grep -q '[^[:space:]]' changelog_entry.yaml; then + echo "changelog_entry.yaml is empty. Please add a changelog entry before merging." + exit 1 +fi diff --git a/.github/workflows/pr_changelog.yaml b/.github/workflows/pr_changelog.yaml new file mode 100644 index 00000000..5fd4ce87 --- /dev/null +++ b/.github/workflows/pr_changelog.yaml @@ -0,0 +1,14 @@ +name: Changelog entry +on: + pull_request: + branches: [main] + +jobs: + require-entry: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Ensure changelog entry exists + run: .github/check-changelog-entry.sh diff --git a/.gitignore b/.gitignore index b8a20b24..264d780a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,8 +3,6 @@ **/.DS_STORE **/*.h5 **/*.csv -!uprating_factors.csv -!uprating_growth_factors.csv !healthcare_spending.csv !eitc.csv !spm_threshold_agi.csv diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..fb8f2049 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,11 @@ +- bump: minor + changes: + added: + - Added automated checks for changelog entry + - New "would be qualified income" variables simulated + - REIT, PTP, and BDC dividend income variables simulated + - UBIA property is being simulated + - Farm Operations Income added + changed: + - W2 Wages from Qualified business is now being simulated with random variables + - qualified business income sources have been redefined based on IRS PUF inputs diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index ebcc3754..a08a42bc 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ 
b/policyengine_us_data/datasets/cps/cps.py @@ -15,6 +15,7 @@ from policyengine_us_data.utils import QRF import logging + test_lite = os.environ.get("TEST_LITE") @@ -570,6 +571,21 @@ def add_personal_income_variables( cps["other_medical_expenses"] = person.PMED_VAL cps["medicare_part_b_premiums"] = person.PEMCPREM + # Get QBI simulation parameters --- + yamlfilename = ( + files("policyengine_us_data") + / "datasets" + / "puf" + / "qbi_assumptions.yaml" + ) + with open(yamlfilename, "r", encoding="utf-8") as yamlfile: + p = yaml.safe_load(yamlfile) + assert isinstance(p, dict) + + rng = np.random.default_rng(seed=43) + for var, prob in p["qbi_qualification_probabilities"].items(): + cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob + def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None: SPM_RENAMES = dict( diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index f066cd25..630fac4f 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -26,6 +26,8 @@ "taxable_ira_distributions", "self_employment_income", "w2_wages_from_qualified_business", + "unadjusted_basis_qualified_property", + "business_is_sstb", "short_term_capital_gains", "qualified_dividend_income", "charitable_cash_donations", @@ -69,6 +71,15 @@ "unreported_payroll_tax", "recapture_of_investment_credit", "deductible_mortgage_interest", + "qualified_reit_and_ptp_income", + "qualified_bdc_income", + "farm_operations_income", + "estate_income_would_be_qualified", + "farm_operations_income_would_be_qualified", + "farm_rent_income_would_be_qualified", + "partnership_s_corp_income_would_be_qualified", + "rental_income_would_be_qualified", + "self_employment_income_would_be_qualified", ] diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 4b7c7c63..f1a057c4 100644 --- 
def sample_bernoulli_lognormal(n, prob, log_mean, log_sigma, rng):
    """Draw ``n`` values from a Bernoulli-lognormal mixture.

    Each value is zero with probability ``1 - prob``; otherwise it is a
    lognormal draw whose underlying normal has mean ``log_mean`` and
    standard deviation ``log_sigma`` (so ``exp(log_mean)`` is the median of
    the positive amounts, not their mean).

    Parameters
    ----------
    n : int
        Number of values to draw.
    prob : float
        Probability that a value is positive.
    log_mean : float
        Mean of the underlying normal distribution.
    log_sigma : float
        Standard deviation of the underlying normal distribution.
    rng : numpy.random.Generator
        Source of randomness for both the Bernoulli and lognormal draws.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n,)`` with zeros and positive amounts.
    """
    # Both draws must come from the caller's generator: drawing the
    # indicator from the global np.random state would make the output
    # irreproducible even when ``rng`` is seeded.
    positive = rng.binomial(1, prob, size=n)
    amounts = rng.lognormal(mean=log_mean, sigma=log_sigma, size=n)
    return np.where(positive, amounts, 0.0)


def conditionally_sample_lognormal(flag, target_mean, log_sigma, rng):
    """Draw lognormal values with a given mean where ``flag`` is true.

    The location parameter is set to ``log(target_mean) - log_sigma**2 / 2``
    so that the expected value of each draw equals ``target_mean``.

    Parameters
    ----------
    flag : array-like of bool
        Records that should receive a draw; all others get 0.0.
    target_mean : array-like of float
        Desired mean of the draw for each record.
    log_sigma : float
        Standard deviation of the underlying normal distribution.
    rng : numpy.random.Generator
        Source of randomness.

    Returns
    -------
    numpy.ndarray
        Draws where ``flag`` is true and ``target_mean`` is positive,
        0.0 everywhere else.
    """
    flag = np.asarray(flag, dtype=bool)
    target = np.asarray(target_mean, dtype=float)

    # A lognormal cannot have a non-positive mean. Substitute a harmless
    # placeholder before taking the log (avoids -inf / NaN and the
    # accompanying RuntimeWarnings) and zero those records out afterwards.
    drawable = target > 0
    mu = np.log(np.where(drawable, target, 1.0)) - (log_sigma**2 / 2)

    # One draw per element of ``mu`` regardless of the mask, so the random
    # stream consumed is independent of which records are flagged.
    draws = rng.lognormal(mean=mu, sigma=log_sigma)
    return np.where(flag & drawable, draws, 0.0)


def simulate_w2_and_ubia_from_puf(
    puf, *, seed=None, diagnostics=True, params=None
):
    """
    Simulate two Section 199A guard-rail quantities for every record
      - W-2 wages paid by the business
      - Unadjusted basis immediately after acquisition (UBIA) of property

    Parameters
    ----------
    puf : pandas.DataFrame
        Must contain a ``rental_income`` column and one column for every
        key of ``params["qbi_qualification_probabilities"]``.
    seed : int, optional
        For reproducible random draws.
    diagnostics : bool, default True
        Print high-level checks after the simulation runs.
    params : dict, optional
        Simulation assumptions with the same layout as the QBI assumptions
        YAML file. Defaults to the module-level ``QBI_PARAMS``.

    Returns
    -------
    w2_wages : 1-D NumPy array
    ubia : 1-D NumPy array
    """
    if params is None:
        params = QBI_PARAMS

    rng = np.random.default_rng(seed)

    # Extract Qualified Business Income simulation parameters
    qbi_probs = params["qbi_qualification_probabilities"]
    margin_params = params["profit_margin_distribution"]
    logit_params = params["has_employees_logit"]

    labor_params = params["labor_ratio_distribution"]
    rental_labor = labor_params["rental"]
    non_rental_labor = labor_params["non_rental"]

    depr_sigma = params["depreciation_proxy_sigma"]

    ubia_params = params["ubia_simulation"]
    ubia_multiple_of_qbi = ubia_params["multiple_of_qbi"]
    ubia_sigma = ubia_params["sigma"]

    # Estimate qualified business income as the probability-weighted sum
    # of each candidate income source
    qbi = sum(
        puf[income_type] * prob for income_type, prob in qbi_probs.items()
    ).to_numpy()
    positive_qbi = np.maximum(qbi, 0)

    # Simulate gross receipts by drawing a profit margin
    margins = (
        rng.beta(margin_params["beta_a"], margin_params["beta_b"], qbi.size)
        * margin_params["scale"]
        + margin_params["shift"]
    )
    revenues = positive_qbi / margins

    logit = (
        logit_params["intercept"] + logit_params["slope_per_dollar"] * revenues
    )

    # Set p = 0 when simulated receipts == 0 (no revenue means no payroll)
    pr_has_employees = np.where(
        revenues == 0.0, 0.0, 1.0 / (1.0 + np.exp(-logit))
    )
    has_employees = rng.binomial(1, pr_has_employees)

    # Labor share simulation. Both beta arrays are always drawn (rental
    # first) so the random stream does not depend on the rental mask.
    is_rental = puf["rental_income"].to_numpy() > 0
    rental_ratios = (
        rng.beta(rental_labor["beta_a"], rental_labor["beta_b"], qbi.size)
        * rental_labor["scale"]
    )
    non_rental_ratios = (
        rng.beta(
            non_rental_labor["beta_a"], non_rental_labor["beta_b"], qbi.size
        )
        * non_rental_labor["scale"]
    )
    labor_ratios = np.where(is_rental, rental_ratios, non_rental_ratios)

    w2_wages = revenues * labor_ratios * has_employees

    # A depreciation stand-in that scales with rents
    depreciation_proxy = conditionally_sample_lognormal(
        is_rental,
        puf["rental_income"],
        depr_sigma,
        rng,
    )

    # UBIA simulation: lognormal, but only for capital-heavy records.
    # The proxy is positive only for rental records, so this mask currently
    # coincides with is_rental; the proxy draw is kept so seeded results
    # stay unchanged.
    is_capital_intensive = is_rental | (depreciation_proxy > 0)

    ubia = conditionally_sample_lognormal(
        is_capital_intensive,
        ubia_multiple_of_qbi * positive_qbi,
        ubia_sigma,
        rng,
    )

    if diagnostics:
        has_qbi = qbi > 0
        share_qbi_pos = np.mean(has_qbi) if qbi.size else 0.0
        # Conditional share: only records with positive QBI are in the
        # denominator, matching the "Among those" label.
        share_wages = np.mean(w2_wages[has_qbi] > 0) if has_qbi.any() else 0.0
        print(f"Share with QBI > 0: {share_qbi_pos:6.2%}")
        print(f"Among those, share with W-2 wages: {share_wages:6.2%}")
        if np.any(w2_wages > 0):
            print(f"Mean W-2 (if >0): ${np.mean(w2_wages[w2_wages>0]):,.0f}")
        if np.any(ubia > 0):
            print(f"Median UBIA (if >0): ${np.median(ubia[ubia>0]):,.0f}")

    return w2_wages, ubia
puf["unreimbursed_business_employee_expenses"] = puf.E20400 puf["non_qualified_dividend_income"] = puf.E00600 - puf.E00650 - puf["partnership_s_corp_income"] = puf.E26270 puf["qualified_dividend_income"] = puf.E00650 puf["qualified_tuition_expenses"] = puf.E03230 puf["real_estate_taxes"] = puf.E18500 + # Schedule E rent and royalty puf["rental_income"] = puf.E25850 - puf.E25860 + # Schedule E active S-Corp income + s_corp_income = puf.E26190 - puf.E26180 + # Schedule E active partnership income + partnership_income = puf.E25980 - puf.E25960 + puf["partnership_s_corp_income"] = s_corp_income + partnership_income + # Schedule F active farming operations + puf["farm_operations_income"] = puf.E02100 + # Schedule E farm rental income + puf["farm_rent_income"] = puf.E27200 + # Schedule C Sole Proprietorship puf["self_employment_income"] = puf.E00900 puf["self_employed_health_insurance_ald"] = puf.E03270 puf["self_employed_pension_contribution_ald"] = puf.E03300 @@ -211,15 +370,38 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: # Ignore f2441 (AMT form attached) # Ignore cmbtp (estimate of AMT income not in AGI) # Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income) - qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200) - # 10.1% passthrough rate for W2 wages hits the JCT tax expenditure target for QBID - # https://gist.github.com/nikhilwoodruff/262c80b8b17935d6fb8544647143b854 - W2_WAGES_SCALE = 0.101 - puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE - # Remove aggregate records - puf = puf[puf.MARS != 0] + # --- Qualified Business Income Deduction (QBID) simulation --- + w2, ubia = simulate_w2_and_ubia_from_puf(puf, seed=42) + puf["w2_wages_from_qualified_business"] = w2 + puf["unadjusted_basis_qualified_property"] = ubia + puf_qbi_sources_for_sstb = puf[QBI_PARAMS["sstb_prob_map_by_name"].keys()] + largest_qbi_source_name = puf_qbi_sources_for_sstb.idxmax(axis=1) + + pr_sstb = 
largest_qbi_source_name.map( + QBI_PARAMS["sstb_prob_map_by_name"] + ).fillna(0.0) + puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb) + + reit_params = QBI_PARAMS["reit_ptp_income_distribution"] + p_reit_ptp = reit_params["probability_of_receiving"] + mu_reit_ptp = reit_params["log_normal_mu"] + sigma_reit_ptp = reit_params["log_normal_sigma"] + + puf["qualified_reit_and_ptp_income"] = sample_bernoulli_lognormal( + len(puf), p_reit_ptp, mu_reit_ptp, sigma_reit_ptp, rng + ) + + bdc_params = QBI_PARAMS["bdc_income_distribution"] + p_bdc = bdc_params["probability_of_receiving"] + mu_bdc = bdc_params["log_normal_mu"] + sigma_bdc = bdc_params["log_normal_sigma"] + + puf["qualified_bdc_income"] = sample_bernoulli_lognormal( + len(puf), p_bdc, mu_bdc, sigma_bdc, rng + ) + # -------- End of Qualified Business Income Deduction (QBID) ------- puf["filing_status"] = puf.MARS.map( { 1: "SINGLE", @@ -248,6 +430,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: "educator_expense", "employment_income", "estate_income", + "farm_operations_income", "farm_income", "farm_rent_income", "health_savings_account_ald", @@ -257,7 +440,6 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: "unreimbursed_business_employee_expenses", "non_qualified_dividend_income", "non_sch_d_capital_gains", - "partnership_s_corp_income", "qualified_dividend_income", "qualified_tuition_expenses", "real_estate_taxes", @@ -293,7 +475,12 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: "unreported_payroll_tax", "pre_tax_contributions", "w2_wages_from_qualified_business", + "unadjusted_basis_qualified_property", + "business_is_sstb", "deductible_mortgage_interest", + "partnership_s_corp_income", + "qualified_reit_and_ptp_income", + "qualified_bdc_income", ] diff --git a/policyengine_us_data/datasets/puf/qbi_assumptions.yaml b/policyengine_us_data/datasets/puf/qbi_assumptions.yaml new file mode 100644 index 00000000..943b87f1 --- /dev/null +++ 
b/policyengine_us_data/datasets/puf/qbi_assumptions.yaml @@ -0,0 +1,64 @@ +# Probabilistic assumptions for Section 199A qualification and SSTB classification +# QBI assumptions: https://chatgpt.com/share/6823cb37-7a28-8001-b2bb-0c0a7f47401c +# UBIA assumptions: https://chatgpt.com/share/683b12a5-78dc-8006-81c9-479858312b30 +# REIT/PTP and BDC assumptions: https://chatgpt.com/c/6835f502-5b48-8006-833a-76170a0acd40 + + +qbi_qualification_probabilities: + self_employment_income: 0.8 + farm_operations_income: 0.95 + farm_rent_income: 0.5 + rental_income: 0.4 + estate_income: 0.5 + partnership_s_corp_income: 0.85 + +sstb_prob_map_by_name: + E00900: 0.20 + E26270: 0.15 + E26390: 0.10 + E26400: 0.10 + +# Below, we assume that 7% of filers have nonzero REIT/PTP income, +# and among those 7%, REIT/PTP income is lognormally distributed with a +# median of exp(8.04) = $3,103 (log-scale mean 8.04, log-scale sigma 1.20). +reit_ptp_income_distribution: + probability_of_receiving: 0.07 + log_normal_mu: 8.04 + log_normal_sigma: 1.20 + +bdc_income_distribution: + probability_of_receiving: 0.003 + log_normal_mu: 8.71 + log_normal_sigma: 1.00 + +profit_margin_distribution: + beta_a: 2.0 + beta_b: 3.0 + scale: 0.20 + shift: 0.05 + +# Logistic model: p = 1 / (1 + exp(-(b0 + b1 * receipts))) +# * b1 = 1.2e-6: odds roughly triple for each +$1M of receipts; p reaches 50% near $2.6M (= 3.1 / 1.2e-6) +# * b0 = -3.1: tuned so mean pr is roughly 14% in this PUF (matches SOI share) +has_employees_logit: + intercept: -3.1 + slope_per_dollar: 1.2e-6 + +labor_ratio_distribution: + rental: + beta_a: 1.5 + beta_b: 8.0 + scale: 0.08 + non_rental: + beta_a: 2.0 + beta_b: 2.0 + scale: 0.25 + +depreciation_proxy_sigma: 0.8 + +# lognormal(mean = 4 * QBI, sigma = 1) +# produces a right-skewed spread whose mean is roughly 4 * QBI, +# matching aggregate SOI ratios. +ubia_simulation: + multiple_of_qbi: 4.0 + sigma: 1.0