Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
3cf6658
qbid inputs in puf
baogorek May 19, 2025
cc249f8
Getting the new variables in all files
baogorek May 19, 2025
3be9ebd
checking on the variables I deleted
baogorek May 21, 2025
1722f02
first pass at new variables
baogorek May 22, 2025
c33e3bf
fixing issues, bringing original variables back in some cases
baogorek May 22, 2025
6f01054
Use per-component probabilities for QBI qualification
baogorek May 27, 2025
a30ff1a
Merge pull request #274 from PolicyEngine/codex/add-would_be_qualifie…
baogorek May 27, 2025
df63e10
new percentages
baogorek May 27, 2025
5c793cf
cps proportions
baogorek May 27, 2025
4cf5804
adding better simulations in IRS/PUF data
baogorek May 27, 2025
9942f34
dataset updates with _would_be formulas added
baogorek May 27, 2025
f1d5052
uncommenting code
baogorek May 28, 2025
47a6322
Format CPS and PUF datasets with Black
baogorek May 28, 2025
6e78204
Merge pull request #277 from PolicyEngine/codex/fix-code-style-with-b…
baogorek May 28, 2025
8acb06a
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek May 28, 2025
7f76edd
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek May 29, 2025
1596072
Add changelog entry check workflow
baogorek May 30, 2025
0a22ac0
Merge pull request #280 from PolicyEngine/codex/add-github-action-for…
baogorek May 30, 2025
a6878b6
added a changelog entry
baogorek May 30, 2025
17e7e8f
removing uprating factors exemption from .gitignore
baogorek May 30, 2025
f1c2927
moved dictionary of values to utils. improving formulas and documenta…
baogorek May 31, 2025
dff8acb
slighly modified w2 wages and ubia simulation
baogorek May 31, 2025
806c745
turning O3 link into actual share link
baogorek May 31, 2025
13b8337
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek Jun 17, 2025
7be076b
adding to assumptions and merging in main branch
baogorek Jun 17, 2025
12a3f5f
parameters in yaml
baogorek Jun 17, 2025
3c10e5b
Update policyengine_us_data/datasets/puf/puf.py
baogorek Jun 17, 2025
1c07561
Revert "Update policyengine_us_data/datasets/puf/puf.py"
baogorek Jun 17, 2025
01646f9
parameters in yaml file
baogorek Jun 17, 2025
d6eacfd
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek Jun 19, 2025
fe669fc
assumptions and encapsulation of wage and ubia simulation
baogorek Jun 19, 2025
155a3ee
left out rng
baogorek Jun 19, 2025
bebd0f3
left out rng again
baogorek Jun 19, 2025
de0ef90
yaml comment
baogorek Jun 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/check-changelog-entry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

# Guard for pull requests: succeed only when changelog_entry.yaml (looked up
# relative to the current working directory) exists, is non-empty, and holds
# at least one non-whitespace character. Otherwise print a hint and exit 1.
entry_file="changelog_entry.yaml"

# -s is true for an existing file of non-zero size; grep then confirms the
# content is more than blank space.
if [ -s "$entry_file" ] && grep -q '[^[:space:]]' "$entry_file"; then
    exit 0
fi

echo "changelog_entry.yaml is empty. Please add a changelog entry before merging."
exit 1
14 changes: 14 additions & 0 deletions .github/workflows/pr_changelog.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Workflow that runs the changelog-entry check script on pull requests.
name: Changelog entry
on:
# Triggered for pull requests whose base branch is main.
pull_request:
branches: [main]

jobs:
require-entry:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
# fetch-depth: 0 asks actions/checkout for the full history instead of a shallow clone.
fetch-depth: 0
- name: Ensure changelog entry exists
# The job fails if this script exits with a non-zero status.
run: .github/check-changelog-entry.sh
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
**/.DS_STORE
**/*.h5
**/*.csv
!uprating_factors.csv
!uprating_growth_factors.csv
!healthcare_spending.csv
!eitc.csv
!spm_threshold_agi.csv
Expand Down
11 changes: 11 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
- bump: minor
changes:
added:
- Added automated checks for changelog entry
- New "would be qualified income" variables simulated
- REIT, PTP, and BDC dividend income variables simulated
- UBIA property is being simulated
- Farm Operations Income added
changed:
- W-2 wages from qualified businesses are now simulated with random variables
- Qualified business income sources have been redefined based on IRS PUF inputs
16 changes: 16 additions & 0 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from policyengine_us_data.utils import QRF
import logging


test_lite = os.environ.get("TEST_LITE")


Expand Down Expand Up @@ -570,6 +571,21 @@ def add_personal_income_variables(
cps["other_medical_expenses"] = person.PMED_VAL
cps["medicare_part_b_premiums"] = person.PEMCPREM

# Get QBI simulation parameters ---
yamlfilename = (
files("policyengine_us_data")
/ "datasets"
/ "puf"
/ "qbi_assumptions.yaml"
)
with open(yamlfilename, "r", encoding="utf-8") as yamlfile:
p = yaml.safe_load(yamlfile)
assert isinstance(p, dict)

rng = np.random.default_rng(seed=43)
for var, prob in p["qbi_qualification_probabilities"].items():
cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob


def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None:
SPM_RENAMES = dict(
Expand Down
11 changes: 11 additions & 0 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
"taxable_ira_distributions",
"self_employment_income",
"w2_wages_from_qualified_business",
"unadjusted_basis_qualified_property",
"business_is_sstb",
"short_term_capital_gains",
"qualified_dividend_income",
"charitable_cash_donations",
Expand Down Expand Up @@ -69,6 +71,15 @@
"unreported_payroll_tax",
"recapture_of_investment_credit",
"deductible_mortgage_interest",
"qualified_reit_and_ptp_income",
"qualified_bdc_income",
"farm_operations_income",
"estate_income_would_be_qualified",
"farm_operations_income_would_be_qualified",
"farm_rent_income_would_be_qualified",
"partnership_s_corp_income_would_be_qualified",
"rental_income_would_be_qualified",
"self_employment_income_would_be_qualified",
]


Expand Down
209 changes: 198 additions & 11 deletions policyengine_us_data/datasets/puf/puf.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,167 @@
import os
import yaml
from importlib.resources import files

from tqdm import tqdm
import numpy as np
import pandas as pd
from microdf import MicroDataFrame

from policyengine_core.data import Dataset
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us_data.datasets.puf.uprate_puf import uprate_puf
from policyengine_us_data.datasets.puf.irs_puf import IRS_PUF_2015
from policyengine_us_data.utils.uprating import (
create_policyengine_uprating_factors_table,
)
import os


# Module-level generator shared by the QBID-related draws in this file.
rng = np.random.default_rng(seed=64)

# Get Qualified Business Income simulation parameters ---
yamlfilename = (
    files("policyengine_us_data") / "datasets" / "puf" / "qbi_assumptions.yaml"
)
# Open through the resource object itself rather than the built-in open():
# importlib.resources only guarantees the Traversable interface, which also
# works when the package is not laid out as plain files on disk.
with yamlfilename.open("r", encoding="utf-8") as yamlfile:
    QBI_PARAMS = yaml.safe_load(yamlfile)
# Explicit check instead of `assert`, which is stripped under `python -O`.
if not isinstance(QBI_PARAMS, dict):
    raise TypeError(
        "qbi_assumptions.yaml must contain a top-level mapping, got "
        f"{type(QBI_PARAMS).__name__}"
    )


# Helper functions ---
def sample_bernoulli_lognormal(n, prob, log_mean, log_sigma, rng):
    """Generate a Bernoulli-lognormal mixture.

    Each of the ``n`` records is positive with probability ``prob``; positive
    records receive a lognormal amount and all others receive 0.0.

    Parameters
    ----------
    n : int
        Number of records to simulate.
    prob : float or array-like
        Probability that a record has a positive amount.
    log_mean : float
        Mean of the underlying normal distribution (log scale).
    log_sigma : float
        Standard deviation of the underlying normal distribution (log scale).
    rng : numpy.random.Generator
        Source of ALL randomness, so a seeded generator gives reproducible
        output.

    Returns
    -------
    amounts : 1-D NumPy array of length ``n``
    """
    # Draw the indicator from the caller's generator, not NumPy's global
    # state; otherwise a seeded ``rng`` would not make the result repeatable.
    positive = rng.binomial(1, prob, size=n)
    amounts = np.where(
        positive,
        rng.lognormal(mean=log_mean, sigma=log_sigma, size=n),
        0.0,
    )
    return amounts


def conditionally_sample_lognormal(flag, target_mean, log_sigma, rng):
    """Generate a lognormal conditional on a binary flag.

    Flagged records with a positive ``target_mean`` receive a lognormal draw
    whose expected value equals ``target_mean``; every other record gets 0.0.

    Parameters
    ----------
    flag : array-like of bool
        Records eligible for a draw.
    target_mean : float or array-like
        Desired arithmetic mean of the draw for each record. Zero, negative
        or NaN targets yield 0.0 rather than NaN or a runtime warning.
    log_sigma : float
        Standard deviation of the underlying normal distribution (log scale).
    rng : numpy.random.Generator
        Source of randomness.

    Returns
    -------
    NumPy array with the broadcast shape of ``flag`` and ``target_mean``.
    Each element gets its own draw, including when ``target_mean`` is scalar.
    """
    flag = np.asarray(flag, dtype=bool)
    target_mean = np.asarray(target_mean, dtype=float)

    # A lognormal mean must be strictly positive, so only these records draw.
    drawable = flag & (target_mean > 0)

    # Masked records get a placeholder mean of 1.0 so np.log never sees a
    # zero or negative value; their draws are discarded below. The array keeps
    # its full size, so the generator is consumed once per record.
    safe_mean = np.where(drawable, target_mean, 1.0)

    # E[lognormal(mu, sigma)] = exp(mu + sigma^2 / 2); solve for mu.
    mu = np.log(safe_mean) - (log_sigma**2 / 2)
    draws = rng.lognormal(mean=mu, sigma=log_sigma)
    return np.where(drawable, draws, 0.0)


def simulate_w2_and_ubia_from_puf(puf, *, seed=None, diagnostics=True):
    """
    Simulate two Section 199A guard-rail quantities for every record
      - W-2 wages paid by the business
      - Unadjusted basis immediately after acquisition (UBIA) of property

    All distribution parameters are read from the module-level
    ``QBI_PARAMS`` mapping.

    Parameters
    ----------
    puf : pandas.DataFrame
        Must contain the income columns created in your preprocessing block:
        one column per key of
        ``QBI_PARAMS["qbi_qualification_probabilities"]`` plus
        ``rental_income``.
    seed : int, optional
        For reproducible random draws.
    diagnostics : bool, default True
        Print high-level checks after the simulation runs.

    Returns
    -------
    w2_wages : 1-D NumPy array
    ubia : 1-D NumPy array
    """
    rng = np.random.default_rng(seed)

    # Extract Qualified Business Income simulation parameters
    qbi_probs = QBI_PARAMS["qbi_qualification_probabilities"]
    margin_params = QBI_PARAMS["profit_margin_distribution"]
    logit_params = QBI_PARAMS["has_employees_logit"]

    labor_params = QBI_PARAMS["labor_ratio_distribution"]
    rental_labor = labor_params["rental"]
    non_rental_labor = labor_params["non_rental"]

    rental_beta_a = rental_labor["beta_a"]
    rental_beta_b = rental_labor["beta_b"]
    rental_scale = rental_labor["scale"]

    non_rental_beta_a = non_rental_labor["beta_a"]
    non_rental_beta_b = non_rental_labor["beta_b"]
    non_rental_scale = non_rental_labor["scale"]

    depr_sigma = QBI_PARAMS["depreciation_proxy_sigma"]

    ubia_params = QBI_PARAMS["ubia_simulation"]
    ubia_multiple_of_qbi = ubia_params["multiple_of_qbi"]
    ubia_sigma = ubia_params["sigma"]

    # Estimate qualified business income as the probability-weighted sum of
    # the component income columns
    qbi = sum(
        puf[income_type] * prob for income_type, prob in qbi_probs.items()
    ).to_numpy()

    # Simulate gross receipts by drawing a profit margin
    margins = (
        rng.beta(margin_params["beta_a"], margin_params["beta_b"], qbi.size)
        * margin_params["scale"]
        + margin_params["shift"]
    )
    # Losses are floored at zero, so loss-making records get zero receipts
    revenues = np.maximum(qbi, 0) / margins

    logit = (
        logit_params["intercept"] + logit_params["slope_per_dollar"] * revenues
    )

    # Set p = 0 when simulated receipts == 0 (no revenue means no payroll)
    pr_has_employees = np.where(
        revenues == 0.0, 0.0, 1.0 / (1.0 + np.exp(-logit))
    )
    has_employees = rng.binomial(1, pr_has_employees)

    # Labor share simulation
    is_rental = puf["rental_income"].to_numpy() > 0

    # Both beta draws are taken for every record (np.where evaluates both
    # branches); the rental flag only selects which draw is kept.
    labor_ratios = np.where(
        is_rental,
        rng.beta(rental_beta_a, rental_beta_b, qbi.size) * rental_scale,
        rng.beta(non_rental_beta_a, non_rental_beta_b, qbi.size)
        * non_rental_scale,
    )

    w2_wages = revenues * labor_ratios * has_employees

    # A depreciation stand-in that scales with rents
    depreciation_proxy = conditionally_sample_lognormal(
        is_rental,
        puf["rental_income"],
        depr_sigma,
        rng,
    )

    # UBIA simulation: lognormal, but only for capital-heavy records
    is_capital_intensive = is_rental | (depreciation_proxy > 0)

    ubia = conditionally_sample_lognormal(
        is_capital_intensive,
        ubia_multiple_of_qbi * np.maximum(qbi, 0),
        ubia_sigma,
        rng,
    )

    if diagnostics:
        has_qbi = qbi > 0
        share_qbi_pos = np.mean(has_qbi)
        # The label says "Among those", so condition on QBI > 0 instead of
        # reporting the joint share over all records. Guard the empty case to
        # avoid a mean over an empty slice.
        share_wages = (
            np.mean(w2_wages[has_qbi] > 0) if has_qbi.any() else 0.0
        )
        print(f"Share with QBI > 0: {share_qbi_pos:6.2%}")
        print(f"Among those, share with W-2 wages: {share_wages:6.2%}")
        if np.any(w2_wages > 0):
            print(f"Mean W-2 (if >0): ${np.mean(w2_wages[w2_wages>0]):,.0f}")
        if np.any(ubia > 0):
            print(f"Median UBIA (if >0): ${np.median(ubia[ubia>0]):,.0f}")

    return w2_wages, ubia


def impute_pension_contributions_to_puf(puf_df):
from policyengine_us import Microsimulation
Expand Down Expand Up @@ -154,8 +303,8 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
puf["educator_expense"] = puf.E03220
puf["employment_income"] = puf.E00200
puf["estate_income"] = puf.E26390 - puf.E26400
# Schedule J, separate from QBI
puf["farm_income"] = puf.T27800
puf["farm_rent_income"] = puf.E27200
puf["health_savings_account_ald"] = puf.E03290
puf["interest_deduction"] = puf.E19200
puf["long_term_capital_gains"] = puf.P23250
Expand All @@ -170,11 +319,21 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
# that can be deducted under the miscellaneous deduction.
puf["unreimbursed_business_employee_expenses"] = puf.E20400
puf["non_qualified_dividend_income"] = puf.E00600 - puf.E00650
puf["partnership_s_corp_income"] = puf.E26270
puf["qualified_dividend_income"] = puf.E00650
puf["qualified_tuition_expenses"] = puf.E03230
puf["real_estate_taxes"] = puf.E18500
# Schedule E rent and royalty
puf["rental_income"] = puf.E25850 - puf.E25860
# Schedule E active S-Corp income
s_corp_income = puf.E26190 - puf.E26180
# Schedule E active partnership income
partnership_income = puf.E25980 - puf.E25960
puf["partnership_s_corp_income"] = s_corp_income + partnership_income
# Schedule F active farming operations
puf["farm_operations_income"] = puf.E02100
# Schedule E farm rental income
puf["farm_rent_income"] = puf.E27200
# Schedule C Sole Proprietorship
puf["self_employment_income"] = puf.E00900
puf["self_employed_health_insurance_ald"] = puf.E03270
puf["self_employed_pension_contribution_ald"] = puf.E03300
Expand Down Expand Up @@ -211,15 +370,38 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
# Ignore f2441 (AMT form attached)
# Ignore cmbtp (estimate of AMT income not in AGI)
# Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
# 10.1% passthrough rate for W2 wages hits the JCT tax expenditure target for QBID
# https://gist.github.com/nikhilwoodruff/262c80b8b17935d6fb8544647143b854
W2_WAGES_SCALE = 0.101
puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE

# Remove aggregate records
puf = puf[puf.MARS != 0]
# --- Qualified Business Income Deduction (QBID) simulation ---
w2, ubia = simulate_w2_and_ubia_from_puf(puf, seed=42)
puf["w2_wages_from_qualified_business"] = w2
puf["unadjusted_basis_qualified_property"] = ubia

puf_qbi_sources_for_sstb = puf[QBI_PARAMS["sstb_prob_map_by_name"].keys()]
largest_qbi_source_name = puf_qbi_sources_for_sstb.idxmax(axis=1)

pr_sstb = largest_qbi_source_name.map(
QBI_PARAMS["sstb_prob_map_by_name"]
).fillna(0.0)
puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb)

reit_params = QBI_PARAMS["reit_ptp_income_distribution"]
p_reit_ptp = reit_params["probability_of_receiving"]
mu_reit_ptp = reit_params["log_normal_mu"]
sigma_reit_ptp = reit_params["log_normal_sigma"]

puf["qualified_reit_and_ptp_income"] = sample_bernoulli_lognormal(
len(puf), p_reit_ptp, mu_reit_ptp, sigma_reit_ptp, rng
)

bdc_params = QBI_PARAMS["bdc_income_distribution"]
p_bdc = bdc_params["probability_of_receiving"]
mu_bdc = bdc_params["log_normal_mu"]
sigma_bdc = bdc_params["log_normal_sigma"]

puf["qualified_bdc_income"] = sample_bernoulli_lognormal(
len(puf), p_bdc, mu_bdc, sigma_bdc, rng
)
# -------- End of Qualified Business Income Deduction (QBID) -------
puf["filing_status"] = puf.MARS.map(
{
1: "SINGLE",
Expand Down Expand Up @@ -248,6 +430,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
"educator_expense",
"employment_income",
"estate_income",
"farm_operations_income",
"farm_income",
"farm_rent_income",
"health_savings_account_ald",
Expand All @@ -257,7 +440,6 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
"unreimbursed_business_employee_expenses",
"non_qualified_dividend_income",
"non_sch_d_capital_gains",
"partnership_s_corp_income",
"qualified_dividend_income",
"qualified_tuition_expenses",
"real_estate_taxes",
Expand Down Expand Up @@ -293,7 +475,12 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
"unreported_payroll_tax",
"pre_tax_contributions",
"w2_wages_from_qualified_business",
"unadjusted_basis_qualified_property",
"business_is_sstb",
"deductible_mortgage_interest",
"partnership_s_corp_income",
"qualified_reit_and_ptp_income",
"qualified_bdc_income",
]


Expand Down
Loading
Loading