Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
3cf6658
qbid inputs in puf
baogorek May 19, 2025
cc249f8
Getting the new variables in all files
baogorek May 19, 2025
3be9ebd
checking on the variables I deleted
baogorek May 21, 2025
1722f02
first pass at new variables
baogorek May 22, 2025
c33e3bf
fixing issues, bringing original variables back in some cases
baogorek May 22, 2025
6f01054
Use per-component probabilities for QBI qualification
baogorek May 27, 2025
a30ff1a
Merge pull request #274 from PolicyEngine/codex/add-would_be_qualifie…
baogorek May 27, 2025
df63e10
new percentages
baogorek May 27, 2025
5c793cf
cps proportions
baogorek May 27, 2025
4cf5804
adding better simulations in IRS/PUF data
baogorek May 27, 2025
9942f34
dataset updates with _would_be formulas added
baogorek May 27, 2025
f1d5052
uncommenting code
baogorek May 28, 2025
47a6322
Format CPS and PUF datasets with Black
baogorek May 28, 2025
6e78204
Merge pull request #277 from PolicyEngine/codex/fix-code-style-with-b…
baogorek May 28, 2025
8acb06a
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek May 28, 2025
7f76edd
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek May 29, 2025
1596072
Add changelog entry check workflow
baogorek May 30, 2025
0a22ac0
Merge pull request #280 from PolicyEngine/codex/add-github-action-for…
baogorek May 30, 2025
a6878b6
added a changelog entry
baogorek May 30, 2025
17e7e8f
removing uprating factors exemption from .gitignore
baogorek May 30, 2025
f1c2927
moved dictionary of values to utils. improving formulas and documenta…
baogorek May 31, 2025
dff8acb
slightly modified w2 wages and ubia simulation
baogorek May 31, 2025
806c745
turning O3 link into actual share link
baogorek May 31, 2025
13b8337
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek Jun 17, 2025
7be076b
adding to assumptions and merging in main branch
baogorek Jun 17, 2025
12a3f5f
parameters in yaml
baogorek Jun 17, 2025
3c10e5b
Update policyengine_us_data/datasets/puf/puf.py
baogorek Jun 17, 2025
1c07561
Revert "Update policyengine_us_data/datasets/puf/puf.py"
baogorek Jun 17, 2025
01646f9
parameters in yaml file
baogorek Jun 17, 2025
d6eacfd
Merge branch 'main' of github.com:PolicyEngine/policyengine-us-data i…
baogorek Jun 19, 2025
fe669fc
assumptions and encapsulation of wage and ubia simulation
baogorek Jun 19, 2025
155a3ee
left out rng
baogorek Jun 19, 2025
bebd0f3
left out rng again
baogorek Jun 19, 2025
de0ef90
yaml comment
baogorek Jun 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/check-changelog-entry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

# Changelog gate: succeed only when changelog_entry.yaml exists with a
# non-zero size and holds at least one non-whitespace character.
entry_file="changelog_entry.yaml"

if [ -s "$entry_file" ] && grep -q '[^[:space:]]' "$entry_file"; then
    exit 0
fi

echo "changelog_entry.yaml is empty. Please add a changelog entry before merging."
exit 1
14 changes: 14 additions & 0 deletions .github/workflows/pr_changelog.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: Changelog entry
on:
pull_request:
branches: [main]

jobs:
require-entry:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Ensure changelog entry exists
run: .github/check-changelog-entry.sh
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
**/.DS_STORE
**/*.h5
**/*.csv
!uprating_factors.csv
!uprating_growth_factors.csv
!healthcare_spending.csv
!eitc.csv
!spm_threshold_agi.csv
Expand Down
11 changes: 11 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
- bump: minor
changes:
added:
- Added automated checks for changelog entry
- New "would be qualified income" variables simulated
- REIT, PTP, and BDC dividend income variables simulated
- UBIA property is being simulated
- Farm Operations Income added
changed:
- W-2 wages from qualified businesses are now simulated with random variables
- qualified business income sources have been redefined based on IRS PUF inputs
15 changes: 15 additions & 0 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from policyengine_us_data.utils import QRF
import logging


test_lite = os.environ.get("TEST_LITE")


Expand Down Expand Up @@ -730,6 +731,20 @@ def add_personal_income_variables(
cps["other_medical_expenses"] = person.PMED_VAL
cps["medicare_part_b_premiums"] = person.PEMCPREM

# Discussion #237, O3 chat: https://chatgpt.com/share/6823cb37-7a28-8001-b2bb-0c0a7f47401c
QBI_QUALIFICATION_PROBABILITIES = {
"self_employment_income": 0.8,
"farm_operations_income": 0.95,
"farm_rent_income": 0.5,
"rental_income": 0.4,
"estate_income": 0.5,
"partnership_s_corp_income": 0.85,
}

rng = np.random.default_rng(seed=43)
for var, prob in QBI_QUALIFICATION_PROBABILITIES.items():
cps[f"{var}_would_be_qualified"] = rng.random(len(person)) < prob


def add_spm_variables(cps: h5py.File, spm_unit: DataFrame) -> None:
SPM_RENAMES = dict(
Expand Down
11 changes: 11 additions & 0 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
"taxable_ira_distributions",
"self_employment_income",
"w2_wages_from_qualified_business",
"unadjusted_basis_qualified_property",
"business_is_sstb",
"short_term_capital_gains",
"qualified_dividend_income",
"charitable_cash_donations",
Expand Down Expand Up @@ -69,6 +71,15 @@
"unreported_payroll_tax",
"recapture_of_investment_credit",
"deductible_mortgage_interest",
"qualified_reit_and_ptp_income",
"qualified_bdc_income",
"farm_operations_income",
"estate_income_would_be_qualified",
"farm_operations_income_would_be_qualified",
"farm_rent_income_would_be_qualified",
"partnership_s_corp_income_would_be_qualified",
"rental_income_would_be_qualified",
"self_employment_income_would_be_qualified",
]


Expand Down
181 changes: 168 additions & 13 deletions policyengine_us_data/datasets/puf/puf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,120 @@
rng = np.random.default_rng(seed=64)


def lognormal_sample(n, prob, mu, sigma, *, rng=None):
    """Generate a Bernoulli-lognormal mixture.

    Each of the ``n`` values is zero unless its Bernoulli draw (success
    probability ``prob``) equals one, in which case it is a lognormal
    variate whose underlying normal has mean ``mu`` and standard
    deviation ``sigma``.

    Parameters
    ----------
    n : int
        Number of values to draw.
    prob : float
        Probability that a value is positive rather than zero.
    mu : float
        Mean of the underlying normal distribution.
    sigma : float
        Standard deviation of the underlying normal distribution.
    rng : numpy.random.Generator, optional
        Source of randomness. Pass a seeded generator to make the draws
        reproducible. When omitted, NumPy's global random state is used,
        which is what this function did before the parameter existed.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n``.
    """
    # The ``np.random`` module and ``Generator`` both expose ``binomial``
    # and ``lognormal`` with compatible signatures, so either can be used.
    source = np.random if rng is None else rng

    # Draw order (indicator first, then amounts) is kept from the original.
    positive = source.binomial(1, prob, size=n)
    draws = source.lognormal(mean=mu, sigma=sigma, size=n)
    return np.where(positive == 1, draws, 0.0)


def simulate_w2_and_ubia_from_puf(puf, *, seed=None, diagnostics=True):
    """
    Simulate two Section 199A guard-rail quantities for every record
      • W-2 wages paid by the business
      • Unadjusted basis immediately after acquisition (UBIA) of property

    Simulation help from https://chatgpt.com/c/6835f838-a2b0-8006-ba95-c9187f2477ad

    Parameters
    ----------
    puf : pandas.DataFrame
        Must contain the columns ``self_employment_income``,
        ``farm_operations_income``, ``farm_rent_income``, ``rental_income``,
        ``estate_income`` and ``partnership_s_corp_income``.
    seed : int, optional
        For reproducible random draws.
    diagnostics : bool, default True
        Print high-level checks after the simulation runs.

    Returns
    -------
    w2_wages : 1-D NumPy array
        Simulated W-2 wages; zero wherever QBI is not positive or the
        record is drawn as having no employees.
    ubia : 1-D NumPy array
        Simulated UBIA; zero for records without positive rental income.
    """

    # ––––––––––––––––– 0. Setup –––––––––––––––––––––––––––––––––––––––––––
    # NOTE: the order of the rng calls below determines the output for a
    # given seed; do not reorder or remove draws without intending to
    # change the simulated values.
    rng = np.random.default_rng(seed)

    # 1. Qualified business income ------------------------------------------
    qbi = (
        puf["self_employment_income"]
        + puf["farm_operations_income"]
        + puf["farm_rent_income"]
        + puf["rental_income"]
        + puf["estate_income"]
        + puf["partnership_s_corp_income"]
    ).to_numpy()

    # Replace NaNs with 0 so later math does not propagate missing values
    qbi = np.nan_to_num(qbi, copy=False)
    positive_qbi = np.maximum(qbi, 0)  # losses are treated as zero QBI

    # 2. Simulate gross receipts by drawing a profit margin -----------------
    margins = (
        rng.beta(2, 3, qbi.size) * (0.25 - 0.05) + 0.05
    )  # 5 – 25 %, μ≈12 %
    revenues = positive_qbi / margins

    # 3. Probability the filer has employees (Census NES: ~14 % of
    #    pass-throughs); logistic in simulated revenues --------------------
    logit = -2.2 + 1.2e-6 * revenues
    pr_has_employees = 1 / (1 + np.exp(-logit))
    has_employees = rng.binomial(1, pr_has_employees)

    # 4. Draw a labor share; lower for rental/real-estate, higher for
    #    operating businesses ----------------------------------------------
    rental_income = puf["rental_income"].to_numpy()
    is_rental = rental_income > 0

    labor_ratios = np.where(
        is_rental,
        rng.beta(1.5, 8, qbi.size) * 0.08,  # peak 4–6 % of receipts
        rng.beta(2.0, 2, qbi.size) * 0.25,  # peak 12–18 %
    )

    w2_wages = revenues * labor_ratios * has_employees

    # 5. A simple depreciation proxy (only needed to flag capital-intensive
    #    firms); a rough stand-in that scales with rents. It is non-zero
    #    only for rental records, so the flag in step 6 currently equals
    #    ``is_rental``. The draw is kept so that seeded results do not shift.
    depreciation_proxy = np.where(
        is_rental,
        rng.lognormal(
            mean=np.log(np.abs(rental_income) + 1.0),
            sigma=0.8,
        ),
        0.0,
    )

    # 6. UBIA simulation – log-normal, but only for capital-heavy records ---
    is_capital_intensive = is_rental | (depreciation_proxy > 0)

    ubia = np.where(
        is_capital_intensive,
        rng.lognormal(mean=np.log(4 * positive_qbi + 1.0), sigma=1.0),
        0.0,
    )

    # Trim crazy outliers so UBIA does not dominate QBI limits
    ubia = np.minimum(ubia, 20 * np.abs(qbi))

    # 7. Quick plausibility checks ------------------------------------------
    if diagnostics:
        has_qbi = qbi > 0
        share_qbi_pos = has_qbi.mean() if qbi.size else 0.0
        # Conditional share: restrict to records with positive QBI so the
        # printed number matches its "Among those" label.
        share_wages = (
            np.mean(w2_wages[has_qbi] > 0) if has_qbi.any() else 0.0
        )
        print(f"• Share with QBI > 0 : {share_qbi_pos:6.2%}")
        print(f"• Among those, share with W-2 wages : {share_wages:6.2%}")
        if np.any(w2_wages > 0):
            print(
                f"• Mean W-2 (if >0) : ${np.mean(w2_wages[w2_wages>0]):,.0f}"
            )
        if np.any(ubia > 0):
            print(
                f"• Median UBIA (if >0) : ${np.median(ubia[ubia>0]):,.0f}"
            )

    return w2_wages, ubia


def impute_pension_contributions_to_puf(puf_df):
from policyengine_us import Microsimulation
from policyengine_us_data.datasets.cps import CPS_2021
Expand Down Expand Up @@ -154,8 +268,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
puf["educator_expense"] = puf.E03220
puf["employment_income"] = puf.E00200
puf["estate_income"] = puf.E26390 - puf.E26400
puf["farm_income"] = puf.T27800
puf["farm_rent_income"] = puf.E27200
puf["farm_income"] = puf.T27800 # Schedule J, separate from QBI
puf["health_savings_account_ald"] = puf.E03290
puf["interest_deduction"] = puf.E19200
puf["long_term_capital_gains"] = puf.P23250
Expand All @@ -170,12 +283,24 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
# that can be deducted under the miscellaneous deduction.
puf["unreimbursed_business_employee_expenses"] = puf.E20400
puf["non_qualified_dividend_income"] = puf.E00600 - puf.E00650
puf["partnership_s_corp_income"] = puf.E26270
puf["qualified_dividend_income"] = puf.E00650
puf["qualified_tuition_expenses"] = puf.E03230
puf["real_estate_taxes"] = puf.E18500
puf["rental_income"] = puf.E25850 - puf.E25860
puf["self_employment_income"] = puf.E00900
puf["rental_income"] = (
puf.E25850 - puf.E25860
) # Schedule E rent and royalty
s_corp_income = puf.E26190 - puf.E26180 # Schedule E active S-Corp income
partnership_income = (
puf.E25980 - puf.E25960
) # Schedule E active partnership income
puf["partnership_s_corp_income"] = s_corp_income + partnership_income
puf["farm_operations_income"] = (
puf.E02100
) # Schedule F active farming operations
puf["farm_rent_income"] = puf.E27200 # Schedule E farm rental income
puf["self_employment_income"] = (
puf.E00900
) # Schedule C Sole Proprietorship
puf["self_employed_health_insurance_ald"] = puf.E03270
puf["self_employed_pension_contribution_ald"] = puf.E03300
puf["short_term_capital_gains"] = puf.P22250
Expand Down Expand Up @@ -211,15 +336,40 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
# Ignore f2441 (AMT form attached)
# Ignore cmbtp (estimate of AMT income not in AGI)
# Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
# 10.1% passthrough rate for W2 wages hits the JCT tax expenditure target for QBID
# https://gist.github.com/nikhilwoodruff/262c80b8b17935d6fb8544647143b854
W2_WAGES_SCALE = 0.101
puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE

# Remove aggregate records
puf = puf[puf.MARS != 0]
# --- Qualified Business Income Deduction (QBID) simulation ---
w2, ubia = simulate_w2_and_ubia_from_puf(puf, seed=42)
puf["w2_wages_from_qualified_business"] = w2
puf["unadjusted_basis_qualified_property"] = ubia

# Discussion #237, O3 chat: https://chatgpt.com/share/6823cb37-7a28-8001-b2bb-0c0a7f47401c
sstb_prob_map_by_name = {
"E00900": 0.20,
"E26270": 0.15,
"E26390": 0.10,
"E26400": 0.10,
}

puf_qbi_sources_for_sstb = puf[sstb_prob_map_by_name.keys()]
largest_qbi_source_name = puf_qbi_sources_for_sstb.idxmax(axis=1)

pr_sstb = largest_qbi_source_name.map(sstb_prob_map_by_name).fillna(0.0)
puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb)

# REIT and BDC income: chatgpt.com/c/6835f502-5b48-8006-833a-76170a0acd40
p_reit_ptp = 0.07 # 7 % with income > 0
mu_reit_ptp, sigma_reit_ptp = 8.04, 1.20
puf["qualified_reit_and_ptp_income"] = lognormal_sample(
len(puf), p_reit_ptp, mu_reit_ptp, sigma_reit_ptp
)

# Business-development-company dividends
p_bdc = 0.003 # 0.3 % with income > 0
mu_bdc, sigma_bdc = 8.71, 1.00
puf["qualified_bdc_income"] = lognormal_sample(
len(puf), p_bdc, mu_bdc, sigma_bdc
)
# -------- End of QBID -------
puf["filing_status"] = puf.MARS.map(
{
1: "SINGLE",
Expand Down Expand Up @@ -248,6 +398,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
"educator_expense",
"employment_income",
"estate_income",
"farm_operations_income",
"farm_income",
"farm_rent_income",
"health_savings_account_ald",
Expand All @@ -257,7 +408,6 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
"unreimbursed_business_employee_expenses",
"non_qualified_dividend_income",
"non_sch_d_capital_gains",
"partnership_s_corp_income",
"qualified_dividend_income",
"qualified_tuition_expenses",
"real_estate_taxes",
Expand Down Expand Up @@ -293,7 +443,12 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
"unreported_payroll_tax",
"pre_tax_contributions",
"w2_wages_from_qualified_business",
"unadjusted_basis_qualified_property",
"business_is_sstb",
"deductible_mortgage_interest",
"partnership_s_corp_income",
"qualified_reit_and_ptp_income",
"qualified_bdc_income",
]


Expand Down
Loading