Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,10 @@ def generate(self):
data = sim.dataset.load_dataset()
data["household_weight"] = {}
original_weights = sim.calculate("household_weight")
original_weights = original_weights.values + np.random.normal(
1, 0.1, len(original_weights)
)
# TODO: make the call
#original_weights = original_weights.values + np.random.normal(
# 1, 0.1, len(original_weights)
#)
for year in range(self.start_year, self.end_year + 1):
loss_matrix, targets_array = build_loss_matrix(
self.input_dataset, year
Expand Down
35 changes: 20 additions & 15 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
"taxable_ira_distributions",
"self_employment_income",
"w2_wages_from_qualified_business",
"qbi", # TODO: temporary
"unadjusted_basis_qualified_property",
"business_is_sstb",
"short_term_capital_gains",
"qualified_dividend_income",
"charitable_cash_donations",
Expand Down Expand Up @@ -121,33 +124,35 @@ def generate(self):
cps_sim = Microsimulation(dataset=self.cps)
data = cps_sim.dataset.load_dataset()
new_data = {}

for variable in list(data) + IMPUTED_VARIABLES:

# My simplification of the process - only using the CPS with imputed PUF variables
for variable in list(data) + IMPUTED_VARIABLES: # data is cps variables, IMPUTED variables are from PUF
# 0. Say variable is from the PUF, so it's in IMPUTED_VARIABLES and not data (which is CPS)
variable_metadata = cps_sim.tax_benefit_system.variables.get(
variable
)
if variable in data:
values = data[variable][...]
else:
values = cps_sim.calculate(variable).values
#else:
# values = cps_sim.calculate(variable).values # 1. since puf imputed var is not in cps_sim, these are all zeros
if variable in IMPUTED_VARIABLES:
pred_values = y[variable].values
entity = variable_metadata.entity.key
if entity != "person":
pred_values = cps_sim.populations[
entity
].value_from_first_person(pred_values)
values = np.concatenate([values, pred_values])
elif variable == "person_id":
values = np.concatenate([values, values + values.max()])
elif "_id" in variable:
values = np.concatenate([values, values + values.max()])
elif "_weight" in variable:
values = np.concatenate([values, values * 0])
else:
values = np.concatenate([values, values])
].value_from_first_person(pred_values) # should this ever be a sum, depending on the variable?
values = pred_values #np.concatenate([values, pred_values]) # 2. But pred values won't be zero, so you'll have 0s and non-zeros
#elif variable == "person_id":
# values = np.concatenate([values, values + values.max()])
#elif "_id" in variable:
# values = np.concatenate([values, values + values.max()])
#elif "_weight" in variable:
# values = np.concatenate([values, values * 0]) # 3. weights are zero when any imputed var is non-zero
#else:
# values = np.concatenate([values, values])
new_data[variable] = {
self.time_period: values,
self.time_period: values, # e.g., {2024: array([...])}
}

self.save_dataset(new_data)
Expand Down
173 changes: 169 additions & 4 deletions policyengine_us_data/datasets/puf/puf.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def decode_age_dependent(age_range: int) -> int:

def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
# Add variable renames
puf.S006 = puf.S006 / 100
puf.S006 = puf.S006 / 100 # "The decimal place is implied." Docs say to divide by 100
# puf["adjusted_gross_income"] = puf.E00100
puf["alimony_expense"] = puf.E03500
puf["alimony_income"] = puf.E00800
Expand Down Expand Up @@ -203,11 +203,173 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
# Ignore f2441 (AMT form attached)
# Ignore cmbtp (estimate of AMT income not in AGI)
# Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
qbi = np.maximum(0,
puf.E00900 # Business or profession (Schedule C) net profit/loss (+/-)
+ puf.E26270 # Combined partnership and S corporation net income/loss (Schedule K-1)
+ puf.E02100 # Schedule F net profit/loss
+ puf.E27200 # Farm rent net income or loss
+ puf.E02000 # Schedule E net profit/loss (rent, royalty, trust, pass-through investment / business income)
)
# 10.1% passthrough rate for W2 wages hits the JCT tax expenditure target for QBID
# https://gist.github.com/nikhilwoodruff/262c80b8b17935d6fb8544647143b854
W2_WAGES_SCALE = 0.101
puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE

regr_df = pd.DataFrame({
'y_sched_c_00900': puf.E00900,
'y_sched_e_02000': puf.E02000,
'y_sched_f_02100': puf.E02100,
'y_sched_k1_26270': puf.E26270,
'x_farm_rent_27200': puf.E27200,
'x_rent_royalty_inc_loss_25700': puf.P25700,
'x_rent_royalty_inc_25850': puf.E25850, # Strictly positive
'x_rent_royalty_loss_25860': puf.E25860, # Strictly positive
'x_estate_income_26390': puf.E26390,
'x_estate_loss_26400': puf.E26400,
# End of the variables Max mentioned
'z_health_insurance_deduction_03270': puf.E03270, # apparently this is relevant to QBI
'x_total_partnership_passive_income_25940': puf.E25940, # Should not count
'x_total_partnership_nonpassive_income_25980': puf.E25980, # Counts towards QBI
'x_total_partnership_passive_loss_25920': puf.E25920, # Should not count towards QBI
'x_total_partnership_nonpassive_loss_25960': puf.E25960, # Counts towards QBI
'z_partnership_sec179_deduction_26110': puf.E26110, # Some of it will count
'x_smallbiz_total_passive_income_26170': puf.E26170, # Should not count
'x_smallbiz_total_nonpassive_income_26190': puf.E26190, # Should count
'x_smallbiz_total_passive_loss_26160': puf.E26160, # Should not count
'x_smallbiz_total_nonpassive_loss_26180': puf.E26180, # Should count
'z_smallbiz_sec179_deduction_26100': puf.E26100 # Some of it will count
})


regr_df.x_rent_royalty_inc_25850
regr_df.x_rent_royalty_loss_25860
regr_df.x_rent_royalty_inc_loss_25700
np.corrcoef(regr_df.x_rent_royalty_inc_loss_25700,
regr_df.x_rent_royalty_inc_25850 - regr_df.x_rent_royalty_loss_25860)

#'y_sched_c_00900'
#'y_sched_e_02000'
#'y_sched_f_02100'
#'y_sched_k1_26270'

y_variable_to_regress = 'y_sched_e_02000'
x_predictor_variables = [
'x_farm_rent_27200',
'x_rent_royalty_inc_loss_25700',
'x_rent_royalty_inc_25850',
'x_rent_royalty_loss_25860',
'x_estate_income_26390',
'x_estate_loss_26400',
'x_total_partnership_passive_income_25940',
'x_total_partnership_nonpassive_income_25980',
'x_total_partnership_passive_loss_25920',
'x_total_partnership_nonpassive_loss_25960',
'x_smallbiz_total_passive_income_26170',
'x_smallbiz_total_nonpassive_income_26190',
'x_smallbiz_total_passive_loss_26160',
'x_smallbiz_total_nonpassive_loss_26180'
]
Y_target = regr_df[y_variable_to_regress]

import statsmodels.api as sm
X_data = regr_df[x_predictor_variables].copy()
X_data_with_const = sm.add_constant(X_data, has_constant='add')

model = sm.OLS(Y_target, X_data_with_const, missing='drop')
results = model.fit()
print(f"--------Y: {y_variable_to_regress} ----------")
results.summary()

# wages simulation
MIN_MARGIN = .03 # Minimum profit margin
MAX_MARGIN = .15 # Maximum profit margin

MIN_LABOR_RATIO = 0.15 # 15% of revenue goes to W2 wages at minimum
MAX_LABOR_RATIO = 0.35 # 35% of revenue goes to W2 wages at maximum

margins = MIN_MARGIN + (MAX_MARGIN - MIN_MARGIN) * np.random.beta(2, 2, size=qbi.shape[0])
revenues = qbi / margins
#noise_factor = np.random.normal(1, 0.1, size=qbi.shape[0])
#revenues = revenues * noise_factor
labor_ratios = MIN_LABOR_RATIO + (MAX_LABOR_RATIO - MIN_LABOR_RATIO) * np.random.beta(2, 2, size=revenues.shape[0])
hypothetical_w2_gross_income = revenues * labor_ratios

pr_has_w2_employees = 1 / (1 + np.exp(-0.5E-5 * (revenues - 4E5)))
# p_df = pd.DataFrame({'r': revenues, 'p': pr_has_w2_employees})
# p_df.loc[(p_df.r > 8E5) & (p_df.r < 9E5)]
has_w2_employees = np.random.binomial(n=1, p=pr_has_w2_employees)

puf["w2_wages_from_qualified_business"] = 200000 #hypothetical_w2_gross_income * has_w2_employees

# TODO: remove eventually (I think)
puf["qbi"] = 100000 #qbi

#W2_WAGES_SCALE = 0.101
#puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE

# Unadjusted Basis Qualified Property (UBIA - IA stands for "Immediately after acquisition")
hypothetical_ubia = np.maximum(0, -2.2E4 + 3.0E-1 * revenues + 4E4 * np.random.normal(size=len(pr_has_w2_employees)))

pr_has_qualified_property = np.repeat(.75, len(has_w2_employees))
has_qualified_property = np.random.binomial(n=1, p=pr_has_qualified_property)

puf["unadjusted_basis_qualified_property"] = 2E6 #hypothetical_ubia * has_qualified_property

largest_qbi_source = np.argmax(puf[["E00900", "E02000", "E02100", "E26270"]], axis=1)
largest_qbi_source = np.where(qbi <= 0, -1, largest_qbi_source)

pr_sstb = np.where(largest_qbi_source == -1, 0,
np.where(largest_qbi_source == 0, 0.40, # Schedule C
np.where(largest_qbi_source == 1, 0.03, # Schedule E
np.where(largest_qbi_source == 2, 0.30, # Schedule F
np.where(largest_qbi_source == 3, 0.005, # Schedule K-1
largest_qbi_source)))))

puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb)


# Assumptions used to back out a property basis from annual depreciation:
# (average property life in years, share of depreciable property assumed to
# qualify for UBIA).
_UBIA_DEFAULT_ASSUMPTIONS = (7, 0.8)  # typical business equipment
_UBIA_ASSUMPTIONS_BY_BUSINESS_TYPE = {
    "real_estate": (27.5, 0.95),  # residential real estate, higher ratio
    "manufacturing": (10, 0.85),  # manufacturing equipment
    "service": (5, 0.7),  # service businesses, lower equipment ratio
}


def estimate_ubia_from_depreciation(depreciation_amount, business_type=None):
    """
    Estimate the Unadjusted Basis Immediately After Acquisition (UBIA)
    from annual depreciation expense.

    The estimate reverses straight-line depreciation
    (annual depreciation = original cost / useful life) and then keeps only
    the share of property assumed to be qualified.

    Parameters
    ----------
    depreciation_amount : float or array-like
        The total depreciation amount (E25550). Scalars, NumPy arrays and
        pandas Series are all accepted; arrays are processed element-wise.
    business_type : str, optional
        Type of business if known ('real_estate', 'manufacturing' or
        'service'). ``None`` or any unrecognised value falls back to the
        default assumptions (7-year life, 80% qualified).

    Returns
    -------
    float or array-like
        Estimated UBIA, never negative. Scalar inputs yield a NumPy float.
    """
    avg_property_life, qualified_property_ratio = (
        _UBIA_ASSUMPTIONS_BY_BUSINESS_TYPE.get(
            business_type, _UBIA_DEFAULT_ASSUMPTIONS
        )
    )

    # Original cost = annual depreciation * useful life.
    estimated_total_property_basis = depreciation_amount * avg_property_life

    # Keep only the portion assumed to be qualified property.
    estimated_ubia = estimated_total_property_basis * qualified_property_ratio

    # A cost basis cannot be negative; negative depreciation inputs are
    # floored at zero rather than producing a negative UBIA.
    return np.maximum(0, estimated_ubia)

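# Derive UBIA from reported total depreciation (E25550). NOTE: this overwrites the
# placeholder value assigned to unadjusted_basis_qualified_property earlier in this function.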
puf['unadjusted_basis_qualified_property'] = puf.E25550.apply(estimate_ubia_from_depreciation)

# Remove aggregate records
puf = puf[puf.MARS != 0]
Expand Down Expand Up @@ -285,6 +447,9 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
"unreported_payroll_tax",
"pre_tax_contributions",
"w2_wages_from_qualified_business",
"qbi", # TODO: temporary
"unadjusted_basis_qualified_property",
"business_is_sstb",
"deductible_mortgage_interest",
]

Expand Down
5 changes: 3 additions & 2 deletions policyengine_us_data/utils/loss.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
from .soi import pe_to_soi, get_soi
from policyengine_us_data.utils.soi import pe_to_soi, get_soi
import numpy as np
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_core.reforms import Reform
Expand Down Expand Up @@ -394,7 +394,8 @@ def _add_tax_expenditure_targets(
"medical_expense_deduction": 11.4e9,
"charitable_deduction": 65.301e9,
"interest_deduction": 24.8e9,
"qualified_business_income_deduction": 63.1e9,
# TODO
#"qualified_business_income_deduction": 63.1e9,
}

def make_repeal_class(deduction_var):
Expand Down
Loading