diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 350589e5..6b86bdb1 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -138,9 +138,10 @@ def generate(self): data = sim.dataset.load_dataset() data["household_weight"] = {} original_weights = sim.calculate("household_weight") - original_weights = original_weights.values + np.random.normal( - 1, 0.1, len(original_weights) - ) + # TODO: make the call + #original_weights = original_weights.values + np.random.normal( + # 1, 0.1, len(original_weights) + #) for year in range(self.start_year, self.end_year + 1): loss_matrix, targets_array = build_loss_matrix( self.input_dataset, year diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 691c9b4b..380fdb16 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -26,6 +26,9 @@ "taxable_ira_distributions", "self_employment_income", "w2_wages_from_qualified_business", + "qbi", # TODO: temporary + "unadjusted_basis_qualified_property", + "business_is_sstb", "short_term_capital_gains", "qualified_dividend_income", "charitable_cash_donations", @@ -121,33 +124,35 @@ def generate(self): cps_sim = Microsimulation(dataset=self.cps) data = cps_sim.dataset.load_dataset() new_data = {} - - for variable in list(data) + IMPUTED_VARIABLES: + + # My simplification of the process - only using the CPS with imputed PUF variables + for variable in list(data) + IMPUTED_VARIABLES: # data is cps variables, IMPUTED variables are from PUF + # 0. Say variable is from the PUF, so it's in IMPUTED_VARIABLES and not data (which is CPS) variable_metadata = cps_sim.tax_benefit_system.variables.get( variable ) if variable in data: values = data[variable][...] 
- else: - values = cps_sim.calculate(variable).values + #else: + # values = cps_sim.calculate(variable).values # 1. since puf imputed var is not in cps_sim, these are all zeros if variable in IMPUTED_VARIABLES: pred_values = y[variable].values entity = variable_metadata.entity.key if entity != "person": pred_values = cps_sim.populations[ entity - ].value_from_first_person(pred_values) - values = np.concatenate([values, pred_values]) - elif variable == "person_id": - values = np.concatenate([values, values + values.max()]) - elif "_id" in variable: - values = np.concatenate([values, values + values.max()]) - elif "_weight" in variable: - values = np.concatenate([values, values * 0]) - else: - values = np.concatenate([values, values]) + ].value_from_first_person(pred_values) # should this ever be a sum, depending on the variable? + values = pred_values #np.concatenate([values, pred_values]) # 2. But pred values won't be zero, so you'll have 0s and non-zeros + #elif variable == "person_id": + # values = np.concatenate([values, values + values.max()]) + #elif "_id" in variable: + # values = np.concatenate([values, values + values.max()]) + #elif "_weight" in variable: + # values = np.concatenate([values, values * 0]) # 3. weights are zero when any imputed var is non-zero + #else: + # values = np.concatenate([values, values]) new_data[variable] = { - self.time_period: values, + self.time_period: values, # e.g., {2024: array([...])} } self.save_dataset(new_data) diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 9b73a148..c3e99c82 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -133,7 +133,7 @@ def decode_age_dependent(age_range: int) -> int: def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: # Add variable renames - puf.S006 = puf.S006 / 100 + puf.S006 = puf.S006 / 100 # "The decimal place is implied." 
Docs say to divide by 100 # puf["adjusted_gross_income"] = puf.E00100 puf["alimony_expense"] = puf.E03500 puf["alimony_income"] = puf.E00800 @@ -203,11 +203,173 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: # Ignore f2441 (AMT form attached) # Ignore cmbtp (estimate of AMT income not in AGI) # Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income) - qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200) + qbi = np.maximum(0, + puf.E00900 # Business or profession (Schedule C) net profit/loss (+/-) + + puf.E26270 # Combined partnership and S corporation net income/loss (Schedule K-1) + + puf.E02100 # Schedule F net profit/loss + + puf.E27200 # Farm rent net income or loss + + puf.E02000 # Schedule E net profit/loss (rent, royalty, trust, pass-through investment / business income) + ) # 10.1% passthrough rate for W2 wages hits the JCT tax expenditure target for QBID # https://gist.github.com/nikhilwoodruff/262c80b8b17935d6fb8544647143b854 - W2_WAGES_SCALE = 0.101 - puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE + + regr_df = pd.DataFrame({ + 'y_sched_c_00900': puf.E00900, + 'y_sched_e_02000': puf.E02000, + 'y_sched_f_02100': puf.E02100, + 'y_sched_k1_26270': puf.E26270, + 'x_farm_rent_27200': puf.E27200, + 'x_rent_royalty_inc_loss_25700': puf.P25700, + 'x_rent_royalty_inc_25850': puf.E25850, # Strictly positive + 'x_rent_royalty_loss_25860': puf.E25860, # Strictly positive + 'x_estate_income_26390': puf.E26390, + 'x_estate_loss_26400': puf.E26400, + # End of the variables Max mentioned + 'z_health_insurance_deduction_03270': puf.E03270, # apparently this is relevant to QBI + 'x_total_partnership_passive_income_25940': puf.E25940, # Should not count + 'x_total_partnership_nonpassive_income_25980': puf.E25980, # Counts towards QBI + 'x_total_partnership_passive_loss_25920': puf.E25920, # Should not count towards QBI + 'x_total_partnership_nonpassive_loss_25960': puf.E25960, # 
Counts towards QBI + 'z_partnership_sec179_deduction_26110': puf.E26110, # Some of it will count + 'x_smallbiz_total_passive_income_26170': puf.E26170, # Should not count + 'x_smallbiz_total_nonpassive_income_26190': puf.E26190, # Should count + 'x_smallbiz_total_passive_loss_26160': puf.E26160, # Should not count + 'x_smallbiz_total_nonpassive_loss_26180': puf.E26180, # Should count + 'z_smallbiz_sec179_deduction_26100': puf.E26100 # Some of it will count + }) + + + regr_df.x_rent_royalty_inc_25850 + regr_df.x_rent_royalty_loss_25860 + regr_df.x_rent_royalty_inc_loss_25700 + np.corrcoef(regr_df.x_rent_royalty_inc_loss_25700, + regr_df.x_rent_royalty_inc_25850 - regr_df.x_rent_royalty_loss_25860) + + #'y_sched_c_00900' + #'y_sched_e_02000' + #'y_sched_f_02100' + #'y_sched_k1_26270' + + y_variable_to_regress = 'y_sched_e_02000' + x_predictor_variables = [ + 'x_farm_rent_27200', + 'x_rent_royalty_inc_loss_25700', + 'x_rent_royalty_inc_25850', + 'x_rent_royalty_loss_25860', + 'x_estate_income_26390', + 'x_estate_loss_26400', + 'x_total_partnership_passive_income_25940', + 'x_total_partnership_nonpassive_income_25980', + 'x_total_partnership_passive_loss_25920', + 'x_total_partnership_nonpassive_loss_25960', + 'x_smallbiz_total_passive_income_26170', + 'x_smallbiz_total_nonpassive_income_26190', + 'x_smallbiz_total_passive_loss_26160', + 'x_smallbiz_total_nonpassive_loss_26180' + ] + Y_target = regr_df[y_variable_to_regress] + + import statsmodels.api as sm + X_data = regr_df[x_predictor_variables].copy() + X_data_with_const = sm.add_constant(X_data, has_constant='add') + + model = sm.OLS(Y_target, X_data_with_const, missing='drop') + results = model.fit() + print(f"--------Y: {y_variable_to_regress} ----------") + results.summary() + + # wages simulation + MIN_MARGIN = .03 # Minimum profit margin + MAX_MARGIN = .15 # Maximum profit margin + + MIN_LABOR_RATIO = 0.15 # 15% of revenue goes to W2 wages at minimum + MAX_LABOR_RATIO = 0.35 # 35% of revenue goes to W2 wages 
at maximum + + margins = MIN_MARGIN + (MAX_MARGIN - MIN_MARGIN) * np.random.beta(2, 2, size=qbi.shape[0]) + revenues = qbi / margins + #noise_factor = np.random.normal(1, 0.1, size=qbi.shape[0]) + #revenues = revenues * noise_factor + labor_ratios = MIN_LABOR_RATIO + (MAX_LABOR_RATIO - MIN_LABOR_RATIO) * np.random.beta(2, 2, size=revenues.shape[0]) + hypothetical_w2_gross_income = revenues * labor_ratios + + pr_has_w2_employees = 1 / (1 + np.exp(-0.5E-5 * (revenues - 4E5))) + # p_df = pd.DataFrame({'r': revenues, 'p': pr_has_w2_employees}) + # p_df.loc[(p_df.r > 8E5) & (p_df.r < 9E5)] + has_w2_employees = np.random.binomial(n=1, p=pr_has_w2_employees) + + puf["w2_wages_from_qualified_business"] = 200000 #hypothetical_w2_gross_income * has_w2_employees + + # TODO: remove eventually (I think) + puf["qbi"] = 100000 #qbi + + #W2_WAGES_SCALE = 0.101 + #puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE + + # Unadjusted Basis Qualified Property (UBIA - IA stands for "Immediately after acquisition") + hypothetical_ubia = np.maximum(0, -2.2E4 + 3.0E-1 * revenues + 4E4 * np.random.normal(size=len(pr_has_w2_employees))) + + pr_has_qualified_property = np.repeat(.75, len(has_w2_employees)) + has_qualified_property = np.random.binomial(n=1, p=pr_has_qualified_property) + + puf["unadjusted_basis_qualified_property"] = 2E6 #hypothetical_ubia * has_qualified_property + + largest_qbi_source = np.argmax(puf[["E00900", "E02000", "E02100", "E26270"]], axis=1) + largest_qbi_source = np.where(qbi <= 0, -1, largest_qbi_source) + + pr_sstb = np.where(largest_qbi_source == -1, 0, + np.where(largest_qbi_source == 0, 0.40, # Schedule C + np.where(largest_qbi_source == 1, 0.03, # Schedule E + np.where(largest_qbi_source == 2, 0.30, # Schedule F + np.where(largest_qbi_source == 3, 0.005, # Schedule K-1 + largest_qbi_source))))) + + puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb) + + + def estimate_ubia_from_depreciation(depreciation_amount, business_type=None): + """ 
+ Estimate the Unadjusted Basis Immediately After Acquisition (UBIA) + from annual depreciation expense. + + Parameters: + ----------- + depreciation_amount : float + The total depreciation amount (E25550) + business_type : str, optional + Type of business if known (affects avg property life assumption) + + Returns: + -------- + float : Estimated UBIA value + """ + # Default assumptions if no business type is provided + avg_property_life = 7 # Average property life in years (typical for business equipment) + qualified_property_ratio = 0.8 # Assume 80% of depreciable property qualifies for UBIA + + # Adjust assumptions based on business type + if business_type == 'real_estate': + avg_property_life = 27.5 # Residential real estate + qualified_property_ratio = 0.95 # Higher qualification ratio + elif business_type == 'manufacturing': + avg_property_life = 10 # Manufacturing equipment + qualified_property_ratio = 0.85 + elif business_type == 'service': + avg_property_life = 5 # Service businesses + qualified_property_ratio = 0.7 # Lower equipment ratio + + # Simple straight-line depreciation formula reversal + # Annual Depreciation = Original Cost / Useful Life + # Therefore: Original Cost = Annual Depreciation * Useful Life + estimated_total_property_basis = depreciation_amount * avg_property_life + + # Apply ratio to get qualified property only + estimated_ubia = estimated_total_property_basis * qualified_property_ratio + + return estimated_ubia + + # Apply the function to your data + # Example usage: + puf['unadjusted_basis_qualified_property'] = puf.E25550.apply(estimate_ubia_from_depreciation) # Remove aggregate records puf = puf[puf.MARS != 0] @@ -285,6 +447,9 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: "unreported_payroll_tax", "pre_tax_contributions", "w2_wages_from_qualified_business", + "qbi", # TODO: temporary + "unadjusted_basis_qualified_property", + "business_is_sstb", "deductible_mortgage_interest", ] diff --git 
a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 76884a03..011fb8ca 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -1,5 +1,5 @@ import pandas as pd -from .soi import pe_to_soi, get_soi +from policyengine_us_data.utils.soi import pe_to_soi, get_soi import numpy as np from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_core.reforms import Reform @@ -394,7 +394,8 @@ def _add_tax_expenditure_targets( "medical_expense_deduction": 11.4e9, "charitable_deduction": 65.301e9, "interest_deduction": 24.8e9, - "qualified_business_income_deduction": 63.1e9, + # TODO + #"qualified_business_income_deduction": 63.1e9, } def make_repeal_class(deduction_var):