PolicyEngine · baogorek · May 13, 2025 · May 13, 2025 · May 14, 2025 · May 15, 2025
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -138,9 +138,10 @@ def generate(self):
         data = sim.dataset.load_dataset()
         data["household_weight"] = {}
         original_weights = sim.calculate("household_weight")
-        original_weights = original_weights.values + np.random.normal(
-            1, 0.1, len(original_weights)
-        )
+        # TODO: decide whether to keep or permanently remove this random jitter on the initial weights
+        #original_weights = original_weights.values + np.random.normal(
+        #    1, 0.1, len(original_weights)
+        #)
         for year in range(self.start_year, self.end_year + 1):
             loss_matrix, targets_array = build_loss_matrix(
                 self.input_dataset, year

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -26,6 +26,9 @@
     "taxable_ira_distributions",
     "self_employment_income",
     "w2_wages_from_qualified_business",
+    "qbi", # TODO: temporary
+    "unadjusted_basis_qualified_property",
+    "business_is_sstb",
     "short_term_capital_gains",
     "qualified_dividend_income",
     "charitable_cash_donations",
@@ -121,33 +124,35 @@ def generate(self):
         cps_sim = Microsimulation(dataset=self.cps)
         data = cps_sim.dataset.load_dataset()
         new_data = {}
-
-        for variable in list(data) + IMPUTED_VARIABLES:
+
+        # My simplification of the process - only using the CPS with imputed PUF variables
+        for variable in list(data) + IMPUTED_VARIABLES:  # data is cps variables, IMPUTED variables are from PUF
+            # 0. Say variable is from the PUF, so it's in IMPUTED_VARIABLES and not data (which is CPS)
             variable_metadata = cps_sim.tax_benefit_system.variables.get(
                 variable
             )
             if variable in data:
                 values = data[variable][...]
-            else:
-                values = cps_sim.calculate(variable).values
+            #else:
+            #    values = cps_sim.calculate(variable).values   # 1. since puf imputed var is not in cps_sim, these are all zeros
             if variable in IMPUTED_VARIABLES:
                 pred_values = y[variable].values
                 entity = variable_metadata.entity.key
                 if entity != "person":
                     pred_values = cps_sim.populations[
                         entity
-                    ].value_from_first_person(pred_values)
-                values = np.concatenate([values, pred_values])
-            elif variable == "person_id":
-                values = np.concatenate([values, values + values.max()])
-            elif "_id" in variable:
-                values = np.concatenate([values, values + values.max()])
-            elif "_weight" in variable:
-                values = np.concatenate([values, values * 0])
-            else:
-                values = np.concatenate([values, values])
+                    ].value_from_first_person(pred_values)  # should this ever be a sum, depending on the variable?
+                values = pred_values #np.concatenate([values, pred_values])  # 2. But pred values won't be zero, so you'll have 0s and non-zeros
+            #elif variable == "person_id":
+            #    values = np.concatenate([values, values + values.max()])
+            #elif "_id" in variable:
+            #    values = np.concatenate([values, values + values.max()])
+            #elif "_weight" in variable:
+            #    values = np.concatenate([values, values * 0])  # 3. weights are zero when any imputed var is non-zero
+            #else:
+            #    values = np.concatenate([values, values])
             new_data[variable] = {
-                self.time_period: values,
+                    self.time_period: values,  # e.g., {2024: array([...])}
             }
 
         self.save_dataset(new_data)

diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
@@ -133,7 +133,7 @@ def decode_age_dependent(age_range: int) -> int:
 
 def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     # Add variable renames
-    puf.S006 = puf.S006 / 100
+    puf.S006 = puf.S006 / 100  # "The decimal place is implied." Docs say to divide by 100
     # puf["adjusted_gross_income"] = puf.E00100
     puf["alimony_expense"] = puf.E03500
     puf["alimony_income"] = puf.E00800
@@ -203,11 +203,173 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     # Ignore f2441 (AMT form attached)
     # Ignore cmbtp (estimate of AMT income not in AGI)
     # Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income)
-    qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200)
+    qbi = np.maximum(0,
+        puf.E00900  # Business or profession (Schedule C) net profit/loss (+/-)
+        + puf.E26270  #  Combined partnership and S corporation net income/loss (Schedule K-1)
+        + puf.E02100  #  Schedule F net profit/loss 
+        + puf.E27200  #  Farm rent net income or loss
+        + puf.E02000  #  Schedule E net profit/loss (rent, royalty, trust, pass-through investment / business income)
+    )
     # 10.1% passthrough rate for W2 wages hits the JCT tax expenditure target for QBID
     # https://gist.github.com/nikhilwoodruff/262c80b8b17935d6fb8544647143b854
-    W2_WAGES_SCALE = 0.101
-    puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE
+
+    regr_df = pd.DataFrame({
+    'y_sched_c_00900': puf.E00900,
+    'y_sched_e_02000': puf.E02000,
+    'y_sched_f_02100': puf.E02100,
+    'y_sched_k1_26270': puf.E26270,
+    'x_farm_rent_27200': puf.E27200,
+    'x_rent_royalty_inc_loss_25700': puf.P25700,
+    'x_rent_royalty_inc_25850': puf.E25850,  # Strictly positive
+    'x_rent_royalty_loss_25860': puf.E25860,  # Strictly positive 
+    'x_estate_income_26390': puf.E26390,
+    'x_estate_loss_26400': puf.E26400,
+    # End of the variables Max mentioned
+    'z_health_insurance_deduction_03270': puf.E03270,  # apparently this is relevant to QBI
+    'x_total_partnership_passive_income_25940': puf.E25940,  # Should not count
+    'x_total_partnership_nonpassive_income_25980': puf.E25980,  # Counts towards QBI
+    'x_total_partnership_passive_loss_25920': puf.E25920,  # Should not count towards QBI
+    'x_total_partnership_nonpassive_loss_25960': puf.E25960, # Counts towards QBI
+    'z_partnership_sec179_deduction_26110': puf.E26110,  # Some of it will count
+    'x_smallbiz_total_passive_income_26170': puf.E26170, # Should not count
+    'x_smallbiz_total_nonpassive_income_26190': puf.E26190, # Should count
+    'x_smallbiz_total_passive_loss_26160': puf.E26160, # Should not count
+    'x_smallbiz_total_nonpassive_loss_26180': puf.E26180, # Should count
+    'z_smallbiz_sec179_deduction_26100': puf.E26100  # Some of it will count 
+    })
+
+
+    regr_df.x_rent_royalty_inc_25850
+    regr_df.x_rent_royalty_loss_25860
+    regr_df.x_rent_royalty_inc_loss_25700
+    np.corrcoef(regr_df.x_rent_royalty_inc_loss_25700,
+                regr_df.x_rent_royalty_inc_25850 - regr_df.x_rent_royalty_loss_25860)
+
+    #'y_sched_c_00900'
+    #'y_sched_e_02000'
+    #'y_sched_f_02100'
+    #'y_sched_k1_26270'
+
+    y_variable_to_regress = 'y_sched_e_02000'
+    x_predictor_variables = [
+        'x_farm_rent_27200',
+        'x_rent_royalty_inc_loss_25700',
+        'x_rent_royalty_inc_25850',
+        'x_rent_royalty_loss_25860',
+        'x_estate_income_26390',
+        'x_estate_loss_26400',
+        'x_total_partnership_passive_income_25940',
+        'x_total_partnership_nonpassive_income_25980',
+        'x_total_partnership_passive_loss_25920',
+        'x_total_partnership_nonpassive_loss_25960',
+        'x_smallbiz_total_passive_income_26170',
+        'x_smallbiz_total_nonpassive_income_26190',
+        'x_smallbiz_total_passive_loss_26160',
+        'x_smallbiz_total_nonpassive_loss_26180'
+    ]
+    Y_target = regr_df[y_variable_to_regress]
+
+    import statsmodels.api as sm
+    X_data = regr_df[x_predictor_variables].copy()
+    X_data_with_const = sm.add_constant(X_data, has_constant='add')
+
+    model = sm.OLS(Y_target, X_data_with_const, missing='drop')
+    results = model.fit()
+    print(f"--------Y: {y_variable_to_regress} ----------")
+    results.summary()
+
+    # wages simulation
+    MIN_MARGIN = .03  # Minimum profit margin
+    MAX_MARGIN = .15  # Maximum profit margin
+
+    MIN_LABOR_RATIO = 0.15  # 15% of revenue goes to W2 wages at minimum
+    MAX_LABOR_RATIO = 0.35  # 35% of revenue goes to W2 wages at maximum
+
+    margins = MIN_MARGIN + (MAX_MARGIN - MIN_MARGIN) * np.random.beta(2, 2, size=qbi.shape[0])
+    revenues = qbi / margins
+    #noise_factor = np.random.normal(1, 0.1, size=qbi.shape[0])
+    #revenues = revenues * noise_factor
+    labor_ratios = MIN_LABOR_RATIO + (MAX_LABOR_RATIO - MIN_LABOR_RATIO) * np.random.beta(2, 2, size=revenues.shape[0])
+    hypothetical_w2_gross_income = revenues * labor_ratios
+
+    pr_has_w2_employees = 1 / (1 + np.exp(-0.5E-5 * (revenues - 4E5)))
+    # p_df = pd.DataFrame({'r': revenues, 'p': pr_has_w2_employees})
+    # p_df.loc[(p_df.r > 8E5) & (p_df.r < 9E5)]
+    has_w2_employees = np.random.binomial(n=1, p=pr_has_w2_employees)
+
+    puf["w2_wages_from_qualified_business"] = 200000 #hypothetical_w2_gross_income * has_w2_employees
+
+    # TODO: remove eventually (I think)
+    puf["qbi"] = 100000 #qbi
+
+    #W2_WAGES_SCALE = 0.101
+    #puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE
+
+    # Unadjusted Basis Qualified Property (UBIA - IA stands for "Immediately after acquisition")
+    hypothetical_ubia = np.maximum(0, -2.2E4 + 3.0E-1 * revenues + 4E4 * np.random.normal(size=len(pr_has_w2_employees)))
+
+    pr_has_qualified_property = np.repeat(.75, len(has_w2_employees))
+    has_qualified_property = np.random.binomial(n=1, p=pr_has_qualified_property)
+
+    puf["unadjusted_basis_qualified_property"] = 2E6 #hypothetical_ubia * has_qualified_property
+
+    largest_qbi_source = np.argmax(puf[["E00900", "E02000", "E02100", "E26270"]], axis=1)
+    largest_qbi_source = np.where(qbi <= 0, -1, largest_qbi_source) 
+
+    pr_sstb = np.where(largest_qbi_source == -1, 0,
+          np.where(largest_qbi_source == 0, 0.40, # Schedule C
+          np.where(largest_qbi_source == 1, 0.03, # Schedule E
+          np.where(largest_qbi_source == 2, 0.30, # Schedule F 
+          np.where(largest_qbi_source == 3, 0.005, # Schedule K-1
+                  largest_qbi_source)))))
+
+    puf["business_is_sstb"] = np.random.binomial(n=1, p=pr_sstb)
+
+
+    def estimate_ubia_from_depreciation(depreciation_amount, business_type=None):
+        """
+        Estimate the Unadjusted Basis Immediately After Acquisition (UBIA)
+        by reversing straight-line depreciation on an annual expense.
+
+        Parameters:
+        -----------
+        depreciation_amount : float
+            Annual depreciation amount (the caller passes PUF field E25550)
+        business_type : str, optional
+            'real_estate', 'manufacturing' or 'service'; else defaults apply
+
+        Returns:
+        --------
+        float : depreciation_amount * assumed life * assumed qualifying ratio
+        """
+        # Defaults, used when business_type is None or not one of the values below
+        avg_property_life = 7  # Assumed average property life in years (business equipment) - TODO confirm
+        qualified_property_ratio = 0.8  # Assumed share of depreciable property that qualifies for UBIA
+
+        # Override both assumptions for the recognised business types
+        if business_type == 'real_estate':
+            avg_property_life = 27.5  # Assumed life for residential real estate
+            qualified_property_ratio = 0.95  # Assumes a higher qualifying share
+        elif business_type == 'manufacturing':
+            avg_property_life = 10  # Assumed life for manufacturing equipment
+            qualified_property_ratio = 0.85
+        elif business_type == 'service':
+            avg_property_life = 5  # Assumed life for service businesses
+            qualified_property_ratio = 0.7  # Assumes a lower equipment share
+
+        # Reverse the straight-line depreciation formula:
+        #   Annual Depreciation = Original Cost / Useful Life
+        #   => Original Cost = Annual Depreciation * Useful Life
+        estimated_total_property_basis = depreciation_amount * avg_property_life
+
+        # Keep only the share of the basis assumed to be qualified property
+        estimated_ubia = estimated_total_property_basis * qualified_property_ratio
+
+        return estimated_ubia
+
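+    # Apply the estimator to the PUF depreciation field (E25550), using the default assumptions.
+    # NOTE: this overwrites the value assigned to this same column earlier in this function.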
+    puf['unadjusted_basis_qualified_property'] = puf.E25550.apply(estimate_ubia_from_depreciation)
 
     # Remove aggregate records
     puf = puf[puf.MARS != 0]
@@ -285,6 +447,9 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     "unreported_payroll_tax",
     "pre_tax_contributions",
     "w2_wages_from_qualified_business",
+    "qbi",   # TODO: temporary
+    "unadjusted_basis_qualified_property",
+    "business_is_sstb",
     "deductible_mortgage_interest",
 ]
 

diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -1,5 +1,5 @@
 import pandas as pd
-from .soi import pe_to_soi, get_soi
+from policyengine_us_data.utils.soi import pe_to_soi, get_soi
 import numpy as np
 from policyengine_us_data.storage import STORAGE_FOLDER
 from policyengine_core.reforms import Reform
@@ -394,7 +394,8 @@ def _add_tax_expenditure_targets(
         "medical_expense_deduction": 11.4e9,
         "charitable_deduction": 65.301e9,
         "interest_deduction": 24.8e9,
-        "qualified_business_income_deduction": 63.1e9,
+        # TODO: QBID target is disabled for now; decide when to re-enable it
+        #"qualified_business_income_deduction": 63.1e9,
     }
 
     def make_repeal_class(deduction_var):