Merge pull request #165 from PolicyEngine/nikhilwoodruff/issue163

MaxGhenis · web-flow · commit 1192d9899dfd · 2025-02-17T19:57:00.000-05:00
Target itemized deduction tax expenditures
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -242,29 +242,6 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None:
     ]
     cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1)
 
-    def _assign_some_newborns_to_pregnancy(
-        age: pd.Series, person: pd.DataFrame
-    ) -> pd.Series:
-        """Takes an array of ages, returns the new age array with the given percentage of newborns assigned a negative age (in pregnancy)."""
-        age = np.where(
-            person.A_AGE == 0,
-            np.where(
-                np.random.randint(
-                    0, 2, len(person)
-                ),  # Random number of 0 or 1
-                # If 1 is flipped, select a random number between -0.75 and 0
-                # This will represent the pregnany month
-                # At -0.75 the pregnancy month is 0 and at -0.0001 the pregnancy month is 9
-                np.random.uniform(-0.75, 0, len(person)),
-                # If 0 is flipped, the child is a newborn at the age of 0 to 1
-                np.random.uniform(0, 1, len(person)),
-            ),
-            person.A_AGE,
-        )
-        return age
-
-    cps["age"] = _assign_some_newborns_to_pregnancy(cps["age"], person)
-
     def children_per_parent(col: str) -> pd.DataFrame:
         """Calculate number of children in the household using parental
             pointers.
diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -36,21 +36,46 @@ def test_ecps_has_mortgage_interest():
     sim = Microsimulation(dataset=EnhancedCPS_2024)
 
     assert sim.calculate("deductible_mortgage_interest").sum() > 1
-    assert sim.calculate("deductible_interest_expense").sum() > 1
 
 
-def test_newborns_and_pregnancies():
-    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
+def test_ecps_replicates_jct_tax_expenditures():
     from policyengine_us import Microsimulation
+    from policyengine_core.reforms import Reform
+    from policyengine_us_data.datasets import EnhancedCPS_2024
+
+    # JCT tax expenditure targets
+    EXPENDITURE_TARGETS = {
+        "salt_deduction": 21.247e9,
+        "medical_expense_deduction": 11.4e9,
+        "charitable_deduction": 65.301e9,
+        "interest_deduction": 24.8e9,
+    }
 
-    sim = Microsimulation(dataset=EnhancedCPS_2024)
-
-    # Test for unborn children (age < 0)
-    unborn = sim.calculate("age") < 0
-    unborn_count = unborn.sum()
-    assert unborn_count > 0
-
-    # Test for newborns (0 <= age < 1)
-    newborns = (sim.calculate("age") >= 0) & (sim.calculate("age") < 1)
-    newborn_count = newborns.sum()
-    assert newborn_count > 0
+    baseline = Microsimulation(dataset=EnhancedCPS_2024)
+    income_tax_b = baseline.calculate(
+        "income_tax", period=2024, map_to="household"
+    )
+
+    for deduction, target in EXPENDITURE_TARGETS.items():
+        # Create reform that neutralizes the deduction
+        class RepealDeduction(Reform):
+            def apply(self):
+                self.neutralize_variable(deduction)
+
+        # Run reform simulation
+        reformed = Microsimulation(
+            reform=RepealDeduction, dataset=EnhancedCPS_2024
+        )
+        income_tax_r = reformed.calculate(
+            "income_tax", period=2024, map_to="household"
+        )
+
+        # Calculate tax expenditure
+        tax_expenditure = (income_tax_r - income_tax_b).sum()
+        pct_error = abs((tax_expenditure - target) / target)
+        TOLERANCE = 0.15
+
+        print(
+            f"{deduction} tax expenditure {tax_expenditure/1e9:.1f}bn differs from target {target/1e9:.1f}bn by {pct_error:.2%}"
+        )
+        assert pct_error < TOLERANCE, deduction
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -2,6 +2,7 @@
 from .soi import pe_to_soi, get_soi
 import numpy as np
 from policyengine_us_data.storage import STORAGE_FOLDER
+from policyengine_core.reforms import Reform
 
 
 def fmt(x):
@@ -132,6 +133,7 @@ def build_loss_matrix(dataset: type, time_period):
     from policyengine_us import Microsimulation
 
     sim = Microsimulation(dataset=dataset)
+    sim.default_calculation_period = time_period
     hh_id = sim.calculate("household_id", map_to="person")
     tax_unit_hh_id = sim.map_result(
         hh_id, "person", "tax_unit", how="value_from_first_person"
@@ -252,7 +254,7 @@ def build_loss_matrix(dataset: type, time_period):
         "alimony_income": 13e9,
         "alimony_expense": 13e9,
         # Rough estimate, not CPS derived
-        "real_estate_taxes": 400e9,  # Rough estimate between 350bn and 600bn total property tax collections
+        "real_estate_taxes": 500e9,  # Rough estimate between 350bn and 600bn total property tax collections
         "rent": 735e9,  # ACS total uprated by CPI
     }
 
@@ -340,18 +342,22 @@ def build_loss_matrix(dataset: type, time_period):
         )
         targets_array.append(row["population_under_5"])
 
-    # Population by number of newborns and pregancies
-
     age = sim.calculate("age").values
     infants = (age >= 0) & (age < 1)
     label = "census/infants"
     loss_matrix[label] = sim.map_result(infants, "person", "household")
-    targets_array.append(3_491_679)
+    # Total number of infants in the 1 Year ACS
+    INFANTS_2023 = 3_491_679
+    INFANTS_2022 = 3_437_933
+    # Assume the infant population grows from 2023 to 2024 at the 2022-to-2023 rate.
+    infants_2024 = INFANTS_2023 * (INFANTS_2023 / INFANTS_2022)
+    targets_array.append(infants_2024)
+
+    # Itemized deduction tax expenditure targeting (SALT, medical, charitable, interest)
 
-    pregnancies = (age >= -0.75) & (age < 0)
-    label = "census/pregnancies"
-    loss_matrix[label] = sim.map_result(pregnancies, "person", "household")
-    targets_array.append(2_618_759)
+    _add_tax_expenditure_targets(
+        dataset, time_period, sim, loss_matrix, targets_array
+    )
 
     if any(loss_matrix.isna().sum() > 0):
         raise ValueError("Some targets are missing from the loss matrix")
@@ -360,3 +366,55 @@ def build_loss_matrix(dataset: type, time_period):
         raise ValueError("Some targets are missing from the targets array")
 
     return loss_matrix, np.array(targets_array)
+
+
+def _add_tax_expenditure_targets(
+    dataset,
+    time_period,
+    baseline_simulation,
+    loss_matrix: pd.DataFrame,
+    targets_array: list,
+):
+    from policyengine_us import Microsimulation
+
+    income_tax_b = baseline_simulation.calculate(
+        "income_tax", map_to="household"
+    ).values
+
+    # Dictionary of itemized deductions and their target values
+    # (in dollars, written as billions x 1e9, for 2024, per the 2024 JCT Tax Expenditures report)
+    # https://www.jct.gov/publications/2024/jcx-48-24/
+    ITEMIZED_DEDUCTIONS = {
+        "salt_deduction": 21.247e9,
+        "medical_expense_deduction": 11.4e9,
+        "charitable_deduction": 65.301e9,
+        "interest_deduction": 24.8e9,
+    }
+
+    def make_repeal_class(deduction_var):
+        # Create a custom Reform subclass that neutralizes the given deduction.
+        class RepealDeduction(Reform):
+            def apply(self):
+                self.neutralize_variable(deduction_var)
+
+        return RepealDeduction
+
+    for deduction, target in ITEMIZED_DEDUCTIONS.items():
+        # Generate the custom repeal class for the current deduction.
+        RepealDeduction = make_repeal_class(deduction)
+
+        # Run the microsimulation using the repeal reform.
+        simulation = Microsimulation(dataset=dataset, reform=RepealDeduction)
+        simulation.default_calculation_period = time_period
+
+        # Calculate the reform income tax values (the baseline is computed once above).
+        income_tax_r = simulation.calculate(
+            "income_tax", map_to="household"
+        ).values
+
+        # Compute the tax expenditure (TE) values.
+        te_values = income_tax_r - income_tax_b
+
+        # Record the TE difference and the corresponding target value.
+        loss_matrix[f"jct/{deduction}_expenditure"] = te_values
+        targets_array.append(target)
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,7 @@ authors = [
 license = {file = "LICENSE"}
 requires-python = ">=3.10, <3.13.0"
 dependencies = [
-    "policyengine_us",
+    "policyengine_us>=1.197.0",
     "policyengine_core>=3.14.1",
     "requests",
     "tqdm",