Add matrix builder improvements: entity-aware constraint evaluation and preservation of stored input variables

juaristi22 · juaristi22 · commit b18680d7bc6c · 2026-01-22T21:04:23.000+05:30
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py
@@ -192,15 +192,21 @@ def get_calculated_variables(sim) -> List[str]:
     """
     Return variables that should be cleared for state-swap recalculation.
 
-    Includes variables with formulas, adds, or subtracts.
-
-    Excludes ID variables (person_id, household_id, etc.) because:
-    1. They have formulas that generate sequential IDs (0, 1, 2, ...)
-    2. We need the original H5 values, not regenerated sequences
-    3. PolicyEngine's random() function uses entity IDs as seeds:
-       seed = abs(entity_id * 100 + count_random_calls)
-       If IDs change, random-dependent variables (SSI resource test,
-       WIC nutritional risk, WIC takeup) produce different results.
+    Includes variables with formulas, or adds/subtracts that are lists.
+
+    Excludes:
+    1. ID variables (person_id, household_id, etc.) - needed for random seeds
+    2. Variables with string adds/subtracts (parameter paths) - these are
+       pseudo-inputs stored in H5 that would recalculate differently using
+       parameter lookups. Examples: pre_tax_contributions.
+    3. Variables in input_variables (have stored H5 values) even if they
+       have formulas - the stored values represent original survey data
+       that should be preserved. Examples: cdcc_relevant_expenses, rent.
+
+    The exclusions are critical because:
+    - The H5 file stores pre-computed values from original CPS processing
+    - If deleted, recalculation produces different values, corrupting
+      downstream calculations like income_tax
     """
     exclude_ids = {
         "person_id",
@@ -210,16 +216,36 @@ def get_calculated_variables(sim) -> List[str]:
         "family_id",
         "marital_unit_id",
     }
-    return [
-        name
-        for name, var in sim.tax_benefit_system.variables.items()
-        if (
-            var.formulas
-            or getattr(var, "adds", None)
-            or getattr(var, "subtracts", None)
-        )
-        and name not in exclude_ids
-    ]
+
+    # Get stored input variables to exclude
+    input_vars = set(sim.input_variables)
+
+    result = []
+    for name, var in sim.tax_benefit_system.variables.items():
+        if name in exclude_ids:
+            continue
+
+        # Exclude variables that have stored values (input_variables)
+        # These represent original survey data that should be preserved
+        if name in input_vars:
+            continue
+
+        # Include if has formulas
+        if var.formulas:
+            result.append(name)
+            continue
+
+        # Include if adds/subtracts is a list (explicit component aggregation)
+        # Exclude if adds/subtracts is a string (parameter path - pseudo-input)
+        adds = getattr(var, "adds", None)
+        subtracts = getattr(var, "subtracts", None)
+
+        if adds and isinstance(adds, list):
+            result.append(name)
+        elif subtracts and isinstance(subtracts, list):
+            result.append(name)
+
+    return result
 
 
 def get_pseudo_input_variables(sim) -> set:
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py
@@ -38,6 +38,105 @@ def __init__(
         self.time_period = time_period
         self.cds_to_calibrate = cds_to_calibrate
         self.dataset_path = dataset_path
+        self._entity_rel_cache = None
+
+    def _build_entity_relationship(self, sim) -> pd.DataFrame:
+        """
+        Return the person-level table of entity memberships for the sim.
+
+        Each row is one person, carrying that person's own ID together with
+        the IDs of the household, tax unit and SPM unit mapped to them.
+        Constraint evaluation uses it to roll person results up to households.
+
+        The table is memoised on self._entity_rel_cache: once set, it is
+        returned as-is and sim is not consulted until the attribute is reset
+        to None.
+
+        Args:
+            sim: Simulation whose calculate() supplies the ID arrays.
+
+        Returns:
+            DataFrame with person_id, household_id, tax_unit_id, spm_unit_id
+        """
+        cached = self._entity_rel_cache
+        if cached is not None:
+            return cached
+
+        columns = ("person_id", "household_id", "tax_unit_id", "spm_unit_id")
+        relationships = pd.DataFrame(
+            {
+                name: sim.calculate(name, map_to="person").values
+                for name in columns
+            }
+        )
+        self._entity_rel_cache = relationships
+        return relationships
+
+    def _evaluate_constraints_entity_aware(
+        self, state_sim, constraints: List[dict], n_households: int
+    ) -> np.ndarray:
+        """
+        Evaluate non-geographic constraints at person level, aggregate to
+        household level with "any" semantics.
+
+        This properly handles constraints on variables defined at different
+        entity levels (e.g., tax_unit_is_filer at tax_unit level). Instead of
+        summing values at household level (which would give 2, 3, etc. for
+        households with multiple tax units), each constraint variable is
+        calculated with map_to="person", and a household passes when at
+        least one of its members satisfies ALL constraints.
+
+        The person-to-household mapping comes from
+        _build_entity_relationship, which returns its cached table when one
+        exists, so that cache must have been built from state_sim.
+
+        Args:
+            state_sim: Microsimulation with state_fips set
+            constraints: List of constraint dicts with variable, operation,
+                value keys (geographic constraints should be pre-filtered)
+            n_households: Number of households; only used to size the
+                all-True mask returned when there are no constraints
+
+        Returns:
+            Boolean mask with one entry per household, in the order of
+            state_sim.calculate("household_id", map_to="household").
+            Households with no rows in the person-level table are False.
+        """
+        if not constraints:
+            return np.ones(n_households, dtype=bool)
+
+        entity_rel = self._build_entity_relationship(state_sim)
+        person_mask = np.ones(len(entity_rel), dtype=bool)
+
+        for c in constraints:
+            var = c["variable"]
+            op = c["operation"]
+            val = c["value"]
+
+            # Calculate constraint variable at person level
+            constraint_values = state_sim.calculate(
+                var, map_to="person"
+            ).values
+
+            # Apply operation at person level
+            person_mask &= apply_op(constraint_values, op, val)
+
+        # Household IDs of the persons that satisfy ALL constraints; an ID
+        # appears here iff its household has at least one such person.
+        passing_household_ids = entity_rel["household_id"].values[
+            person_mask
+        ]
+
+        # Household IDs in the order the returned mask must follow
+        household_ids = state_sim.calculate(
+            "household_id", map_to="household"
+        ).values
+
+        # Vectorised membership test. This replaces copying the entity table,
+        # grouping it, and doing one Python-level lookup per household on
+        # every call. The result is always a bool array, including when
+        # there are no households at all.
+        return np.isin(household_ids, passing_household_ids)
 
     def _query_targets(self, target_filter: dict) -> pd.DataFrame:
         """Query targets based on filter criteria using OR logic."""
@@ -166,6 +265,9 @@ def build_matrix(
             cds_by_state[state].append((cd_idx, cd))
 
         for state, cd_list in cds_by_state.items():
+            # Clear entity relationship cache when creating new simulation
+            self._entity_rel_cache = None
+
             if self.dataset_path:
                 state_sim = self._create_state_sim(state, n_households)
             else:
@@ -184,27 +286,43 @@ def build_matrix(
                 for row_idx, (_, target) in enumerate(targets_df.iterrows()):
                     constraints = self._get_constraints(target["stratum_id"])
 
-                    mask = np.ones(n_households, dtype=bool)
+                    geo_constraints = []
+                    non_geo_constraints = []
                     for c in constraints:
+                        if c["variable"] in (
+                            "state_fips",
+                            "congressional_district_geoid",
+                        ):
+                            geo_constraints.append(c)
+                        else:
+                            non_geo_constraints.append(c)
+
+                    # Check geographic constraints first (quick fail)
+                    geo_mask = np.ones(n_households, dtype=bool)
+                    for c in geo_constraints:
                         if c["variable"] == "congressional_district_geoid":
                             if (
                                 c["operation"] in ("==", "=")
                                 and c["value"] != cd
                             ):
-                                mask[:] = False
+                                geo_mask[:] = False
                         elif c["variable"] == "state_fips":
                             if (
                                 c["operation"] in ("==", "=")
                                 and int(c["value"]) != state
                             ):
-                                mask[:] = False
-                        else:
-                            values = state_sim.calculate(
-                                c["variable"], map_to="household"
-                            ).values
-                            mask &= apply_op(
-                                values, c["operation"], c["value"]
-                            )
+                                geo_mask[:] = False
+
+                    if not geo_mask.any():
+                        continue
+
+                    # Evaluate non-geographic constraints at entity level
+                    entity_mask = self._evaluate_constraints_entity_aware(
+                        state_sim, non_geo_constraints, n_households
+                    )
+
+                    # Combine geographic and entity-aware masks
+                    mask = geo_mask & entity_mask
 
                     if not mask.any():
                         continue
diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py
@@ -37,6 +37,22 @@
     ("tanf", 1e-2),
     ("tip_income", 1e-2),
     ("unemployment_compensation", 1e-2),
+    ("income_tax", 1e-2),
+    ("income_tax", 1e-2),
+    ("qualified_business_income_deduction", 1e-2),
+    ("taxable_social_security", 1e-2),
+    ("taxable_pension_income", 1e-2),
+    ("taxable_ira_distributions", 1e-2),
+    ("taxable_interest_income", 1e-2),
+    ("tax_exempt_interest_income", 1e-2),
+    ("self_employment_income", 1e-2),
+    ("salt", 1e-2),
+    ("refundable_ctc", 1e-2),
+    ("real_estate_taxes", 1e-2),
+    ("qualified_dividend_income", 1e-2),
+    ("dividend_income", 1e-2),
+    ("adjusted_gross_income", 1e-2),
+    ("eitc", 1e-2),
 ]
 
 # Combined filter config to build matrix with all variables at once
@@ -45,6 +61,20 @@
         4,  # SNAP targets
         5,  # Medicaid targets
         112,  # Unemployment compensation targets
+        117,  # Income tax targets
+        100,  # QBID targets
+        111,  # Taxable social security targets
+        114,  # Taxable pension income targets
+        105,  # Taxable IRA distributions targets
+        106,  # Taxable interest income targets
+        107,  # Tax exempt interest income targets
+        101,  # Self-employment income targets
+        116,  # Salt targets
+        115,  # Refundable CTC targets
+        103,  # Real estate taxes targets
+        109,  # Qualified dividend income targets
+        108,  # Dividend income targets
+        3,  # Adjusted gross income targets
     ],
     "variables": [
         "snap",
@@ -60,14 +90,30 @@
         "tanf",
         "tip_income",
         "unemployment_compensation",
+        "income_tax",
+        "income_tax",
+        "qualified_business_income_deduction",
+        "taxable_social_security",
+        "taxable_pension_income",
+        "taxable_ira_distributions",
+        "taxable_interest_income",
+        "tax_exempt_interest_income",
+        "self_employment_income",
+        "salt",
+        "refundable_ctc",
+        "real_estate_taxes",
+        "qualified_dividend_income",
+        "dividend_income",
+        "adjusted_gross_income",
+        "eitc",
     ],
 }
 
 # Maximum allowed mismatch rate for state-level value comparison
 MAX_MISMATCH_RATE = 0.02
 
 # Number of samples for cell-level verification tests
-N_VERIFICATION_SAMPLES = 200
+N_VERIFICATION_SAMPLES = 2000
 
 
 @pytest.fixture(scope="module")