Skip to content

Commit d6e4862

Browse files
authored
Merge pull request #359 from PolicyEngine/bogorek-lite
Remove problematic targets and make other minor enhancements
2 parents 65473ce + ab8fa4f commit d6e4862

File tree

6 files changed

+72
-25
lines changed

6 files changed

+72
-25
lines changed

.github/workflows/code_changes.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
name: Code changes
44
on:
5+
workflow_call:
56
push:
67
branches:
78
- main
@@ -27,7 +28,7 @@ jobs:
2728
contents: "read"
2829
# Required to auth against gcp
2930
id-token: "write"
30-
runs-on: larger-runner
31+
runs-on: ubuntu-latest
3132
steps:
3233
- name: Checkout repo
3334
uses: actions/checkout@v2

.github/workflows/manual_tests.yaml

Lines changed: 0 additions & 17 deletions
This file was deleted.

changelog_entry.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
- bump: patch
2+
changes:
3+
changed:
4+
- bad targets (causing problems with estimation) removed
5+
- lite mode now builds CPS_2023 in addition to CPS_2024
6+
- gave reweight an epochs argument and set it at 150 for optimization
7+
- updating minimum versions on policyengine-us, policyengine-core, and pandas dependencies
8+
- getting rid of non-working manual workflow code

policyengine_us_data/datasets/cps/cps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,6 @@ def downsample(self, frac: float):
100100
original_dtypes = {
101101
key: original_data[key].dtype for key in original_data
102102
}
103-
104103
sim = Microsimulation(dataset=self)
105104
sim.subsample(frac=frac)
106105

@@ -2006,6 +2005,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS):
20062005

20072006
if __name__ == "__main__":
20082007
if test_lite:
2008+
CPS_2023().generate()
20092009
CPS_2024().generate()
20102010
else:
20112011
CPS_2021().generate()

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def reweight(
2828
targets_array,
2929
dropout_rate=0.05,
3030
log_path="calibration_log.csv",
31+
epochs=150,
3132
):
3233
target_names = np.array(loss_matrix.columns)
3334
is_national = loss_matrix.columns.str.startswith("nation/")
@@ -45,7 +46,7 @@ def reweight(
4546
np.log(original_weights), requires_grad=True, dtype=torch.float32
4647
)
4748

48-
# TODO: replace this with a call to the python reweight.py package.
49+
# TODO: replace this functionality with the microcalibrate package.
4950
def loss(weights):
5051
# Check for Nans in either the weights or the loss matrix
5152
if torch.isnan(weights).any():
@@ -78,7 +79,7 @@ def dropout_weights(weights, p):
7879

7980
start_loss = None
8081

81-
iterator = trange(500)
82+
iterator = trange(epochs)
8283
performance = pd.DataFrame()
8384
for i in iterator:
8485
optimizer.zero_grad()
@@ -178,18 +179,71 @@ def generate(self):
178179
original_weights = original_weights.values + np.random.normal(
179180
1, 0.1, len(original_weights)
180181
)
182+
183+
bad_targets = [
184+
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
185+
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
186+
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
187+
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
188+
"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
189+
"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
190+
"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
191+
"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
192+
"state/RI/adjusted_gross_income/amount/-inf_1",
193+
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
194+
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
195+
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
196+
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
197+
"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
198+
"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
199+
"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
200+
"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
201+
"state/RI/adjusted_gross_income/amount/-inf_1",
202+
"nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All",
203+
]
204+
205+
# Run the optimization procedure to get (close to) minimum loss weights
181206
for year in range(self.start_year, self.end_year + 1):
182207
loss_matrix, targets_array = build_loss_matrix(
183208
self.input_dataset, year
184209
)
210+
zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
211+
bad_mask = loss_matrix.columns.isin(bad_targets)
212+
keep_mask_bool = ~(zero_mask | bad_mask)
213+
keep_idx = np.where(keep_mask_bool)[0]
214+
loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
215+
targets_array_clean = targets_array[keep_idx]
216+
assert loss_matrix_clean.shape[1] == targets_array_clean.size
217+
185218
optimised_weights = reweight(
186219
original_weights,
187-
loss_matrix,
188-
targets_array,
220+
loss_matrix_clean,
221+
targets_array_clean,
189222
log_path="calibration_log.csv",
223+
epochs=150,
190224
)
191225
data["household_weight"][year] = optimised_weights
192226

227+
print("\n\n---reweighting quick diagnostics----\n")
228+
estimate = optimised_weights @ loss_matrix_clean
229+
rel_error = (
230+
((estimate - targets_array_clean) + 1)
231+
/ (targets_array_clean + 1)
232+
) ** 2
233+
print(
234+
f"rel_error: min: {np.min(rel_error):.2f}, "
235+
f"max: {np.max(rel_error):.2f} "
236+
f"mean: {np.mean(rel_error):.2f}, "
237+
f"median: {np.median(rel_error):.2f}"
238+
)
239+
print("Relative error over 100% for:")
240+
for i in np.where(rel_error > 1)[0]:
241+
print(f"target_name: {loss_matrix_clean.columns[i]}")
242+
print(f"target_value: {targets_array_clean[i]}")
243+
print(f"estimate_value: {estimate[i]}")
244+
print(f"has rel_error: {rel_error[i]:.2f}\n")
245+
print("---End of reweighting quick diagnostics------")
246+
193247
self.save_dataset(data)
194248

195249

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ authors = [
1717
license = {file = "LICENSE"}
1818
requires-python = ">=3.11, <3.13.0"
1919
dependencies = [
20-
"policyengine-us>=1.197.0",
21-
"policyengine-core>=3.14.1",
20+
"policyengine-us>=1.340.1",
21+
"policyengine-core>=3.17.1",
22+
"pandas>=2.3.0",
2223
"requests",
2324
"tqdm",
2425
"microdf_python>=0.4.3",

0 commit comments

Comments
 (0)