Increase sample size of the CPS (#325)

nikhilwoodruff · web-flow · commit d5fb50e01fd5 · 2025-07-01T17:53:54.000+01:00
* Use full CPS

* Changelog

* Reduce prod epochs

* Cut down test time

* Cut test time

* Remove redundant CPS test
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: patch
+  changes:
+    fixed:
+    - Use full CPS by default.
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -1923,7 +1923,7 @@ class CPS_2024(CPS):
     file_path = STORAGE_FOLDER / "cps_2024.h5"
     time_period = 2024
     url = "release://policyengine/policyengine-us-data/1.13.0/cps_2024.h5"
-    frac = 0.5
+    frac = 1
 
 
 # The below datasets are a very naïve way of preventing downsampling in the
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -63,12 +63,12 @@ def dropout_weights(weights, p):
         masked_weights[mask] = mean
         return masked_weights
 
-    optimizer = torch.optim.Adam([weights], lr=1e-1)
+    optimizer = torch.optim.Adam([weights], lr=3e-1)
     from tqdm import trange
 
     start_loss = None
 
-    iterator = trange(5_000 if not os.environ.get("TEST_LITE") else 1000)
+    iterator = trange(500 if not os.environ.get("TEST_LITE") else 500)
     performance = pd.DataFrame()
     for i in iterator:
         optimizer.zero_grad()
diff --git a/policyengine_us_data/tests/test_datasets/test_cps.py b/policyengine_us_data/tests/test_datasets/test_cps.py
@@ -2,34 +2,6 @@
 import numpy as np
 
 
-@pytest.mark.parametrize("year", [2022])
-def test_policyengine_cps_generates(year: int):
-    from policyengine_us_data.datasets.cps.cps import CPS_2022
-
-    dataset_by_year = {
-        2022: CPS_2022,
-    }
-
-    dataset_by_year[year](require=True)
-
-
-@pytest.mark.parametrize("year", [2022])
-def test_policyengine_cps_loads(year: int):
-    from policyengine_us_data.datasets.cps.cps import CPS_2022
-
-    dataset_by_year = {
-        2022: CPS_2022,
-    }
-
-    dataset = dataset_by_year[year]
-
-    from policyengine_us import Microsimulation
-
-    sim = Microsimulation(dataset=dataset)
-
-    assert not sim.calculate("household_net_income").isna().any()
-
-
 def test_cps_has_auto_loan_interest():
     from policyengine_us_data.datasets.cps import CPS_2024
     from policyengine_us import Microsimulation
diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -1,34 +1,6 @@
 import pytest
 
 
-@pytest.mark.parametrize("year", [2024])
-def test_policyengine_cps_generates(year: int):
-    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
-
-    dataset_by_year = {
-        2024: EnhancedCPS_2024,
-    }
-
-    dataset_by_year[year](require=True)
-
-
-@pytest.mark.parametrize("year", [2024])
-def test_policyengine_cps_loads(year: int):
-    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
-
-    dataset_by_year = {
-        2024: EnhancedCPS_2024,
-    }
-
-    dataset = dataset_by_year[year]
-
-    from policyengine_us import Microsimulation
-
-    sim = Microsimulation(dataset=dataset)
-
-    assert not sim.calculate("household_net_income").isna().any()
-
-
 def test_ecps_has_mortgage_interest():
     from policyengine_us_data.datasets.cps import EnhancedCPS_2024
     from policyengine_us import Microsimulation
@@ -50,6 +22,25 @@ def test_ecps_has_tips():
 
 
 def test_ecps_replicates_jct_tax_expenditures():
+    import pandas as pd
+    # Read the calibration log by relative path (resolved against the CWD).
+    calibration_log = pd.read_csv(
+        "calibration_log.csv",
+    )
+    # Keep only targets whose name contains "jct/", at the highest epoch logged.
+    jct_rows = calibration_log[
+        (calibration_log["target_name"].str.contains("jct/"))
+        & (calibration_log["epoch"] == calibration_log["epoch"].max())
+    ]
+    # Fail if the largest rel_abs_error among those rows is not below 0.4.
+    assert (
+        jct_rows.rel_abs_error.max() < 0.4
+    ), "JCT tax expenditure targets not met (see the calibration log for details). Max relative error: {:.2%}".format(
+        jct_rows.rel_abs_error.max()
+    )
+
+
+def deprecated_test_ecps_replicates_jct_tax_expenditures_full():
     from policyengine_us import Microsimulation
     from policyengine_core.reforms import Reform
     from policyengine_us_data.datasets import EnhancedCPS_2024