PolicyEngine · baogorek · Jul 30, 2025 · Jul 30, 2025 · Jul 30, 2025 · Jul 30, 2025
diff --git a/.gitignore b/.gitignore
@@ -17,7 +17,6 @@ node_modules
 !population_by_state.csv
 !aca_spending_and_enrollment_2024.csv
 !real_estate_taxes_by_state_acs.csv
-!np2023_d5_mid.csv
 !snap_state.csv
 !age_state.csv
 !agi_state.csv

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- changes:
+    changed:
+    - New configuration for sparse solution (~20k non-zero households)
+    - Added a seeding function to remove non-deterministic behavior in reweight
+    - Made np2023_d5_mid.csv a git-ignored file (it is hosted on Hugging Face)
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -7,6 +7,7 @@
     fmt,
     HardConcrete,
     print_reweighting_diagnostics,
+    set_seeds,
 )
 import numpy as np
 from tqdm import trange
@@ -19,6 +20,7 @@
 )
 import os
 from pathlib import Path
+import logging
 
 
 try:
@@ -34,10 +36,12 @@ def reweight(
     dropout_rate=0.05,
     log_path="calibration_log.csv",
     epochs=500,
-    l0_lambda=5e-6,  # the action happens between 1e-6 and 1e-5
-    init_mean=0.999,  # initial proportion with non-zero weights, set near 0
-    temperature=0.5,  # Usual values .5 to 3, .5 was working better
+    l0_lambda=2.6445e-07,
+    init_mean=0.999,  # initial proportion with non-zero weights
+    temperature=0.25,
+    seed=1456,
 ):
+    set_seeds(seed)
     target_names = np.array(loss_matrix.columns)
     is_national = loss_matrix.columns.str.startswith("nation/")
     loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
@@ -137,7 +141,11 @@ def dropout_weights(weights, p):
     )
 
     # New (Sparse) path depending on temperature, init_mean, l0_lambda -----
-    # make a calibration_log_sparse.csv path
+    logging.info(
+        f"Sparse optimization using seed {seed}, temp {temperature} "
+        + f"init_mean {init_mean}, l0_lambda {l0_lambda}"
+    )
+    set_seeds(seed)
     p = Path(log_path)
     log_path_sparse = p.with_name(f"{p.stem}_sparse{p.suffix}")
 
@@ -156,8 +164,7 @@ def dropout_weights(weights, p):
     performance = pd.DataFrame()
     for i in iterator:
         optimizer.zero_grad()
-        weights_ = dropout_weights(weights, dropout_rate)
-        masked = torch.exp(weights_) * gates()
+        masked = torch.exp(weights) * gates()
         l_main = loss(masked)
         l = l_main + l0_lambda * gates.get_penalty()
         if (log_path_sparse is not None) and (i % 10 == 0):
@@ -309,6 +316,7 @@ def generate(self):
                 targets_array_clean,
                 log_path="calibration_log.csv",
                 epochs=200,
+                seed=1456,
             )
             data["household_weight"][year] = optimised_weights_dense
             data["household_sparse_weight"][year] = optimised_weights_sparse

diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py
@@ -3,3 +3,4 @@
 from .loss import *
 from .qrf import *
 from .l0 import *
+from .seed import *
diff --git a/policyengine_us_data/utils/seed.py b/policyengine_us_data/utils/seed.py
@@ -0,0 +1,21 @@
+import random
+import numpy as np
+
+try:
+    import torch
+except ImportError:
+    torch = None
+
+
+def set_seeds(seed: int) -> None:
+    """Seed the `random`, NumPy and (if installed) PyTorch RNGs with `seed`."""
+    random.seed(seed)
+    np.random.seed(seed)
+    if torch is not None:  # None when the optional import above failed
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():  # also seed the CUDA RNGs
+            torch.cuda.manual_seed_all(seed)
+        try:
+            torch.use_deterministic_algorithms(True, warn_only=True)
+        except Exception:  # best-effort: ignore failure to enable determinism
+            pass