diff --git a/.gitignore b/.gitignore
index 0c753ec2..48551e95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,7 +17,6 @@ node_modules
 !population_by_state.csv
 !aca_spending_and_enrollment_2024.csv
 !real_estate_taxes_by_state_acs.csv
-!np2023_d5_mid.csv
 !snap_state.csv
 !age_state.csv
 !agi_state.csv
diff --git a/Makefile b/Makefile
index cfef0baf..4e0cec16 100644
--- a/Makefile
+++ b/Makefile
@@ -74,6 +74,8 @@ data:
 	python policyengine_us_data/datasets/cps/extended_cps.py
 	python policyengine_us_data/datasets/cps/enhanced_cps.py
 	python policyengine_us_data/datasets/cps/small_enhanced_cps.py
+	mv policyengine_us_data/storage/enhanced_cps_2024.h5 policyengine_us_data/storage/dense_enhanced_cps_2024.h5
+	cp policyengine_us_data/storage/sparse_enhanced_cps_2024.h5 policyengine_us_data/storage/enhanced_cps_2024.h5
 
 clean:
 	rm -f policyengine_us_data/storage/*.h5
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29b..6b0a5e4d 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- changes:
+    changed:
+    - New configuration for sparse solution (~20k non-zero households)
+    - added a seeding function to remove non-deterministic behavior in reweight
+    - Made np2023_d5_mid.csv a git ignorable file (it's in hugging face)
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
index 59fed93b..8bbe67bc 100644
--- a/policyengine_us_data/datasets/cps/enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -7,6 +7,7 @@
     fmt,
     HardConcrete,
     print_reweighting_diagnostics,
+    set_seeds,
 )
 import numpy as np
 from tqdm import trange
@@ -19,6 +20,7 @@
 )
 import os
 from pathlib import Path
+import logging
 
 
 try:
@@ -34,10 +36,12 @@ def reweight(
     dropout_rate=0.05,
     log_path="calibration_log.csv",
     epochs=500,
-    l0_lambda=5e-6,  # the action happens between 1e-6 and 1e-5
-    init_mean=0.999,  # initial proportion with non-zero weights, set near 0
-    temperature=0.5,  # Usual values .5 to 3, .5 was working better
+    l0_lambda=2.6445e-07,
+    init_mean=0.999,  # initial proportion with non-zero weights
+    temperature=0.25,
+    seed=1456,
 ):
+    set_seeds(seed)
     target_names = np.array(loss_matrix.columns)
     is_national = loss_matrix.columns.str.startswith("nation/")
     loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
@@ -137,7 +141,11 @@ def dropout_weights(weights, p):
     )
 
     # New (Sparse) path depending on temperature, init_mean, l0_lambda -----
-    # make a calibration_log_sparse.csv path
+    logging.info(
+        f"Sparse optimization using seed {seed}, temp {temperature} "
+        + f"init_mean {init_mean}, l0_lambda {l0_lambda}"
+    )
+    set_seeds(seed)
     p = Path(log_path)
     log_path_sparse = p.with_name(f"{p.stem}_sparse{p.suffix}")
 
@@ -156,8 +164,7 @@ def dropout_weights(weights, p):
     performance = pd.DataFrame()
     for i in iterator:
         optimizer.zero_grad()
-        weights_ = dropout_weights(weights, dropout_rate)
-        masked = torch.exp(weights_) * gates()
+        masked = torch.exp(weights) * gates()
         l_main = loss(masked)
         l = l_main + l0_lambda * gates.get_penalty()
         if (log_path_sparse is not None) and (i % 10 == 0):
@@ -309,6 +316,7 @@ def generate(self):
                 targets_array_clean,
                 log_path="calibration_log.csv",
                 epochs=200,
+                seed=1456,
             )
             data["household_weight"][year] = optimised_weights_dense
             data["household_sparse_weight"][year] = optimised_weights_sparse
diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py
index 56eb6f25..75f0510a 100644
--- a/policyengine_us_data/utils/__init__.py
+++ b/policyengine_us_data/utils/__init__.py
@@ -3,3 +3,4 @@
 from .loss import *
 from .qrf import *
 from .l0 import *
+from .seed import *
diff --git a/policyengine_us_data/utils/seed.py b/policyengine_us_data/utils/seed.py
new file mode 100644
index 00000000..e5fa7669
--- /dev/null
+++ b/policyengine_us_data/utils/seed.py
@@ -0,0 +1,41 @@
+"""Utilities for seeding random number generators."""
+
+import logging
+import random
+
+import numpy as np
+
+try:
+    import torch
+except ImportError:
+    torch = None
+
+_logger = logging.getLogger(__name__)
+
+
+def set_seeds(seed: int) -> None:
+    """Seed the Python, NumPy and PyTorch random number generators.
+
+    PyTorch is optional: when it cannot be imported, only ``random`` and
+    NumPy are seeded. When it is available, its CPU and CUDA generators
+    are seeded as well and deterministic algorithms are requested.
+
+    Args:
+        seed: Value passed to each generator's seeding function.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    if torch is None:
+        return
+
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+    try:
+        torch.use_deterministic_algorithms(True, warn_only=True)
+    except Exception as exc:
+        # Best effort: keep going when this torch build rejects the call,
+        # but report it so a non-reproducible run is not silently accepted.
+        _logger.warning(
+            "Could not enable deterministic PyTorch algorithms: %s", exc
+        )