Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ node_modules
!population_by_state.csv
!aca_spending_and_enrollment_2024.csv
!real_estate_taxes_by_state_acs.csv
!np2023_d5_mid.csv
!snap_state.csv
!age_state.csv
!agi_state.csv
Expand Down
5 changes: 5 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- changes:
changed:
- New configuration for sparse solution (~20k non-zero households)
- Added a seeding function to remove non-deterministic behavior in reweight
- Made np2023_d5_mid.csv a git-ignored file (it is hosted on Hugging Face)
20 changes: 14 additions & 6 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
fmt,
HardConcrete,
print_reweighting_diagnostics,
set_seeds,
)
import numpy as np
from tqdm import trange
Expand All @@ -19,6 +20,7 @@
)
import os
from pathlib import Path
import logging


try:
Expand All @@ -34,10 +36,12 @@ def reweight(
dropout_rate=0.05,
log_path="calibration_log.csv",
epochs=500,
l0_lambda=5e-6, # the action happens between 1e-6 and 1e-5
init_mean=0.999, # initial proportion with non-zero weights, set near 0
temperature=0.5, # Usual values .5 to 3, .5 was working better
l0_lambda=2.6445e-07,
init_mean=0.999, # initial proportion with non-zero weights
temperature=0.25,
seed=1456,
):
set_seeds(seed)
target_names = np.array(loss_matrix.columns)
is_national = loss_matrix.columns.str.startswith("nation/")
loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
Expand Down Expand Up @@ -137,7 +141,11 @@ def dropout_weights(weights, p):
)

# New (Sparse) path depending on temperature, init_mean, l0_lambda -----
# make a calibration_log_sparse.csv path
logging.info(
f"Sparse optimization using seed {seed}, temp {temperature} "
+ f"init_mean {init_mean}, l0_lambda {l0_lambda}"
)
set_seeds(seed)
p = Path(log_path)
log_path_sparse = p.with_name(f"{p.stem}_sparse{p.suffix}")

Expand All @@ -156,8 +164,7 @@ def dropout_weights(weights, p):
performance = pd.DataFrame()
for i in iterator:
optimizer.zero_grad()
weights_ = dropout_weights(weights, dropout_rate)
masked = torch.exp(weights_) * gates()
masked = torch.exp(weights) * gates()
l_main = loss(masked)
l = l_main + l0_lambda * gates.get_penalty()
if (log_path_sparse is not None) and (i % 10 == 0):
Expand Down Expand Up @@ -309,6 +316,7 @@ def generate(self):
targets_array_clean,
log_path="calibration_log.csv",
epochs=200,
seed=1456,
)
data["household_weight"][year] = optimised_weights_dense
data["household_sparse_weight"][year] = optimised_weights_sparse
Expand Down
1 change: 1 addition & 0 deletions policyengine_us_data/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .loss import *
from .qrf import *
from .l0 import *
from .seed import *
21 changes: 21 additions & 0 deletions policyengine_us_data/utils/seed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import random
import numpy as np

try:
import torch
except ImportError:
torch = None


def set_seeds(seed: int) -> None:
    """Seed Python, NumPy and PyTorch for reproducible behavior.

    Always seeds the standard-library ``random`` module and NumPy's global
    generator. When PyTorch was importable (the module-level ``torch`` name
    is not ``None``), it also seeds torch's generator, seeds every CUDA
    device if CUDA is available, and asks torch to use deterministic
    algorithms in warn-only mode.

    Enabling deterministic algorithms is best-effort: if the call is not
    supported by the installed torch, the failure is logged as a warning
    instead of being raised, so seeding itself never aborts the caller.

    Args:
        seed: Seed value handed to every random number generator.
    """
    import logging

    random.seed(seed)
    np.random.seed(seed)

    # Nothing more to do when PyTorch could not be imported.
    if torch is None:
        return

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    try:
        torch.use_deterministic_algorithms(True, warn_only=True)
    except (AttributeError, TypeError, RuntimeError) as exc:
        # Presumably raised by torch builds that lack this function or its
        # ``warn_only`` keyword -- confirm against the supported versions.
        # Keep going, but leave a trace so lost determinism is diagnosable.
        logging.getLogger(__name__).warning(
            "Could not enable deterministic torch algorithms: %s", exc
        )
Loading