Skip to content

Commit d6e4862

Browse files
authored
Merge pull request #359 from PolicyEngine/bogorek-lite
Remove problematic targets and make other minor enhancements
2 parents 65473ce + ab8fa4f commit d6e4862

File tree

6 files changed

+72
-25
lines changed

6 files changed

+72
-25
lines changed

.github/workflows/code_changes.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
name: Code changes
44
on:
5+
workflow_call:
56
push:
67
branches:
78
- main
@@ -27,7 +28,7 @@ jobs:
2728
contents: "read"
2829
# Required to auth against gcp
2930
id-token: "write"
30-
runs-on: larger-runner
31+
runs-on: ubuntu-latest
3132
steps:
3233
- name: Checkout repo
3334
uses: actions/checkout@v2

.github/workflows/manual_tests.yaml

Lines changed: 0 additions & 17 deletions
This file was deleted.

changelog_entry.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
- bump: patch
2+
changes:
3+
changed:
4+
- bad targets (causing problems with estimation) removed
5+
- lite mode now builds CPS_2023 in addition to CPS_2024
6+
- gave reweight an epochs argument and set it at 150 for optimization
7+
- updating minimum versions on policyengine-us, policyengine-core, and pandas dependencies
8+
- getting rid of non-working manual workflow code

policyengine_us_data/datasets/cps/cps.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,6 @@ def downsample(self, frac: float):
100100
original_dtypes = {
101101
key: original_data[key].dtype for key in original_data
102102
}
103-
104103
sim = Microsimulation(dataset=self)
105104
sim.subsample(frac=frac)
106105

@@ -2006,6 +2005,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS):
20062005

20072006
if __name__ == "__main__":
20082007
if test_lite:
2008+
CPS_2023().generate()
20092009
CPS_2024().generate()
20102010
else:
20112011
CPS_2021().generate()

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def reweight(
2828
targets_array,
2929
dropout_rate=0.05,
3030
log_path="calibration_log.csv",
31+
epochs=150,
3132
):
3233
target_names = np.array(loss_matrix.columns)
3334
is_national = loss_matrix.columns.str.startswith("nation/")
@@ -45,7 +46,7 @@ def reweight(
4546
np.log(original_weights), requires_grad=True, dtype=torch.float32
4647
)
4748

48-
# TODO: replace this with a call to the python reweight.py package.
49+
# TODO: replace this functionality with the microcalibrate package.
4950
def loss(weights):
5051
# Check for Nans in either the weights or the loss matrix
5152
if torch.isnan(weights).any():
@@ -78,7 +79,7 @@ def dropout_weights(weights, p):
7879

7980
start_loss = None
8081

81-
iterator = trange(500)
82+
iterator = trange(epochs)
8283
performance = pd.DataFrame()
8384
for i in iterator:
8485
optimizer.zero_grad()
@@ -178,18 +179,71 @@ def generate(self):
178179
original_weights = original_weights.values + np.random.normal(
179180
1, 0.1, len(original_weights)
180181
)
182+
183+
bad_targets = [
184+
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
185+
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
186+
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
187+
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
188+
"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
189+
"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
190+
"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
191+
"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
192+
"state/RI/adjusted_gross_income/amount/-inf_1",
193+
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
194+
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Head of Household",
195+
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
196+
"nation/irs/adjusted gross income/total/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
197+
"nation/irs/count/count/AGI in 10k-15k/taxable/Head of Household",
198+
"nation/irs/count/count/AGI in 15k-20k/taxable/Head of Household",
199+
"nation/irs/count/count/AGI in 10k-15k/taxable/Married Filing Jointly/Surviving Spouse",
200+
"nation/irs/count/count/AGI in 15k-20k/taxable/Married Filing Jointly/Surviving Spouse",
201+
"state/RI/adjusted_gross_income/amount/-inf_1",
202+
"nation/irs/exempt interest/count/AGI in -inf-inf/taxable/All",
203+
]
204+
205+
# Run the optimization procedure to get (close to) minimum loss weights
181206
for year in range(self.start_year, self.end_year + 1):
182207
loss_matrix, targets_array = build_loss_matrix(
183208
self.input_dataset, year
184209
)
210+
zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
211+
bad_mask = loss_matrix.columns.isin(bad_targets)
212+
keep_mask_bool = ~(zero_mask | bad_mask)
213+
keep_idx = np.where(keep_mask_bool)[0]
214+
loss_matrix_clean = loss_matrix.iloc[:, keep_idx]
215+
targets_array_clean = targets_array[keep_idx]
216+
assert loss_matrix_clean.shape[1] == targets_array_clean.size
217+
185218
optimised_weights = reweight(
186219
original_weights,
187-
loss_matrix,
188-
targets_array,
220+
loss_matrix_clean,
221+
targets_array_clean,
189222
log_path="calibration_log.csv",
223+
epochs=150,
190224
)
191225
data["household_weight"][year] = optimised_weights
192226

227+
print("\n\n---reweighting quick diagnostics----\n")
228+
estimate = optimised_weights @ loss_matrix_clean
229+
rel_error = (
230+
((estimate - targets_array_clean) + 1)
231+
/ (targets_array_clean + 1)
232+
) ** 2
233+
print(
234+
f"rel_error: min: {np.min(rel_error):.2f}, "
235+
f"max: {np.max(rel_error):.2f} "
236+
f"mean: {np.mean(rel_error):.2f}, "
237+
f"median: {np.median(rel_error):.2f}"
238+
)
239+
print("Relative error over 100% for:")
240+
for i in np.where(rel_error > 1)[0]:
241+
print(f"target_name: {loss_matrix_clean.columns[i]}")
242+
print(f"target_value: {targets_array_clean[i]}")
243+
print(f"estimate_value: {estimate[i]}")
244+
print(f"has rel_error: {rel_error[i]:.2f}\n")
245+
print("---End of reweighting quick diagnostics------")
246+
193247
self.save_dataset(data)
194248

195249

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ authors = [
1717
license = {file = "LICENSE"}
1818
requires-python = ">=3.11, <3.13.0"
1919
dependencies = [
20-
"policyengine-us>=1.197.0",
21-
"policyengine-core>=3.14.1",
20+
"policyengine-us>=1.340.1",
21+
"policyengine-core>=3.17.1",
22+
"pandas>=2.3.0",
2223
"requests",
2324
"tqdm",
2425
"microdf_python>=0.4.3",

0 commit comments

Comments
 (0)