Skip to content

Commit 1dfbf41

Browse files
authored
Changing to auto loan interest imputation (#255)
* changing to auto loan interest imputation * move 100 * auto loan variables in scf.py * division by 100 for loan rate comments * changing test to use upper and lower bounds * around 270 * testing logging level and auto loan test * trying with 100_000 * fixed auto loan interest values
1 parent 744f08b commit 1dfbf41

File tree

7 files changed

+103
-55
lines changed

7 files changed

+103
-55
lines changed

.github/workflows/code_changes.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@ jobs:
4949
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
5050
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
5151
- name: Build datasets
52-
run: make data
52+
run: make data
53+
env:
54+
PYTHON_LOG_LEVEL: INFO
5355
- name: Run tests
5456
run: pytest
5557
- name: Upload data

.github/workflows/pr_code_changes.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ jobs:
4343
run: make data
4444
env:
4545
TEST_LITE: true
46+
PYTHON_LOG_LEVEL: INFO
4647
- name: Run tests
4748
run: pytest
4849
- name: Test documentation builds

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- bump: patch
2+
changes:
3+
changed:
4+
- Methodology to directly impute auto loan interest instead of assuming a 2% interest rate on auto loan balance.

policyengine_us_data/datasets/cps/cps.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def generate(self):
6868
logging.info("Adding rent")
6969
add_rent(self, cps, person, household)
7070
logging.info("Adding auto loan balance")
71-
add_auto_loan_balance(self, cps)
71+
add_auto_loan_interest(self, cps)
7272
logging.info("Adding tips")
7373
add_tips(self, cps)
7474
logging.info("Added all variables")
@@ -184,8 +184,8 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
184184
cps["real_estate_taxes"][mask] = imputed_values["real_estate_taxes"]
185185

186186

187-
def add_auto_loan_balance(self, cps: h5py.File) -> None:
188-
""" "Add auto loan balance variable."""
187+
def add_auto_loan_interest(self, cps: h5py.File) -> None:
188+
""" "Add auto loan interest variable."""
189189
self.save_dataset(cps)
190190
cps_data = self.load_dataset()
191191

@@ -298,21 +298,29 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None:
298298
"self_employment_income",
299299
"farm_income",
300300
]
301-
IMPUTED_VARIABLES = ["auto_loan_balance"]
301+
IMPUTED_VARIABLES = ["auto_loan_interest"]
302302
weights = ["household_weight"]
303303

304304
donor_data = scf_data[PREDICTORS + IMPUTED_VARIABLES + weights].copy()
305305

306306
donor_data = donor_data.loc[
307307
np.random.choice(
308308
donor_data.index,
309-
size=100_000 if not test_lite else 1_000,
309+
size=100_000 if not test_lite else 100_000,
310310
replace=True,
311311
p=donor_data.household_weight / donor_data.household_weight.sum(),
312312
)
313313
]
314314

315315
from microimpute.models.qrf import QRF
316+
import logging
317+
import os
318+
319+
# Read the desired log level from the environment (defaults to WARNING)
320+
log_level = os.getenv("PYTHON_LOG_LEVEL", "WARNING")
321+
322+
# Specifically target the microimpute logger
323+
logging.getLogger("microimpute").setLevel(getattr(logging, log_level))
316324

317325
qrf_model = QRF()
318326
fitted_model = qrf_model.fit(
@@ -327,10 +335,6 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None:
327335
for var in IMPUTED_VARIABLES:
328336
cps[var] = imputations[0.5][var]
329337

330-
cps["auto_loan_interest"] = (
331-
cps["auto_loan_balance"] * scf_data["auto_loan_interest"].mean() / 100
332-
) * 12
333-
334338
self.save_dataset(cps)
335339

336340

policyengine_us_data/datasets/scf/fed_scf.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from policyengine_us_data.storage import STORAGE_FOLDER
1010

1111

12-
class FedSCF(Dataset):
12+
class SummarizedFedSCF(Dataset):
1313
"""Dataset containing Survey of Consumer Finances data from the Federal Reserve."""
1414

1515
time_period: int
@@ -84,26 +84,26 @@ def generate(self):
8484

8585
@property
8686
def _scf_download_url(self) -> str:
87-
return SCF_URL_BY_YEAR.get(self.time_period)
87+
return SummarizedSCF_URL_BY_YEAR.get(self.time_period)
8888

8989

90-
class FedSCF_2022(FedSCF):
90+
class SummarizedFedSCF_2022(SummarizedFedSCF):
9191
time_period = 2022
9292
label = "Federal Reserve SCF (2022)"
9393
name = "fed_scf_2022"
9494
file_path = STORAGE_FOLDER / "fed_scf_2022.h5"
9595
data_format = Dataset.TABLES
9696

9797

98-
class FedSCF_2019(FedSCF):
98+
class SummarizedFedSCF_2019(SummarizedFedSCF):
9999
time_period = 2019
100100
label = "Federal Reserve SCF (2019)"
101101
name = "fed_scf_2019"
102102
file_path = STORAGE_FOLDER / "fed_scf_2019.h5"
103103
data_format = Dataset.TABLES
104104

105105

106-
class FedSCF_2016(FedSCF):
106+
class SummarizedFedSCF_2016(SummarizedFedSCF):
107107
time_period = 2016
108108
label = "Federal Reserve SCF (2016)"
109109
name = "fed_scf_2016"
@@ -112,7 +112,7 @@ class FedSCF_2016(FedSCF):
112112

113113

114114
# URLs for the SCF data by year
115-
SCF_URL_BY_YEAR = {
115+
SummarizedSCF_URL_BY_YEAR = {
116116
2016: "https://www.federalreserve.gov/econres/files/scfp2016s.zip",
117117
2019: "https://www.federalreserve.gov/econres/files/scfp2019s.zip",
118118
2022: "https://www.federalreserve.gov/econres/files/scfp2022s.zip",

policyengine_us_data/datasets/scf/scf.py

Lines changed: 67 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from policyengine_core.data import Dataset
22
from policyengine_us_data.storage import STORAGE_FOLDER
33
from policyengine_us_data.datasets.scf.fed_scf import (
4-
FedSCF,
5-
FedSCF_2016,
6-
FedSCF_2019,
7-
FedSCF_2022,
4+
SummarizedFedSCF,
5+
SummarizedFedSCF_2016,
6+
SummarizedFedSCF_2019,
7+
SummarizedFedSCF_2022,
88
)
99
import pandas as pd
1010
import numpy as np
@@ -18,7 +18,7 @@ class SCF(Dataset):
1818

1919
name = "scf"
2020
label = "SCF"
21-
raw_scf: Type[FedSCF] = None
21+
raw_scf: Type[SummarizedFedSCF] = None
2222
time_period: int = None
2323
data_format = Dataset.ARRAYS
2424
frac: float | None = 1
@@ -217,7 +217,9 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None:
217217

218218
# Vehicle loan (auto loan)
219219
if "veh_inst" in raw_data.columns:
220-
scf["auto_loan_balance"] = raw_data["veh_inst"].fillna(0).values
220+
scf["total_vehicle_installments"] = (
221+
raw_data["veh_inst"].fillna(0).values
222+
)
221223

222224
# Household weights
223225
if "wgt" in raw_data.columns:
@@ -248,7 +250,7 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None:
248250

249251

250252
def add_auto_loan_interest(scf: dict, year: int) -> None:
251-
"""Adds auto loan interest to the summarized SCF dataset from the full SCF."""
253+
"""Adds auto loan balance and interest to the summarized SCF dataset from the full SCF."""
252254
import requests
253255
import zipfile
254256
import io
@@ -260,7 +262,17 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
260262
url = f"https://www.federalreserve.gov/econres/files/scf{year}s.zip"
261263

262264
# Define columns of interest
263-
columns = ["yy1", "y1", "x2219", "x2319", "x2419", "x7170"]
265+
IDENTIFYER_COLUMNS = ["yy1", "y1"]
266+
AUTO_LOAN_COLUMNS = [
267+
"x2209", # loan amount on car 1
268+
"x2309", # loan amount on car 2
269+
"x2409", # loan amount on car 3
270+
"x7158", # loan amount on car 4
271+
"x2219", # loan interest rate on car 1
272+
"x2319", # loan interest rate on car 2
273+
"x2419", # loan interest rate on car 3
274+
"x7170", # loan interest rate on car 4
275+
]
264276

265277
try:
266278
# Download zip file
@@ -295,7 +307,10 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
295307
try:
296308
logger.info(f"Reading Stata file: {dta_files[0]}")
297309
with z.open(dta_files[0]) as f:
298-
df = pd.read_stata(io.BytesIO(f.read()), columns=columns)
310+
df = pd.read_stata(
311+
io.BytesIO(f.read()),
312+
columns=(IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS),
313+
)
299314
logger.info(f"Read DataFrame with shape {df.shape}")
300315
except Exception as e:
301316
logger.error(
@@ -312,31 +327,41 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
312327
) from e
313328

314329
# Process the interest data and add to final SCF dictionary
315-
auto_int = df[columns].copy()
316-
auto_int["x2219"] = auto_int["x2219"].replace(-1, 0)
317-
auto_int["x2319"] = auto_int["x2319"].replace(-1, 0)
318-
auto_int["x2419"] = auto_int["x2419"].replace(-1, 0)
319-
auto_int["x7170"] = auto_int["x7170"].replace(-1, 0)
320-
# Calculate total auto loan interest (sum of all auto loan interest variables)
321-
auto_int["auto_loan_interest"] = auto_int[
322-
["x2219", "x2319", "x2419", "x7170"]
330+
auto_df = df[IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS].copy()
331+
auto_df[AUTO_LOAN_COLUMNS].replace(-1, 0, inplace=True)
332+
333+
# Interest rate columns are stored as percent * 100 (e.g. 550 = 5.50%); dividing by 10,000 converts them to decimal fractions, so balance * rate below yields interest in dollars
334+
RATE_COLUMNS = ["x2219", "x2319", "x2419", "x7170"]
335+
auto_df[RATE_COLUMNS] /= 10_000
336+
337+
# Calculate total auto loan balance (sum of all auto loan balance variables)
338+
auto_df["auto_loan_balance"] = auto_df[
339+
["x2209", "x2309", "x2409", "x7158"]
323340
].sum(axis=1)
324341

342+
# Calculate total auto loan interest (sum of the amounts of each balance variable multiplied by its respective interest rate variable)
343+
auto_df["auto_loan_interest"] = (
344+
auto_df["x2209"] * auto_df["x2219"]
345+
+ auto_df["x2309"] * auto_df["x2319"]
346+
+ auto_df["x2409"] * auto_df["x2419"]
347+
+ auto_df["x7158"] * auto_df["x7170"]
348+
)
349+
325350
# Check if we have household identifiers (y1, yy1) in both datasets
326351
if (
327352
"y1" in scf
328353
and "yy1" in scf
329-
and "y1" in auto_int.columns
330-
and "yy1" in auto_int.columns
354+
and "y1" in auto_df.columns
355+
and "yy1" in auto_df.columns
331356
):
332357
logger.info(
333358
"Using household identifiers (y1, yy1) to ensure correct matching"
334359
)
335360

336361
# Create unique identifier from y1 and yy1 for each dataset
337362
# In the original data
338-
auto_int["household_id"] = (
339-
auto_int["y1"].astype(str) + "_" + auto_int["yy1"].astype(str)
363+
auto_df["household_id"] = (
364+
auto_df["y1"].astype(str) + "_" + auto_df["yy1"].astype(str)
340365
)
341366

342367
# In the SCF dictionary
@@ -346,35 +371,42 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
346371
temp_scf["y1"].astype(str) + "_" + temp_scf["yy1"].astype(str)
347372
)
348373

349-
# Create a mapping from household ID to auto loan interest
374+
# Create a mapping from household ID to auto loan balance and interest
350375
id_to_interest = dict(
351376
zip(
352-
auto_int["household_id"].values,
353-
auto_int["auto_loan_interest"].values,
377+
auto_df["household_id"].values,
378+
auto_df["auto_loan_interest"].values,
379+
)
380+
)
381+
id_to_balance = dict(
382+
zip(
383+
auto_df["household_id"].values,
384+
auto_df["auto_loan_balance"].values,
354385
)
355386
)
356387

357388
# Create arrays for auto loan interest and balance that match SCF order
358389
interest_values = np.zeros(len(temp_scf), dtype=float)
390+
balance_values = np.zeros(len(temp_scf), dtype=float)
359391

360392
# Fill in interest and balance values based on household ID
361393
for i, household_id in enumerate(temp_scf["household_id"]):
362394
if household_id in id_to_interest:
363395
interest_values[i] = id_to_interest[household_id]
396+
for i, household_id in enumerate(temp_scf["household_id"]):
397+
if household_id in id_to_balance:
398+
balance_values[i] = id_to_balance[household_id]
364399

365400
# Add to SCF dictionary
366-
scf["auto_loan_interest"] = interest_values / 100
401+
scf["auto_loan_interest"] = interest_values
402+
scf["auto_loan_balance"] = balance_values
403+
367404
logger.info(
368405
f"Added auto loan interest data for year {year} with household matching"
369406
)
370407
else:
371-
# Fallback to simple assignment if identifiers aren't present
372-
logger.warning(
373-
"Household identifiers not found. Using direct array assignment (may not match households correctly)"
374-
)
375-
scf["auto_loan_interest"] = auto_int["auto_loan_interest"].values
376-
logger.info(
377-
f"Added auto loan interest data for year {year} without household matching"
408+
raise ValueError(
409+
"Household identifiers (y1, yy1) not found in both datasets."
378410
)
379411

380412
except Exception as e:
@@ -387,7 +419,7 @@ class SCF_2022(SCF):
387419

388420
name = "scf_2022"
389421
label = "SCF 2022"
390-
raw_scf = FedSCF_2022
422+
raw_scf = SummarizedFedSCF_2022
391423
file_path = STORAGE_FOLDER / "scf_2022.h5"
392424
time_period = 2022
393425
frac = 1
@@ -398,7 +430,7 @@ class SCF_2019(SCF):
398430

399431
name = "scf_2019"
400432
label = "SCF 2019"
401-
raw_scf = FedSCF_2019
433+
raw_scf = SummarizedFedSCF_2019
402434
file_path = STORAGE_FOLDER / "scf_2019.h5"
403435
time_period = 2019
404436
frac = 1
@@ -409,7 +441,7 @@ class SCF_2016(SCF):
409441

410442
name = "scf_2016"
411443
label = "SCF 2016"
412-
raw_scf = FedSCF_2016
444+
raw_scf = SummarizedFedSCF_2016
413445
file_path = STORAGE_FOLDER / "scf_2016.h5"
414446
time_period = 2016
415447
frac = 1

policyengine_us_data/tests/test_datasets/test_cps.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,16 @@ def test_cps_has_auto_loan_interest():
3434
from policyengine_us import Microsimulation
3535

3636
sim = Microsimulation(dataset=CPS_2024)
37-
# Ensure we impute at least $65 billion in auto loan interest.
38-
# We currently target $256 billion.
39-
AUTO_LOAN_INTEREST_MINIMUM = 65e9
37+
# Ensure we impute around $85 billion in auto loan interest, within a 20% relative tolerance.
38+
AUTO_LOAN_INTEREST_TARGET = 85e9
39+
RELATIVE_TOLERANCE = 0.2
4040
assert (
41-
sim.calculate("auto_loan_interest").sum() > AUTO_LOAN_INTEREST_MINIMUM
41+
abs(
42+
sim.calculate("auto_loan_interest").sum()
43+
/ AUTO_LOAN_INTEREST_TARGET
44+
- 1
45+
)
46+
< RELATIVE_TOLERANCE
4247
)
4348

4449

0 commit comments

Comments
 (0)