Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/code_changes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ jobs:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
- name: Build datasets
run: make data
run: make data
env:
PYTHON_LOG_LEVEL: INFO
- name: Run tests
run: pytest
- name: Upload data
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/pr_code_changes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ jobs:
run: make data
env:
TEST_LITE: true
PYTHON_LOG_LEVEL: INFO
- name: Run tests
run: pytest
- name: Test documentation builds
Expand Down
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: patch
changes:
changed:
- Methodology to directly impute auto loan interest instead of assuming a 2% interest rate on auto loan balance.
22 changes: 13 additions & 9 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def generate(self):
logging.info("Adding rent")
add_rent(self, cps, person, household)
logging.info("Adding auto loan balance")
add_auto_loan_balance(self, cps)
add_auto_loan_interest(self, cps)
logging.info("Adding tips")
add_tips(self, cps)
logging.info("Added all variables")
Expand Down Expand Up @@ -184,8 +184,8 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
cps["real_estate_taxes"][mask] = imputed_values["real_estate_taxes"]


def add_auto_loan_balance(self, cps: h5py.File) -> None:
""" "Add auto loan balance variable."""
def add_auto_loan_interest(self, cps: h5py.File) -> None:
""" "Add auto loan interest variable."""
self.save_dataset(cps)
cps_data = self.load_dataset()

Expand Down Expand Up @@ -298,21 +298,29 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None:
"self_employment_income",
"farm_income",
]
IMPUTED_VARIABLES = ["auto_loan_balance"]
IMPUTED_VARIABLES = ["auto_loan_interest"]
weights = ["household_weight"]

donor_data = scf_data[PREDICTORS + IMPUTED_VARIABLES + weights].copy()

donor_data = donor_data.loc[
np.random.choice(
donor_data.index,
size=100_000 if not test_lite else 1_000,
size=100_000 if not test_lite else 100_000,
replace=True,
p=donor_data.household_weight / donor_data.household_weight.sum(),
)
]

from microimpute.models.qrf import QRF
import logging
import os

# Read the desired log level from the environment (defaults to WARNING)
log_level = os.getenv("PYTHON_LOG_LEVEL", "WARNING")

# Specifically target the microimpute logger
logging.getLogger("microimpute").setLevel(getattr(logging, log_level))

qrf_model = QRF()
fitted_model = qrf_model.fit(
Expand All @@ -327,10 +335,6 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None:
for var in IMPUTED_VARIABLES:
cps[var] = imputations[0.5][var]

cps["auto_loan_interest"] = (
cps["auto_loan_balance"] * scf_data["auto_loan_interest"].mean() / 100
) * 12

self.save_dataset(cps)


Expand Down
12 changes: 6 additions & 6 deletions policyengine_us_data/datasets/scf/fed_scf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from policyengine_us_data.storage import STORAGE_FOLDER


class FedSCF(Dataset):
class SummarizedFedSCF(Dataset):
"""Dataset containing Survey of Consumer Finances data from the Federal Reserve."""

time_period: int
Expand Down Expand Up @@ -84,26 +84,26 @@ def generate(self):

@property
def _scf_download_url(self) -> str:
return SCF_URL_BY_YEAR.get(self.time_period)
return SummarizedSCF_URL_BY_YEAR.get(self.time_period)


class FedSCF_2022(FedSCF):
class SummarizedFedSCF_2022(SummarizedFedSCF):
time_period = 2022
label = "Federal Reserve SCF (2022)"
name = "fed_scf_2022"
file_path = STORAGE_FOLDER / "fed_scf_2022.h5"
data_format = Dataset.TABLES


class FedSCF_2019(FedSCF):
class SummarizedFedSCF_2019(SummarizedFedSCF):
time_period = 2019
label = "Federal Reserve SCF (2019)"
name = "fed_scf_2019"
file_path = STORAGE_FOLDER / "fed_scf_2019.h5"
data_format = Dataset.TABLES


class FedSCF_2016(FedSCF):
class SummarizedFedSCF_2016(SummarizedFedSCF):
time_period = 2016
label = "Federal Reserve SCF (2016)"
name = "fed_scf_2016"
Expand All @@ -112,7 +112,7 @@ class FedSCF_2016(FedSCF):


# URLs for the SCF data by year
SCF_URL_BY_YEAR = {
SummarizedSCF_URL_BY_YEAR = {
2016: "https://www.federalreserve.gov/econres/files/scfp2016s.zip",
2019: "https://www.federalreserve.gov/econres/files/scfp2019s.zip",
2022: "https://www.federalreserve.gov/econres/files/scfp2022s.zip",
Expand Down
102 changes: 67 additions & 35 deletions policyengine_us_data/datasets/scf/scf.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from policyengine_core.data import Dataset
from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us_data.datasets.scf.fed_scf import (
FedSCF,
FedSCF_2016,
FedSCF_2019,
FedSCF_2022,
SummarizedFedSCF,
SummarizedFedSCF_2016,
SummarizedFedSCF_2019,
SummarizedFedSCF_2022,
)
import pandas as pd
import numpy as np
Expand All @@ -18,7 +18,7 @@ class SCF(Dataset):

name = "scf"
label = "SCF"
raw_scf: Type[FedSCF] = None
raw_scf: Type[SummarizedFedSCF] = None
time_period: int = None
data_format = Dataset.ARRAYS
frac: float | None = 1
Expand Down Expand Up @@ -217,7 +217,9 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None:

# Vehicle loan (auto loan)
if "veh_inst" in raw_data.columns:
scf["auto_loan_balance"] = raw_data["veh_inst"].fillna(0).values
scf["total_vehicle_installments"] = (
raw_data["veh_inst"].fillna(0).values
)

# Household weights
if "wgt" in raw_data.columns:
Expand Down Expand Up @@ -248,7 +250,7 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None:


def add_auto_loan_interest(scf: dict, year: int) -> None:
"""Adds auto loan interest to the summarized SCF dataset from the full SCF."""
"""Adds auto loan balance and interest to the summarized SCF dataset from the full SCF."""
import requests
import zipfile
import io
Expand All @@ -260,7 +262,17 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
url = f"https://www.federalreserve.gov/econres/files/scf{year}s.zip"

# Define columns of interest
columns = ["yy1", "y1", "x2219", "x2319", "x2419", "x7170"]
IDENTIFYER_COLUMNS = ["yy1", "y1"]
AUTO_LOAN_COLUMNS = [
"x2209", # loan amount on car 1
"x2309", # loan amount on car 2
"x2409", # loan amount on car 3
"x7158", # loan amount on car 4
"x2219", # loan interest rate on car 1
"x2319", # loan interest rate on car 2
"x2419", # loan interest rate on car 3
"x7170", # loan interest rate on car 4
]

try:
# Download zip file
Expand Down Expand Up @@ -295,7 +307,10 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
try:
logger.info(f"Reading Stata file: {dta_files[0]}")
with z.open(dta_files[0]) as f:
df = pd.read_stata(io.BytesIO(f.read()), columns=columns)
df = pd.read_stata(
io.BytesIO(f.read()),
columns=(IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS),
)
logger.info(f"Read DataFrame with shape {df.shape}")
except Exception as e:
logger.error(
Expand All @@ -312,31 +327,41 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
) from e

# Process the interest data and add to final SCF dictionary
auto_int = df[columns].copy()
auto_int["x2219"] = auto_int["x2219"].replace(-1, 0)
auto_int["x2319"] = auto_int["x2319"].replace(-1, 0)
auto_int["x2419"] = auto_int["x2419"].replace(-1, 0)
auto_int["x7170"] = auto_int["x7170"].replace(-1, 0)
# Calculate total auto loan interest (sum of all auto loan interest variables)
auto_int["auto_loan_interest"] = auto_int[
["x2219", "x2319", "x2419", "x7170"]
auto_df = df[IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS].copy()
auto_df[AUTO_LOAN_COLUMNS].replace(-1, 0, inplace=True)

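# Interest rate columns are stored as scaled integers (presumably percent * 100 per the SCF codebook — TODO confirm).
# Dividing by 10,000 converts them to decimal fractions, so that balance * rate below yields interest in dollars.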
RATE_COLUMNS = ["x2219", "x2319", "x2419", "x7170"]
auto_df[RATE_COLUMNS] /= 10_000

# Calculate total auto loan balance (sum of all auto loan balance variables)
auto_df["auto_loan_balance"] = auto_df[
["x2209", "x2309", "x2409", "x7158"]
].sum(axis=1)

# Calculate total auto loan interest (sum of the amounts of each balance variable multiplied by its respective interest rate variable)
auto_df["auto_loan_interest"] = (
auto_df["x2209"] * auto_df["x2219"]
+ auto_df["x2309"] * auto_df["x2319"]
+ auto_df["x2409"] * auto_df["x2419"]
+ auto_df["x7158"] * auto_df["x7170"]
)

# Check if we have household identifiers (y1, yy1) in both datasets
if (
"y1" in scf
and "yy1" in scf
and "y1" in auto_int.columns
and "yy1" in auto_int.columns
and "y1" in auto_df.columns
and "yy1" in auto_df.columns
):
logger.info(
"Using household identifiers (y1, yy1) to ensure correct matching"
)

# Create unique identifier from y1 and yy1 for each dataset
# In the original data
auto_int["household_id"] = (
auto_int["y1"].astype(str) + "_" + auto_int["yy1"].astype(str)
auto_df["household_id"] = (
auto_df["y1"].astype(str) + "_" + auto_df["yy1"].astype(str)
)

# In the SCF dictionary
Expand All @@ -346,35 +371,42 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
temp_scf["y1"].astype(str) + "_" + temp_scf["yy1"].astype(str)
)

# Create a mapping from household ID to auto loan interest
# Create a mapping from household ID to auto loan balance and interest
id_to_interest = dict(
zip(
auto_int["household_id"].values,
auto_int["auto_loan_interest"].values,
auto_df["household_id"].values,
auto_df["auto_loan_interest"].values,
)
)
id_to_balance = dict(
zip(
auto_df["household_id"].values,
auto_df["auto_loan_balance"].values,
)
)

# Create arrays for auto loan interest and balance that match SCF order
interest_values = np.zeros(len(temp_scf), dtype=float)
balance_values = np.zeros(len(temp_scf), dtype=float)

# Fill in interest and balance values based on household ID
for i, household_id in enumerate(temp_scf["household_id"]):
if household_id in id_to_interest:
interest_values[i] = id_to_interest[household_id]
for i, household_id in enumerate(temp_scf["household_id"]):
if household_id in id_to_balance:
balance_values[i] = id_to_balance[household_id]

# Add to SCF dictionary
scf["auto_loan_interest"] = interest_values / 100
scf["auto_loan_interest"] = interest_values
scf["auto_loan_balance"] = balance_values

logger.info(
f"Added auto loan interest data for year {year} with household matching"
)
else:
# Fallback to simple assignment if identifiers aren't present
logger.warning(
"Household identifiers not found. Using direct array assignment (may not match households correctly)"
)
scf["auto_loan_interest"] = auto_int["auto_loan_interest"].values
logger.info(
f"Added auto loan interest data for year {year} without household matching"
raise ValueError(
"Household identifiers (y1, yy1) not found in both datasets."
)

except Exception as e:
Expand All @@ -387,7 +419,7 @@ class SCF_2022(SCF):

name = "scf_2022"
label = "SCF 2022"
raw_scf = FedSCF_2022
raw_scf = SummarizedFedSCF_2022
file_path = STORAGE_FOLDER / "scf_2022.h5"
time_period = 2022
frac = 1
Expand All @@ -398,7 +430,7 @@ class SCF_2019(SCF):

name = "scf_2019"
label = "SCF 2019"
raw_scf = FedSCF_2019
raw_scf = SummarizedFedSCF_2019
file_path = STORAGE_FOLDER / "scf_2019.h5"
time_period = 2019
frac = 1
Expand All @@ -409,7 +441,7 @@ class SCF_2016(SCF):

name = "scf_2016"
label = "SCF 2016"
raw_scf = FedSCF_2016
raw_scf = SummarizedFedSCF_2016
file_path = STORAGE_FOLDER / "scf_2016.h5"
time_period = 2016
frac = 1
Expand Down
13 changes: 9 additions & 4 deletions policyengine_us_data/tests/test_datasets/test_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,14 @@ def test_cps_has_auto_loan_interest():
from policyengine_us import Microsimulation

sim = Microsimulation(dataset=CPS_2024)
# Ensure we impute at least $65 billion in auto loan interest.
# We currently target $256 billion.
AUTO_LOAN_INTEREST_MINIMUM = 65e9
# Ensure we impute around $85 billion in auto loan interest with 20% error bounds.
AUTO_LOAN_INTEREST_TARGET = 85e9
RELATIVE_TOLERANCE = 0.2
assert (
sim.calculate("auto_loan_interest").sum() > AUTO_LOAN_INTEREST_MINIMUM
abs(
sim.calculate("auto_loan_interest").sum()
/ AUTO_LOAN_INTEREST_TARGET
- 1
)
< RELATIVE_TOLERANCE
)
Loading