Changing to auto loan interest imputation (#255)

juaristi22 · web-flow · commit 1dfbf41be699 · 2025-05-22T11:56:14.000+01:00
* changing to auto loan interest imputation

* move 100

* auto loan variables in scf.py

* division by 100 for loan rate comments

* changing test to use upper and lower bounds

* around 270

* testing logging level and auto loan test

* trying with 100_000

* fixed auto loan interest values
diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml
@@ -49,7 +49,9 @@ jobs:
               HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
               POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
           - name: Build datasets
-            run: make data       
+            run: make data   
+            env:
+              PYTHON_LOG_LEVEL: INFO    
           - name: Run tests
             run: pytest
           - name: Upload data
diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml
@@ -43,6 +43,7 @@ jobs:
             run: make data
             env:
               TEST_LITE: true
+              PYTHON_LOG_LEVEL: INFO  
           - name: Run tests
             run: pytest
           - name: Test documentation builds
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: patch
+  changes:
+    changed:
+    - Methodology to directly impute auto loan interest instead of assuming a 2% interest rate on auto loan balance.
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -68,7 +68,7 @@ def generate(self):
         logging.info("Adding rent")
         add_rent(self, cps, person, household)
         logging.info("Adding auto loan balance")
-        add_auto_loan_balance(self, cps)
+        add_auto_loan_interest(self, cps)
         logging.info("Adding tips")
         add_tips(self, cps)
         logging.info("Added all variables")
@@ -184,8 +184,8 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
     cps["real_estate_taxes"][mask] = imputed_values["real_estate_taxes"]
 
 
-def add_auto_loan_balance(self, cps: h5py.File) -> None:
-    """ "Add auto loan balance variable."""
+def add_auto_loan_interest(self, cps: h5py.File) -> None:
+    """ "Add auto loan interest variable."""
     self.save_dataset(cps)
     cps_data = self.load_dataset()
 
@@ -298,21 +298,29 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None:
         "self_employment_income",
         "farm_income",
     ]
-    IMPUTED_VARIABLES = ["auto_loan_balance"]
+    IMPUTED_VARIABLES = ["auto_loan_interest"]
     weights = ["household_weight"]
 
     donor_data = scf_data[PREDICTORS + IMPUTED_VARIABLES + weights].copy()
 
     donor_data = donor_data.loc[
         np.random.choice(
             donor_data.index,
-            size=100_000 if not test_lite else 1_000,
+            size=100_000 if not test_lite else 100_000,
             replace=True,
             p=donor_data.household_weight / donor_data.household_weight.sum(),
         )
     ]
 
     from microimpute.models.qrf import QRF
+    import logging
+    import os
+
+    # Set root logger level
+    log_level = os.getenv("PYTHON_LOG_LEVEL", "WARNING")
+
+    # Specifically target the microimpute logger
+    logging.getLogger("microimpute").setLevel(getattr(logging, log_level))
 
     qrf_model = QRF()
     fitted_model = qrf_model.fit(
@@ -327,10 +335,6 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None:
     for var in IMPUTED_VARIABLES:
         cps[var] = imputations[0.5][var]
 
-    cps["auto_loan_interest"] = (
-        cps["auto_loan_balance"] * scf_data["auto_loan_interest"].mean() / 100
-    ) * 12
-
     self.save_dataset(cps)
 
 
diff --git a/policyengine_us_data/datasets/scf/fed_scf.py b/policyengine_us_data/datasets/scf/fed_scf.py
@@ -9,7 +9,7 @@
 from policyengine_us_data.storage import STORAGE_FOLDER
 
 
-class FedSCF(Dataset):
+class SummarizedFedSCF(Dataset):
     """Dataset containing Survey of Consumer Finances data from the Federal Reserve."""
 
     time_period: int
@@ -84,26 +84,26 @@ def generate(self):
 
     @property
     def _scf_download_url(self) -> str:
-        return SCF_URL_BY_YEAR.get(self.time_period)
+        return SummarizedSCF_URL_BY_YEAR.get(self.time_period)
 
 
-class FedSCF_2022(FedSCF):
+class SummarizedFedSCF_2022(SummarizedFedSCF):
     time_period = 2022
     label = "Federal Reserve SCF (2022)"
     name = "fed_scf_2022"
     file_path = STORAGE_FOLDER / "fed_scf_2022.h5"
     data_format = Dataset.TABLES
 
 
-class FedSCF_2019(FedSCF):
+class SummarizedFedSCF_2019(SummarizedFedSCF):
     time_period = 2019
     label = "Federal Reserve SCF (2019)"
     name = "fed_scf_2019"
     file_path = STORAGE_FOLDER / "fed_scf_2019.h5"
     data_format = Dataset.TABLES
 
 
-class FedSCF_2016(FedSCF):
+class SummarizedFedSCF_2016(SummarizedFedSCF):
     time_period = 2016
     label = "Federal Reserve SCF (2016)"
     name = "fed_scf_2016"
@@ -112,7 +112,7 @@ class FedSCF_2016(FedSCF):
 
 
 # URLs for the SCF data by year
-SCF_URL_BY_YEAR = {
+SummarizedSCF_URL_BY_YEAR = {
     2016: "https://www.federalreserve.gov/econres/files/scfp2016s.zip",
     2019: "https://www.federalreserve.gov/econres/files/scfp2019s.zip",
     2022: "https://www.federalreserve.gov/econres/files/scfp2022s.zip",
diff --git a/policyengine_us_data/datasets/scf/scf.py b/policyengine_us_data/datasets/scf/scf.py
@@ -1,10 +1,10 @@
 from policyengine_core.data import Dataset
 from policyengine_us_data.storage import STORAGE_FOLDER
 from policyengine_us_data.datasets.scf.fed_scf import (
-    FedSCF,
-    FedSCF_2016,
-    FedSCF_2019,
-    FedSCF_2022,
+    SummarizedFedSCF,
+    SummarizedFedSCF_2016,
+    SummarizedFedSCF_2019,
+    SummarizedFedSCF_2022,
 )
 import pandas as pd
 import numpy as np
@@ -18,7 +18,7 @@ class SCF(Dataset):
 
     name = "scf"
     label = "SCF"
-    raw_scf: Type[FedSCF] = None
+    raw_scf: Type[SummarizedFedSCF] = None
     time_period: int = None
     data_format = Dataset.ARRAYS
     frac: float | None = 1
@@ -217,7 +217,9 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None:
 
     # Vehicle loan (auto loan)
     if "veh_inst" in raw_data.columns:
-        scf["auto_loan_balance"] = raw_data["veh_inst"].fillna(0).values
+        scf["total_vehicle_installments"] = (
+            raw_data["veh_inst"].fillna(0).values
+        )
 
     # Household weights
     if "wgt" in raw_data.columns:
@@ -248,7 +250,7 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None:
 
 
 def add_auto_loan_interest(scf: dict, year: int) -> None:
-    """Adds auto loan interest to the summarized SCF dataset from the full SCF."""
+    """Adds auto loan balance and interest to the summarized SCF dataset from the full SCF."""
     import requests
     import zipfile
     import io
@@ -260,7 +262,17 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
     url = f"https://www.federalreserve.gov/econres/files/scf{year}s.zip"
 
     # Define columns of interest
-    columns = ["yy1", "y1", "x2219", "x2319", "x2419", "x7170"]
+    IDENTIFYER_COLUMNS = ["yy1", "y1"]
+    AUTO_LOAN_COLUMNS = [
+        "x2209",  # loan amount on car 1
+        "x2309",  # loan amount on car 2
+        "x2409",  # loan amount on car 3
+        "x7158",  # loan amount on car 4
+        "x2219",  # loan interest rate on car 1
+        "x2319",  # loan interest rate on car 2
+        "x2419",  # loan interest rate on car 3
+        "x7170",  # loan interest rate on car 4
+    ]
 
     try:
         # Download zip file
@@ -295,7 +307,10 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
             try:
                 logger.info(f"Reading Stata file: {dta_files[0]}")
                 with z.open(dta_files[0]) as f:
-                    df = pd.read_stata(io.BytesIO(f.read()), columns=columns)
+                    df = pd.read_stata(
+                        io.BytesIO(f.read()),
+                        columns=(IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS),
+                    )
                     logger.info(f"Read DataFrame with shape {df.shape}")
             except Exception as e:
                 logger.error(
@@ -312,31 +327,41 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
             ) from e
 
         # Process the interest data and add to final SCF dictionary
-        auto_int = df[columns].copy()
-        auto_int["x2219"] = auto_int["x2219"].replace(-1, 0)
-        auto_int["x2319"] = auto_int["x2319"].replace(-1, 0)
-        auto_int["x2419"] = auto_int["x2419"].replace(-1, 0)
-        auto_int["x7170"] = auto_int["x7170"].replace(-1, 0)
-        # Calculate total auto loan interest (sum of all auto loan interest variables)
-        auto_int["auto_loan_interest"] = auto_int[
-            ["x2219", "x2319", "x2419", "x7170"]
+        auto_df = df[IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS].copy()
+        auto_df[AUTO_LOAN_COLUMNS].replace(-1, 0, inplace=True)
+
+        # Interest rate columns are in percent * 10,000 format, we need to divide by 10,000 to leave them in percentage format
+        RATE_COLUMNS = ["x2219", "x2319", "x2419", "x7170"]
+        auto_df[RATE_COLUMNS] /= 10_000
+
+        # Calculate total auto loan balance (sum of all auto loan balance variables)
+        auto_df["auto_loan_balance"] = auto_df[
+            ["x2209", "x2309", "x2409", "x7158"]
         ].sum(axis=1)
 
+        # Calculate total auto loan interest (sum of the amounts of each balance variable multiplied by its respective interest rate variable)
+        auto_df["auto_loan_interest"] = (
+            auto_df["x2209"] * auto_df["x2219"]
+            + auto_df["x2309"] * auto_df["x2319"]
+            + auto_df["x2409"] * auto_df["x2419"]
+            + auto_df["x7158"] * auto_df["x7170"]
+        )
+
         # Check if we have household identifiers (y1, yy1) in both datasets
         if (
             "y1" in scf
             and "yy1" in scf
-            and "y1" in auto_int.columns
-            and "yy1" in auto_int.columns
+            and "y1" in auto_df.columns
+            and "yy1" in auto_df.columns
         ):
             logger.info(
                 "Using household identifiers (y1, yy1) to ensure correct matching"
             )
 
             # Create unique identifier from y1 and yy1 for each dataset
             # In the original data
-            auto_int["household_id"] = (
-                auto_int["y1"].astype(str) + "_" + auto_int["yy1"].astype(str)
+            auto_df["household_id"] = (
+                auto_df["y1"].astype(str) + "_" + auto_df["yy1"].astype(str)
             )
 
             # In the SCF dictionary
@@ -346,35 +371,42 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
                 temp_scf["y1"].astype(str) + "_" + temp_scf["yy1"].astype(str)
             )
 
-            # Create a mapping from household ID to auto loan interest
+            # Create a mapping from household ID to auto loan balance and interest
             id_to_interest = dict(
                 zip(
-                    auto_int["household_id"].values,
-                    auto_int["auto_loan_interest"].values,
+                    auto_df["household_id"].values,
+                    auto_df["auto_loan_interest"].values,
+                )
+            )
+            id_to_balance = dict(
+                zip(
+                    auto_df["household_id"].values,
+                    auto_df["auto_loan_balance"].values,
                 )
             )
 
             # Create array for auto loan interest that matches SCF order
             interest_values = np.zeros(len(temp_scf), dtype=float)
+            balance_values = np.zeros(len(temp_scf), dtype=float)
 
             # Fill in interest values based on household ID
             for i, household_id in enumerate(temp_scf["household_id"]):
                 if household_id in id_to_interest:
                     interest_values[i] = id_to_interest[household_id]
+            for i, household_id in enumerate(temp_scf["household_id"]):
+                if household_id in id_to_balance:
+                    balance_values[i] = id_to_balance[household_id]
 
             # Add to SCF dictionary
-            scf["auto_loan_interest"] = interest_values / 100
+            scf["auto_loan_interest"] = interest_values
+            scf["auto_loan_balance"] = balance_values
+
             logger.info(
                 f"Added auto loan interest data for year {year} with household matching"
             )
         else:
-            # Fallback to simple assignment if identifiers aren't present
-            logger.warning(
-                "Household identifiers not found. Using direct array assignment (may not match households correctly)"
-            )
-            scf["auto_loan_interest"] = auto_int["auto_loan_interest"].values
-            logger.info(
-                f"Added auto loan interest data for year {year} without household matching"
+            raise ValueError(
+                "Household identifiers (y1, yy1) not found in both datasets."
             )
 
     except Exception as e:
@@ -387,7 +419,7 @@ class SCF_2022(SCF):
 
     name = "scf_2022"
     label = "SCF 2022"
-    raw_scf = FedSCF_2022
+    raw_scf = SummarizedFedSCF_2022
     file_path = STORAGE_FOLDER / "scf_2022.h5"
     time_period = 2022
     frac = 1
@@ -398,7 +430,7 @@ class SCF_2019(SCF):
 
     name = "scf_2019"
     label = "SCF 2019"
-    raw_scf = FedSCF_2019
+    raw_scf = SummarizedFedSCF_2019
     file_path = STORAGE_FOLDER / "scf_2019.h5"
     time_period = 2019
     frac = 1
@@ -409,7 +441,7 @@ class SCF_2016(SCF):
 
     name = "scf_2016"
     label = "SCF 2016"
-    raw_scf = FedSCF_2016
+    raw_scf = SummarizedFedSCF_2016
     file_path = STORAGE_FOLDER / "scf_2016.h5"
     time_period = 2016
     frac = 1
diff --git a/policyengine_us_data/tests/test_datasets/test_cps.py b/policyengine_us_data/tests/test_datasets/test_cps.py
@@ -34,11 +34,16 @@ def test_cps_has_auto_loan_interest():
     from policyengine_us import Microsimulation
 
     sim = Microsimulation(dataset=CPS_2024)
-    # Ensure we impute at least $65 billion in auto loan interest.
-    # We currently target $256 billion.
-    AUTO_LOAN_INTEREST_MINIMUM = 65e9
+    # Ensure we impute around $85 billion in overtime premium with 20% error bounds.
+    AUTO_LOAN_INTEREST_TARGET = 85e9
+    RELATIVE_TOLERANCE = 0.2
     assert (
-        sim.calculate("auto_loan_interest").sum() > AUTO_LOAN_INTEREST_MINIMUM
+        abs(
+            sim.calculate("auto_loan_interest").sum()
+            / AUTO_LOAN_INTEREST_TARGET
+            - 1
+        )
+        < RELATIVE_TOLERANCE
     )