diff --git a/.github/workflows/code_changes.yaml b/.github/workflows/code_changes.yaml index 7e6f8384..14d13f02 100644 --- a/.github/workflows/code_changes.yaml +++ b/.github/workflows/code_changes.yaml @@ -41,7 +41,9 @@ jobs: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }} - name: Build datasets - run: make data + run: make data + env: + PYTHON_LOG_LEVEL: INFO - name: Run tests run: pytest - name: Upload data diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 1e21595f..192d2f2c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -43,6 +43,7 @@ jobs: run: make data env: TEST_LITE: true + PYTHON_LOG_LEVEL: INFO - name: Run tests run: pytest - name: Test documentation builds diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..7d398ebc 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + changed: + - Methodology to directly impute auto loan interest instead of assuming a 2% interest rate on auto loan balance. \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 0ce17925..2fcbb64d 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -68,7 +68,7 @@ def generate(self): logging.info("Adding rent") add_rent(self, cps, person, household) logging.info("Adding auto loan balance") - add_auto_loan_balance(self, cps) + add_auto_loan_interest(self, cps) logging.info("Adding tips") add_tips(self, cps) logging.info("Added all variables") @@ -184,8 +184,8 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame): cps["real_estate_taxes"][mask] = imputed_values["real_estate_taxes"] -def add_auto_loan_balance(self, cps: h5py.File) -> None: - """ "Add auto loan balance variable.""" +def add_auto_loan_interest(self, cps: h5py.File) -> None: + """ "Add auto loan interest variable.""" self.save_dataset(cps) cps_data = self.load_dataset() @@ -298,7 +298,7 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None: "self_employment_income", "farm_income", ] - IMPUTED_VARIABLES = ["auto_loan_balance"] + IMPUTED_VARIABLES = ["auto_loan_interest"] weights = ["household_weight"] donor_data = scf_data[PREDICTORS + IMPUTED_VARIABLES + weights].copy() @@ -306,13 +306,21 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None: donor_data = donor_data.loc[ np.random.choice( donor_data.index, - size=100_000 if not test_lite else 1_000, + size=100_000 if not test_lite else 100_000, replace=True, p=donor_data.household_weight / donor_data.household_weight.sum(), ) ] from microimpute.models.qrf import QRF + import logging + import os + + # Set root logger level + log_level = os.getenv("PYTHON_LOG_LEVEL", "WARNING") + + # Specifically target the microimpute logger + logging.getLogger("microimpute").setLevel(getattr(logging, log_level)) qrf_model = QRF() fitted_model = qrf_model.fit( @@ -327,10 +335,6 @@ def add_auto_loan_balance(self, cps: h5py.File) -> None: for var in IMPUTED_VARIABLES: cps[var] = imputations[0.5][var] - cps["auto_loan_interest"] = ( - cps["auto_loan_balance"] * scf_data["auto_loan_interest"].mean() / 100 - ) * 12 - self.save_dataset(cps) diff --git a/policyengine_us_data/datasets/scf/fed_scf.py b/policyengine_us_data/datasets/scf/fed_scf.py index 003ce7d2..f67a2c07 100644 --- a/policyengine_us_data/datasets/scf/fed_scf.py +++ b/policyengine_us_data/datasets/scf/fed_scf.py @@ -9,7 +9,7 @@ from policyengine_us_data.storage import STORAGE_FOLDER -class FedSCF(Dataset): +class SummarizedFedSCF(Dataset): """Dataset containing Survey of Consumer Finances data from the Federal Reserve.""" time_period: int @@ -84,10 +84,10 @@ def generate(self): @property def _scf_download_url(self) -> str: - return SCF_URL_BY_YEAR.get(self.time_period) + return SummarizedSCF_URL_BY_YEAR.get(self.time_period) -class FedSCF_2022(FedSCF): +class SummarizedFedSCF_2022(SummarizedFedSCF): time_period = 2022 label = "Federal Reserve SCF (2022)" name = "fed_scf_2022" @@ -95,7 +95,7 @@ class FedSCF_2022(FedSCF): data_format = Dataset.TABLES -class FedSCF_2019(FedSCF): +class SummarizedFedSCF_2019(SummarizedFedSCF): time_period = 2019 label = "Federal Reserve SCF (2019)" name = "fed_scf_2019" @@ -103,7 +103,7 @@ class FedSCF_2019(FedSCF): data_format = Dataset.TABLES -class FedSCF_2016(FedSCF): +class SummarizedFedSCF_2016(SummarizedFedSCF): time_period = 2016 label = "Federal Reserve SCF (2016)" name = "fed_scf_2016" @@ -112,7 +112,7 @@ class FedSCF_2016(FedSCF): # URLs for the SCF data by year -SCF_URL_BY_YEAR = { +SummarizedSCF_URL_BY_YEAR = { 2016: "https://www.federalreserve.gov/econres/files/scfp2016s.zip", 2019: "https://www.federalreserve.gov/econres/files/scfp2019s.zip", 2022: "https://www.federalreserve.gov/econres/files/scfp2022s.zip", diff --git a/policyengine_us_data/datasets/scf/scf.py b/policyengine_us_data/datasets/scf/scf.py index 6c0b5b0c..55325145 100644 --- a/policyengine_us_data/datasets/scf/scf.py +++ b/policyengine_us_data/datasets/scf/scf.py @@ -1,10 +1,10 @@ from policyengine_core.data import Dataset from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.datasets.scf.fed_scf import ( - FedSCF, - FedSCF_2016, - FedSCF_2019, - FedSCF_2022, + SummarizedFedSCF, + SummarizedFedSCF_2016, + SummarizedFedSCF_2019, + SummarizedFedSCF_2022, ) import pandas as pd import numpy as np @@ -18,7 +18,7 @@ class SCF(Dataset): name = "scf" label = "SCF" - raw_scf: Type[FedSCF] = None + raw_scf: Type[SummarizedFedSCF] = None time_period: int = None data_format = Dataset.ARRAYS frac: float | None = 1 @@ -217,7 +217,9 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None: # Vehicle loan (auto loan) if "veh_inst" in raw_data.columns: - scf["auto_loan_balance"] = raw_data["veh_inst"].fillna(0).values + scf["total_vehicle_installments"] = ( + raw_data["veh_inst"].fillna(0).values + ) # Household weights if "wgt" in raw_data.columns: @@ -248,7 +250,7 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None: def add_auto_loan_interest(scf: dict, year: int) -> None: - """Adds auto loan interest to the summarized SCF dataset from the full SCF.""" + """Adds auto loan balance and interest to the summarized SCF dataset from the full SCF.""" import requests import zipfile import io @@ -260,7 +262,17 @@ def add_auto_loan_interest(scf: dict, year: int) -> None: url = f"https://www.federalreserve.gov/econres/files/scf{year}s.zip" # Define columns of interest - columns = ["yy1", "y1", "x2219", "x2319", "x2419", "x7170"] + IDENTIFYER_COLUMNS = ["yy1", "y1"] + AUTO_LOAN_COLUMNS = [ + "x2209", # loan amount on car 1 + "x2309", # loan amount on car 2 + "x2409", # loan amount on car 3 + "x7158", # loan amount on car 4 + "x2219", # loan interest rate on car 1 + "x2319", # loan interest rate on car 2 + "x2419", # loan interest rate on car 3 + "x7170", # loan interest rate on car 4 + ] try: # Download zip file @@ -295,7 +307,10 @@ def add_auto_loan_interest(scf: dict, year: int) -> None: try: logger.info(f"Reading Stata file: {dta_files[0]}") with z.open(dta_files[0]) as f: - df = pd.read_stata(io.BytesIO(f.read()), columns=columns) + df = pd.read_stata( + io.BytesIO(f.read()), + columns=(IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS), + ) logger.info(f"Read DataFrame with shape {df.shape}") except Exception as e: logger.error( @@ -312,22 +327,32 @@ def add_auto_loan_interest(scf: dict, year: int) -> None: ) from e # Process the interest data and add to final SCF dictionary - auto_int = df[columns].copy() - auto_int["x2219"] = auto_int["x2219"].replace(-1, 0) - auto_int["x2319"] = auto_int["x2319"].replace(-1, 0) - auto_int["x2419"] = auto_int["x2419"].replace(-1, 0) - auto_int["x7170"] = auto_int["x7170"].replace(-1, 0) - # Calculate total auto loan interest (sum of all auto loan interest variables) - auto_int["auto_loan_interest"] = auto_int[ - ["x2219", "x2319", "x2419", "x7170"] + auto_df = df[IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS].copy() + auto_df[AUTO_LOAN_COLUMNS].replace(-1, 0, inplace=True) + + # Interest rate columns are in percent * 10,000 format, we need to divide by 10,000 to leave them in percentage format + RATE_COLUMNS = ["x2219", "x2319", "x2419", "x7170"] + auto_df[RATE_COLUMNS] /= 10_000 + + # Calculate total auto loan balance (sum of all auto loan balance variables) + auto_df["auto_loan_balance"] = auto_df[ + ["x2209", "x2309", "x2409", "x7158"] ].sum(axis=1) + # Calculate total auto loan interest (sum of the amounts of each balance variable multiplied by its respective interest rate variable) + auto_df["auto_loan_interest"] = ( + auto_df["x2209"] * auto_df["x2219"] + + auto_df["x2309"] * auto_df["x2319"] + + auto_df["x2409"] * auto_df["x2419"] + + auto_df["x7158"] * auto_df["x7170"] + ) + # Check if we have household identifiers (y1, yy1) in both datasets if ( "y1" in scf and "yy1" in scf - and "y1" in auto_int.columns - and "yy1" in auto_int.columns + and "y1" in auto_df.columns + and "yy1" in auto_df.columns ): logger.info( "Using household identifiers (y1, yy1) to ensure correct matching" @@ -335,8 +360,8 @@ def add_auto_loan_interest(scf: dict, year: int) -> None: # Create unique identifier from y1 and yy1 for each dataset # In the original data - auto_int["household_id"] = ( - auto_int["y1"].astype(str) + "_" + auto_int["yy1"].astype(str) + auto_df["household_id"] = ( + auto_df["y1"].astype(str) + "_" + auto_df["yy1"].astype(str) ) # In the SCF dictionary @@ -346,35 +371,42 @@ def add_auto_loan_interest(scf: dict, year: int) -> None: temp_scf["y1"].astype(str) + "_" + temp_scf["yy1"].astype(str) ) - # Create a mapping from household ID to auto loan interest + # Create a mapping from household ID to auto loan balance and interest id_to_interest = dict( zip( - auto_int["household_id"].values, - auto_int["auto_loan_interest"].values, + auto_df["household_id"].values, + auto_df["auto_loan_interest"].values, + ) + ) + id_to_balance = dict( + zip( + auto_df["household_id"].values, + auto_df["auto_loan_balance"].values, ) ) # Create array for auto loan interest that matches SCF order interest_values = np.zeros(len(temp_scf), dtype=float) + balance_values = np.zeros(len(temp_scf), dtype=float) # Fill in interest values based on household ID for i, household_id in enumerate(temp_scf["household_id"]): if household_id in id_to_interest: interest_values[i] = id_to_interest[household_id] + for i, household_id in enumerate(temp_scf["household_id"]): + if household_id in id_to_balance: + balance_values[i] = id_to_balance[household_id] # Add to SCF dictionary - scf["auto_loan_interest"] = interest_values / 100 + scf["auto_loan_interest"] = interest_values + scf["auto_loan_balance"] = balance_values + logger.info( f"Added auto loan interest data for year {year} with household matching" ) else: - # Fallback to simple assignment if identifiers aren't present - logger.warning( - "Household identifiers not found. Using direct array assignment (may not match households correctly)" - ) - scf["auto_loan_interest"] = auto_int["auto_loan_interest"].values - logger.info( - f"Added auto loan interest data for year {year} without household matching" + raise ValueError( + "Household identifiers (y1, yy1) not found in both datasets." ) except Exception as e: @@ -387,7 +419,7 @@ class SCF_2022(SCF): name = "scf_2022" label = "SCF 2022" - raw_scf = FedSCF_2022 + raw_scf = SummarizedFedSCF_2022 file_path = STORAGE_FOLDER / "scf_2022.h5" time_period = 2022 frac = 1 @@ -398,7 +430,7 @@ class SCF_2019(SCF): name = "scf_2019" label = "SCF 2019" - raw_scf = FedSCF_2019 + raw_scf = SummarizedFedSCF_2019 file_path = STORAGE_FOLDER / "scf_2019.h5" time_period = 2019 frac = 1 @@ -409,7 +441,7 @@ class SCF_2016(SCF): name = "scf_2016" label = "SCF 2016" - raw_scf = FedSCF_2016 + raw_scf = SummarizedFedSCF_2016 file_path = STORAGE_FOLDER / "scf_2016.h5" time_period = 2016 frac = 1 diff --git a/policyengine_us_data/tests/test_datasets/test_cps.py b/policyengine_us_data/tests/test_datasets/test_cps.py index b4bd59ed..a010166d 100644 --- a/policyengine_us_data/tests/test_datasets/test_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_cps.py @@ -34,9 +34,14 @@ def test_cps_has_auto_loan_interest(): from policyengine_us import Microsimulation sim = Microsimulation(dataset=CPS_2024) - # Ensure we impute at least $65 billion in auto loan interest. - # We currently target $256 billion. - AUTO_LOAN_INTEREST_MINIMUM = 65e9 + # Ensure we impute around $85 billion in overtime premium with 20% error bounds. + AUTO_LOAN_INTEREST_TARGET = 85e9 + RELATIVE_TOLERANCE = 0.2 assert ( - sim.calculate("auto_loan_interest").sum() > AUTO_LOAN_INTEREST_MINIMUM + abs( + sim.calculate("auto_loan_interest").sum() + / AUTO_LOAN_INTEREST_TARGET + - 1 + ) + < RELATIVE_TOLERANCE )