11from policyengine_core .data import Dataset
22from policyengine_us_data .storage import STORAGE_FOLDER
33from policyengine_us_data .datasets .scf .fed_scf import (
4- FedSCF ,
5- FedSCF_2016 ,
6- FedSCF_2019 ,
7- FedSCF_2022 ,
4+ SummarizedFedSCF ,
5+ SummarizedFedSCF_2016 ,
6+ SummarizedFedSCF_2019 ,
7+ SummarizedFedSCF_2022 ,
88)
99import pandas as pd
1010import numpy as np
@@ -18,7 +18,7 @@ class SCF(Dataset):
1818
1919 name = "scf"
2020 label = "SCF"
21- raw_scf : Type [FedSCF ] = None
21+ raw_scf : Type [SummarizedFedSCF ] = None
2222 time_period : int = None
2323 data_format = Dataset .ARRAYS
2424 frac : float | None = 1
@@ -217,7 +217,9 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None:
217217
218218 # Vehicle loan (auto loan)
219219 if "veh_inst" in raw_data .columns :
220- scf ["auto_loan_balance" ] = raw_data ["veh_inst" ].fillna (0 ).values
220+ scf ["total_vehicle_installments" ] = (
221+ raw_data ["veh_inst" ].fillna (0 ).values
222+ )
221223
222224 # Household weights
223225 if "wgt" in raw_data .columns :
@@ -248,7 +250,7 @@ def rename_columns_to_match_cps(scf: dict, raw_data: pd.DataFrame) -> None:
248250
249251
250252def add_auto_loan_interest (scf : dict , year : int ) -> None :
251- """Adds auto loan interest to the summarized SCF dataset from the full SCF."""
253+ """Adds auto loan balance and interest to the summarized SCF dataset from the full SCF."""
252254 import requests
253255 import zipfile
254256 import io
@@ -260,7 +262,17 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
260262 url = f"https://www.federalreserve.gov/econres/files/scf{ year } s.zip"
261263
262264 # Define columns of interest
263- columns = ["yy1" , "y1" , "x2219" , "x2319" , "x2419" , "x7170" ]
265+ IDENTIFYER_COLUMNS = ["yy1" , "y1" ]
266+ AUTO_LOAN_COLUMNS = [
267+ "x2209" , # loan amount on car 1
268+ "x2309" , # loan amount on car 2
269+ "x2409" , # loan amount on car 3
270+ "x7158" , # loan amount on car 4
271+ "x2219" , # loan interest rate on car 1
272+ "x2319" , # loan interest rate on car 2
273+ "x2419" , # loan interest rate on car 3
274+ "x7170" , # loan interest rate on car 4
275+ ]
264276
265277 try :
266278 # Download zip file
@@ -295,7 +307,10 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
295307 try :
296308 logger .info (f"Reading Stata file: { dta_files [0 ]} " )
297309 with z .open (dta_files [0 ]) as f :
298- df = pd .read_stata (io .BytesIO (f .read ()), columns = columns )
310+ df = pd .read_stata (
311+ io .BytesIO (f .read ()),
312+ columns = (IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS ),
313+ )
299314 logger .info (f"Read DataFrame with shape { df .shape } " )
300315 except Exception as e :
301316 logger .error (
@@ -312,31 +327,41 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
312327 ) from e
313328
314329 # Process the balance and interest data and add them to the final SCF dictionary
315- auto_int = df [columns ].copy ()
316- auto_int ["x2219" ] = auto_int ["x2219" ].replace (- 1 , 0 )
317- auto_int ["x2319" ] = auto_int ["x2319" ].replace (- 1 , 0 )
318- auto_int ["x2419" ] = auto_int ["x2419" ].replace (- 1 , 0 )
319- auto_int ["x7170" ] = auto_int ["x7170" ].replace (- 1 , 0 )
320- # Calculate total auto loan interest (sum of all auto loan interest variables)
321- auto_int ["auto_loan_interest" ] = auto_int [
322- ["x2219" , "x2319" , "x2419" , "x7170" ]
330+ auto_df = df [IDENTIFYER_COLUMNS + AUTO_LOAN_COLUMNS ].copy ()
331+ auto_df [AUTO_LOAN_COLUMNS ].replace (- 1 , 0 , inplace = True )
332+
333+ # Interest rate columns are stored as percent * 10,000; divide by 10,000 to convert them to percentages
334+ RATE_COLUMNS = ["x2219" , "x2319" , "x2419" , "x7170" ]
335+ auto_df [RATE_COLUMNS ] /= 10_000
336+
337+ # Calculate total auto loan balance (sum of all auto loan balance variables)
338+ auto_df ["auto_loan_balance" ] = auto_df [
339+ ["x2209" , "x2309" , "x2409" , "x7158" ]
323340 ].sum (axis = 1 )
324341
342+ # Calculate total auto loan interest (sum of the amounts of each balance variable multiplied by its respective interest rate variable)
343+ auto_df ["auto_loan_interest" ] = (
344+ auto_df ["x2209" ] * auto_df ["x2219" ]
345+ + auto_df ["x2309" ] * auto_df ["x2319" ]
346+ + auto_df ["x2409" ] * auto_df ["x2419" ]
347+ + auto_df ["x7158" ] * auto_df ["x7170" ]
348+ )
349+
325350 # Check if we have household identifiers (y1, yy1) in both datasets
326351 if (
327352 "y1" in scf
328353 and "yy1" in scf
329- and "y1" in auto_int .columns
330- and "yy1" in auto_int .columns
354+ and "y1" in auto_df .columns
355+ and "yy1" in auto_df .columns
331356 ):
332357 logger .info (
333358 "Using household identifiers (y1, yy1) to ensure correct matching"
334359 )
335360
336361 # Create unique identifier from y1 and yy1 for each dataset
337362 # In the original data
338- auto_int ["household_id" ] = (
339- auto_int ["y1" ].astype (str ) + "_" + auto_int ["yy1" ].astype (str )
363+ auto_df ["household_id" ] = (
364+ auto_df ["y1" ].astype (str ) + "_" + auto_df ["yy1" ].astype (str )
340365 )
341366
342367 # In the SCF dictionary
@@ -346,35 +371,42 @@ def add_auto_loan_interest(scf: dict, year: int) -> None:
346371 temp_scf ["y1" ].astype (str ) + "_" + temp_scf ["yy1" ].astype (str )
347372 )
348373
349- # Create a mapping from household ID to auto loan interest
374+ # Create a mapping from household ID to auto loan balance and interest
350375 id_to_interest = dict (
351376 zip (
352- auto_int ["household_id" ].values ,
353- auto_int ["auto_loan_interest" ].values ,
377+ auto_df ["household_id" ].values ,
378+ auto_df ["auto_loan_interest" ].values ,
379+ )
380+ )
381+ id_to_balance = dict (
382+ zip (
383+ auto_df ["household_id" ].values ,
384+ auto_df ["auto_loan_balance" ].values ,
354385 )
355386 )
356387
357389 # Create arrays for auto loan interest and balance that match SCF order
358389 interest_values = np .zeros (len (temp_scf ), dtype = float )
390+ balance_values = np .zeros (len (temp_scf ), dtype = float )
359391
360392 # Fill in interest and balance values based on household ID
361393 for i , household_id in enumerate (temp_scf ["household_id" ]):
362394 if household_id in id_to_interest :
363395 interest_values [i ] = id_to_interest [household_id ]
396+ for i , household_id in enumerate (temp_scf ["household_id" ]):
397+ if household_id in id_to_balance :
398+ balance_values [i ] = id_to_balance [household_id ]
364399
365400 # Add to SCF dictionary
366- scf ["auto_loan_interest" ] = interest_values / 100
401+ scf ["auto_loan_interest" ] = interest_values
402+ scf ["auto_loan_balance" ] = balance_values
403+
367404 logger .info (
368405 f"Added auto loan interest data for year { year } with household matching"
369406 )
370407 else :
371- # Fallback to simple assignment if identifiers aren't present
372- logger .warning (
373- "Household identifiers not found. Using direct array assignment (may not match households correctly)"
374- )
375- scf ["auto_loan_interest" ] = auto_int ["auto_loan_interest" ].values
376- logger .info (
377- f"Added auto loan interest data for year { year } without household matching"
408+ raise ValueError (
409+ "Household identifiers (y1, yy1) not found in both datasets."
378410 )
379411
380412 except Exception as e :
@@ -387,7 +419,7 @@ class SCF_2022(SCF):
387419
388420 name = "scf_2022"
389421 label = "SCF 2022"
390- raw_scf = FedSCF_2022
422+ raw_scf = SummarizedFedSCF_2022
391423 file_path = STORAGE_FOLDER / "scf_2022.h5"
392424 time_period = 2022
393425 frac = 1
@@ -398,7 +430,7 @@ class SCF_2019(SCF):
398430
399431 name = "scf_2019"
400432 label = "SCF 2019"
401- raw_scf = FedSCF_2019
433+ raw_scf = SummarizedFedSCF_2019
402434 file_path = STORAGE_FOLDER / "scf_2019.h5"
403435 time_period = 2019
404436 frac = 1
@@ -409,7 +441,7 @@ class SCF_2016(SCF):
409441
410442 name = "scf_2016"
411443 label = "SCF 2016"
412- raw_scf = FedSCF_2016
444+ raw_scf = SummarizedFedSCF_2016
413445 file_path = STORAGE_FOLDER / "scf_2016.h5"
414446 time_period = 2016
415447 frac = 1
0 commit comments