1 change: 1 addition & 0 deletions .gitignore
@@ -11,3 +11,4 @@
**/_build
!population_by_state.csv
**/*.pkl
venv
5 changes: 5 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,5 @@
- bump: minor
changes:
added:
- scf package loading module
- auto loan balance imputation notebook
151 changes: 151 additions & 0 deletions policyengine_us_data/datasets/cps/cps.py
@@ -57,6 +57,7 @@ def generate(self):
add_spm_variables(cps, spm_unit)
add_household_variables(cps, household)
add_rent(self, cps, person, household)
add_auto_loan_balance(self, cps)
add_tips(self, cps)

raw_data.close()
@@ -167,6 +168,156 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
cps["real_estate_taxes"][mask] = imputed_values["real_estate_taxes"]


def add_auto_loan_balance(self, cps: h5py.File) -> None:
""" "Add auto loan balance variable."""
self.save_dataset(cps)
cps_data = self.load_dataset()

# Preprocess the CPS for imputation
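    # Arrays whose length matches person_household_id are person-level.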
lengths = {k: len(v) for k, v in cps_data.items()}
var_len = cps_data["person_household_id"].shape[0]
vars_of_interest = [name for name, ln in lengths.items() if ln == var_len]
agg_data = pd.DataFrame({n: cps_data[n] for n in vars_of_interest})

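    # Sum person-level incomes up to the household level.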
agg = (
agg_data.groupby("person_household_id")[
["employment_income", "self_employment_income", "farm_income"]
]
.sum()
.rename(
columns={
"employment_income": "household_employment_income",
"self_employment_income": "household_self_employment_income",
"farm_income": "household_farm_income",
}
)
.reset_index()
)

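    # Keep one row per household by filtering person-length arrays to heads.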
mask = cps_data["is_household_head"]
mask_len = mask.shape[0]

cps_data = {
var: data[mask] if data.shape[0] == mask_len else data
for var, data in cps_data.items()
}

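    # Collapse detailed CPS race codes into coarser categories
    # (multi-race combinations map to OTHER).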
CPS_RACE_MAPPING = {
1: 1, # White only -> WHITE
2: 2, # Black only -> BLACK/AFRICAN-AMERICAN
3: 5, # American Indian, Alaskan Native only -> AMERICAN INDIAN/ALASKA NATIVE
4: 4, # Asian only -> ASIAN
5: 6, # Hawaiian/Pacific Islander only -> NATIVE HAWAIIAN/PACIFIC ISLANDER
6: 7, # White-Black -> OTHER
7: 7, # White-AI -> OTHER
8: 7, # White-Asian -> OTHER
9: 7, # White-HP -> OTHER
10: 7, # Black-AI -> OTHER
11: 7, # Black-Asian -> OTHER
12: 7, # Black-HP -> OTHER
13: 7, # AI-Asian -> OTHER
14: 7, # AI-HP -> OTHER
15: 7, # Asian-HP -> OTHER
16: 7, # White-Black-AI -> OTHER
17: 7, # White-Black-Asian -> OTHER
18: 7, # White-Black-HP -> OTHER
19: 7, # White-AI-Asian -> OTHER
20: 7, # White-AI-HP -> OTHER
21: 7, # White-Asian-HP -> OTHER
22: 7, # Black-AI-Asian -> OTHER
23: 7, # White-Black-AI-Asian -> OTHER
24: 7, # White-AI-Asian-HP -> OTHER
25: 7, # Other 3 race comb. -> OTHER
26: 7, # Other 4 or 5 race comb. -> OTHER
}

# Apply the mapping to recode the race values
cps_data["cps_race"] = np.vectorize(CPS_RACE_MAPPING.get)(
cps_data["cps_race"]
)

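    # Build the household-level receiver frame from household-length arrays.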
lengths = {k: len(v) for k, v in cps_data.items()}
var_len = cps_data["household_id"].shape[0]
vars_of_interest = [name for name, ln in lengths.items() if ln == var_len]
receiver_data = pd.DataFrame({n: cps_data[n] for n in vars_of_interest})

receiver_data = receiver_data.merge(
agg[
[
"person_household_id",
"household_employment_income",
"household_self_employment_income",
"household_farm_income",
]
],
on="person_household_id",
how="left",
)
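    # Replace person-level income columns with their household aggregates.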
receiver_data.drop("employment_income", axis=1, inplace=True)
receiver_data.drop("self_employment_income", axis=1, inplace=True)
receiver_data.drop("farm_income", axis=1, inplace=True)

receiver_data.rename(
columns={
"household_employment_income": "employment_income",
"household_self_employment_income": "self_employment_income",
"household_farm_income": "farm_income",
},
inplace=True,
)

# Impute auto loan balance from the SCF
from policyengine_us_data.datasets.scf.scf import SCF_2022

scf_dataset = SCF_2022()
scf_data = scf_dataset.load_dataset()
scf_data = pd.DataFrame({key: scf_data[key] for key in scf_data.keys()})

PREDICTORS = [
"age",
"is_female",
"cps_race",
"own_children_in_household",
"employment_income",
"self_employment_income",
"farm_income",
]
IMPUTED_VARIABLES = ["auto_loan_balance"]
weights = ["household_weight"]

donor_data = scf_data[PREDICTORS + IMPUTED_VARIABLES + weights].copy()

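    # Draw a weighted bootstrap sample so donors reflect population weights.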
donor_data = donor_data.loc[
np.random.choice(
donor_data.index,
size=100_000,
replace=True,
p=donor_data.household_weight / donor_data.household_weight.sum(),
)
]

from microimpute.models.qrf import QRF

qrf_model = QRF()
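    # Fit a quantile regression forest on the SCF donor sample.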
fitted_model = qrf_model.fit(
X_train=donor_data,
predictors=PREDICTORS,
imputed_variables=IMPUTED_VARIABLES,
tune_hyperparameters=True,
)

imputations = fitted_model.predict(X_test=receiver_data)

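    # Use the median (0.5 quantile) prediction for each imputed variable.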
for var in IMPUTED_VARIABLES:
cps[var] = imputations[0.5][var]

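    # Approximate annual interest as the balance times the mean SCF rate
    # (in percent), scaled by 12.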
cps["auto_loan_interest"] = (
cps["auto_loan_balance"] * scf_data["auto_loan_interest"].mean() / 100
) * 12

self.save_dataset(cps)


def add_takeup(self):
data = self.load_dataset()

1 change: 1 addition & 0 deletions policyengine_us_data/datasets/scf/__init__.py
@@ -0,0 +1 @@
from policyengine_us_data.datasets.scf.scf import *
119 changes: 119 additions & 0 deletions policyengine_us_data/datasets/scf/fed_scf.py
@@ -0,0 +1,119 @@
from policyengine_core.data import Dataset
from tqdm import tqdm
from typing import Optional
import requests
from io import BytesIO
from zipfile import ZipFile
import pandas as pd
import os
from policyengine_us_data.storage import STORAGE_FOLDER


class FedSCF(Dataset):
"""Dataset containing Survey of Consumer Finances data from the Federal Reserve."""

time_period: int
"""Year of the dataset."""

def load(self):
"""Loads the raw SCF dataset.

Returns:
pd.DataFrame: The raw SCF data.
"""
# Check if file exists
if not os.path.exists(self.file_path):
            print("Raw SCF dataset file not found. Generating it.")
self.generate()

# Open the HDF store and return the DataFrame
with pd.HDFStore(self.file_path, mode="r") as storage:
return storage["data"]

def generate(self):
if self._scf_download_url is None:
raise ValueError(
f"No raw SCF data URL known for year {self.time_period}."
)

url = self._scf_download_url

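        # Stream the archive; content-length may be absent, so assume ~200 MB.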
response = requests.get(url, stream=True)
total_size_in_bytes = int(
response.headers.get("content-length", 200e6)
)
progress_bar = tqdm(
total=total_size_in_bytes,
unit="iB",
unit_scale=True,
desc="Downloading SCF",
)
if response.status_code == 404:
raise FileNotFoundError(
"Received a 404 response when fetching the data."
)
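        # Buffer the zip in memory, then convert its Stata file(s) to HDF5.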
with BytesIO() as file:
content_length_actual = 0
for data in response.iter_content(int(1e6)):
progress_bar.update(len(data))
content_length_actual += len(data)
file.write(data)
progress_bar.set_description("Downloaded SCF")
progress_bar.total = content_length_actual
progress_bar.close()

zipfile = ZipFile(file)
with pd.HDFStore(self.file_path, mode="w") as storage:
# Find the Stata file, which should be the only .dta file in the zip
dta_files = [
f for f in zipfile.namelist() if f.endswith(".dta")
]
if not dta_files:
raise FileNotFoundError(
"No .dta file found in the SCF zip archive."
)
                # Usually there's only one .dta file; if there are several,
                # concatenate them so earlier files aren't overwritten.
                frames = []
                for dta_file in dta_files:
                    with zipfile.open(dta_file) as f:
                        # Read the Stata file with pandas
                        data = pd.read_stata(f)
                        # Add year column
                        data["year"] = self.time_period
                        frames.append(data)
                # Store the combined table in the HDF file
                storage["data"] = pd.concat(frames, ignore_index=True)

@property
    def _scf_download_url(self) -> Optional[str]:
return SCF_URL_BY_YEAR.get(self.time_period)


class FedSCF_2022(FedSCF):
time_period = 2022
label = "Federal Reserve SCF (2022)"
name = "fed_scf_2022"
file_path = STORAGE_FOLDER / "fed_scf_2022.h5"
data_format = Dataset.TABLES


class FedSCF_2019(FedSCF):
time_period = 2019
label = "Federal Reserve SCF (2019)"
name = "fed_scf_2019"
file_path = STORAGE_FOLDER / "fed_scf_2019.h5"
data_format = Dataset.TABLES


class FedSCF_2016(FedSCF):
time_period = 2016
label = "Federal Reserve SCF (2016)"
name = "fed_scf_2016"
file_path = STORAGE_FOLDER / "fed_scf_2016.h5"
data_format = Dataset.TABLES


# URLs for the SCF data by year
SCF_URL_BY_YEAR = {
2016: "https://www.federalreserve.gov/econres/files/scfp2016s.zip",
2019: "https://www.federalreserve.gov/econres/files/scfp2019s.zip",
2022: "https://www.federalreserve.gov/econres/files/scfp2022s.zip",
}