diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..6ff4a197 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,8 @@ +- bump: patch + changes: + added: + - Non-downsampled versions of the 2021, 2022, and 2023 CPS datasets + changed: + - Modified downsampling method within CPS base dataset class + - Pooled 3-Year CPS generation uses the non-downsampled versions of the 2021, 2022, and 2023 CPS datasets + - Downsampling method attempts to preserve original dtype values \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 02e04e41..26579b82 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -20,10 +20,15 @@ class CPS(Dataset): raw_cps: Type[CensusCPS] = None previous_year_raw_cps: Type[CensusCPS] = None data_format = Dataset.ARRAYS + frac: float | None = 1 def generate(self): """Generates the Current Population Survey dataset for PolicyEngine US microsimulations. Technical documentation and codebook here: https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar21.pdf + + Args: + frac (float, optional): Fraction of the dataset to keep. Defaults to 1. Example: To downsample to 25% of dataset, + set frac=0.25. """ if self.raw_cps is None: @@ -57,19 +62,42 @@ def generate(self): add_takeup(self) # Downsample + if self.frac is not None and self.frac < 1.0: + self.downsample(frac=self.frac) - self.downsample(fraction=0.5) - - def downsample(self, fraction: float = 0.5): + def downsample(self, frac: float): from policyengine_us import Microsimulation - sim = Microsimulation(dataset=self) - sim.subsample(frac=fraction) + # Store original dtypes before modifying original_data: dict = self.load_dataset() + original_dtypes = { + key: original_data[key].dtype for key in original_data + } + + sim = Microsimulation(dataset=self) + sim.subsample(frac=frac) + for key in original_data: if key not in sim.tax_benefit_system.variables: continue - original_data[key] = sim.calculate(key).values + values = sim.calculate(key).values + + # Preserve the original dtype if possible + if ( + key in original_dtypes + and hasattr(values, "dtype") + and values.dtype != original_dtypes[key] + ): + try: + original_data[key] = values.astype(original_dtypes[key]) + except: + # If conversion fails, log it but continue + print( + f"Warning: Could not convert {key} back to {original_dtypes[key]}" + ) + original_data[key] = values + else: + original_data[key] = values self.save_dataset(original_data) @@ -627,6 +655,7 @@ class CPS_2019(CPS): previous_year_raw_cps = CensusCPS_2018 file_path = STORAGE_FOLDER / "cps_2019.h5" time_period = 2019 + frac = 0.5 class CPS_2020(CPS): @@ -636,6 +665,7 @@ class CPS_2020(CPS): previous_year_raw_cps = CensusCPS_2019 file_path = STORAGE_FOLDER / "cps_2020.h5" time_period = 2020 + frac = 0.5 class CPS_2021(CPS): @@ -645,6 +675,7 @@ class CPS_2021(CPS): previous_year_raw_cps = CensusCPS_2020 file_path = STORAGE_FOLDER / "cps_2021_v1_6_1.h5" time_period = 2021 + frac = 0.5 class CPS_2022(CPS): @@ -654,6 +685,7 @@ class CPS_2022(CPS): previous_year_raw_cps = CensusCPS_2021 file_path = STORAGE_FOLDER / "cps_2022_v1_6_1.h5" time_period = 2022 + frac = 0.5 class CPS_2023(CPS): @@ -663,6 +695,7 @@ class CPS_2023(CPS): previous_year_raw_cps = CensusCPS_2022 file_path = STORAGE_FOLDER / "cps_2023.h5" time_period = 2023 + frac = 0.5 class CPS_2024(CPS): @@ -671,6 +704,37 @@ class CPS_2024(CPS): file_path = STORAGE_FOLDER / "cps_2024.h5" time_period = 2024 url = "release://policyengine/policyengine-us-data/1.13.0/cps_2024.h5" + frac = 0.5 + + +# The below datasets are a very naïve way of preventing downsampling in the +# Pooled 3-Year CPS. They should be replaced by a more sustainable approach. +# If these are still here on July 1, 2025, please open an issue and raise at standup. +class CPS_2021_Full(CPS): + name = "cps_2021_full" + label = "CPS 2021 (full)" + raw_cps = CensusCPS_2021 + previous_year_raw_cps = CensusCPS_2020 + file_path = STORAGE_FOLDER / "cps_2021_full.h5" + time_period = 2021 + + +class CPS_2022_Full(CPS): + name = "cps_2022_full" + label = "CPS 2022 (full)" + raw_cps = CensusCPS_2022 + previous_year_raw_cps = CensusCPS_2021 + file_path = STORAGE_FOLDER / "cps_2022_full.h5" + time_period = 2022 + + +class CPS_2023_Full(CPS): + name = "cps_2023_full" + label = "CPS 2023 (full)" + raw_cps = CensusCPS_2023 + previous_year_raw_cps = CensusCPS_2022 + file_path = STORAGE_FOLDER / "cps_2023_full.h5" + time_period = 2023 class PooledCPS(Dataset): @@ -724,9 +788,9 @@ class Pooled_3_Year_CPS_2023(PooledCPS): name = "pooled_3_year_cps_2023" file_path = STORAGE_FOLDER / "pooled_3_year_cps_2023.h5" input_datasets = [ - CPS_2021, - CPS_2022, - CPS_2023, + CPS_2021_Full, + CPS_2022_Full, + CPS_2023_Full, ] time_period = 2023 url = "hf://policyengine/policyengine-us-data/pooled_3_year_cps_2023.h5" @@ -737,4 +801,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): CPS_2022().generate() CPS_2023().generate() CPS_2024().generate() + CPS_2021_Full().generate() + CPS_2022_Full().generate() + CPS_2023_Full().generate() Pooled_3_Year_CPS_2023().generate()