From 8526e9ee10836ee03f67b09d9e301f15c14c9785 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Fri, 11 Apr 2025 15:06:30 -0700 Subject: [PATCH 1/5] =?UTF-8?q?fix:=20Na=C3=AFve=20fix=20for=20CPS=20downs?= =?UTF-8?q?ampling=20in=20Pooled=203-year=20CPS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- changelog_entry.yaml | 7 ++++ policyengine_us_data/datasets/cps/cps.py | 43 +++++++++++++++++++++--- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..2fc0254a 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,7 @@ +- bump: patch + changes: + added: + - A method to disable downsampling within the base CPS dataset generation class + - Non-downsampled versions of the 2021, 2022, and 2023 CPS datasets + changed: + - Pooled 3-Year CPS generation uses the non-downsampled versions of the 2021, 2022, and 2023 CPS datasets \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 02e04e41..78515f4e 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -20,6 +20,7 @@ class CPS(Dataset): raw_cps: Type[CensusCPS] = None previous_year_raw_cps: Type[CensusCPS] = None data_format = Dataset.ARRAYS + downsample_by_half: bool = True def generate(self): """Generates the Current Population Survey dataset for PolicyEngine US microsimulations. @@ -58,7 +59,8 @@ def generate(self): # Downsample - self.downsample(fraction=0.5) + if self.downsample_by_half: + self.downsample(fraction=0.5) def downsample(self, fraction: float = 0.5): from policyengine_us import Microsimulation @@ -673,6 +675,36 @@ class CPS_2024(CPS): url = "release://policyengine/policyengine-us-data/1.13.0/cps_2024.h5" +class CPS_2021_Not_Downsampled(CPS): + name = "cps_2021_not_downsampled" + label = "CPS 2021 (not downsampled)" + raw_cps = CensusCPS_2021 + previous_year_raw_cps = CensusCPS_2020 + file_path = STORAGE_FOLDER / "cps_2021_not_downsampled.h5" + time_period = 2021 + downsample_by_half = False + + +class CPS_2022_Not_Downsampled(CPS): + name = "cps_2022_not_downsampled" + label = "CPS 2022 (not downsampled)" + raw_cps = CensusCPS_2022 + previous_year_raw_cps = CensusCPS_2021 + file_path = STORAGE_FOLDER / "cps_2022_not_downsampled.h5" + time_period = 2022 + downsample_by_half = False + + +class CPS_2023_Not_Downsampled(CPS): + name = "cps_2023_not_downsampled" + label = "CPS 2023 (not downsampled)" + raw_cps = CensusCPS_2023 + previous_year_raw_cps = CensusCPS_2022 + file_path = STORAGE_FOLDER / "cps_2023_not_downsampled.h5" + time_period = 2023 + downsample_by_half = False + + class PooledCPS(Dataset): data_format = Dataset.ARRAYS input_datasets: list @@ -724,9 +756,9 @@ class Pooled_3_Year_CPS_2023(PooledCPS): name = "pooled_3_year_cps_2023" file_path = STORAGE_FOLDER / "pooled_3_year_cps_2023.h5" input_datasets = [ - CPS_2021, - CPS_2022, - CPS_2023, + CPS_2021_Not_Downsampled, + CPS_2022_Not_Downsampled, + CPS_2023_Not_Downsampled, ] time_period = 2023 url = "hf://policyengine/policyengine-us-data/pooled_3_year_cps_2023.h5" @@ -737,4 +769,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): CPS_2022().generate() CPS_2023().generate() CPS_2024().generate() + CPS_2021_Not_Downsampled().generate() + CPS_2022_Not_Downsampled().generate() + CPS_2023_Not_Downsampled().generate() Pooled_3_Year_CPS_2023().generate() From 7fd2fa241e253c87a84491a8c9c273afa5ddf595 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Fri, 11 Apr 2025 15:08:14 -0700 Subject: [PATCH 2/5] chore: Add warning --- policyengine_us_data/datasets/cps/cps.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 78515f4e..03916925 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -675,6 +675,9 @@ class CPS_2024(CPS): url = "release://policyengine/policyengine-us-data/1.13.0/cps_2024.h5" +# The below datasets are a very naïve way of preventing downsampling in the +# Pooled 3-Year CPS. They should be replaced by a more sustainable approach. +# If these are still here on July 1, 2025, please open an issue and raise at standup. class CPS_2021_Not_Downsampled(CPS): name = "cps_2021_not_downsampled" label = "CPS 2021 (not downsampled)" From 165eb39a6d8813b09f45490d5cd7766a70e1919a Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Fri, 11 Apr 2025 18:03:40 -0700 Subject: [PATCH 3/5] fix: Attempt to preserve original dtype when downsampling --- changelog_entry.yaml | 3 +- policyengine_us_data/datasets/cps/cps.py | 40 ++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 2fc0254a..2b3377bc 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -4,4 +4,5 @@ - A method to disable downsampling within the base CPS dataset generation class - Non-downsampled versions of the 2021, 2022, and 2023 CPS datasets changed: - - Pooled 3-Year CPS generation uses the non-downsampled versions of the 2021, 2022, and 2023 CPS datasets \ No newline at end of file + - Pooled 3-Year CPS generation uses the non-downsampled versions of the 2021, 2022, and 2023 CPS datasets + - Downsampling method attempts to preserve original dtype values \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 03916925..19f0609c 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -62,16 +62,52 @@ def generate(self): if self.downsample_by_half: self.downsample(fraction=0.5) + # def downsample(self, fraction: float = 0.5): + # from policyengine_us import Microsimulation + + # sim = Microsimulation(dataset=self) + # sim.subsample(frac=fraction) + # original_data: dict = self.load_dataset() + # for key in original_data: + # if key not in sim.tax_benefit_system.variables: + # continue + # original_data[key] = sim.calculate(key).values + + # self.save_dataset(original_data) + def downsample(self, fraction: float = 0.5): from policyengine_us import Microsimulation + # Store original dtypes before modifying + original_data: dict = self.load_dataset() + original_dtypes = { + key: original_data[key].dtype for key in original_data + } + sim = Microsimulation(dataset=self) sim.subsample(frac=fraction) - original_data: dict = self.load_dataset() + for key in original_data: if key not in sim.tax_benefit_system.variables: continue - original_data[key] = sim.calculate(key).values + values = sim.calculate(key).values + + # Preserve the original dtype if possible + if ( + key in original_dtypes + and hasattr(values, "dtype") + and values.dtype != original_dtypes[key] + ): + try: + original_data[key] = values.astype(original_dtypes[key]) + except: + # If conversion fails, log it but continue + print( + f"Warning: Could not convert {key} back to {original_dtypes[key]}" + ) + original_data[key] = values + else: + original_data[key] = values self.save_dataset(original_data) From 2340cffefb5ce0f87ea48755d207b69d3c5082c9 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Mon, 14 Apr 2025 16:59:21 -0400 Subject: [PATCH 4/5] fix: Make fixes from review --- policyengine_us_data/datasets/cps/cps.py | 62 +++++++++--------------- 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 19f0609c..5f7d0a94 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -20,7 +20,7 @@ class CPS(Dataset): raw_cps: Type[CensusCPS] = None previous_year_raw_cps: Type[CensusCPS] = None data_format = Dataset.ARRAYS - downsample_by_half: bool = True + downsample_frac: float | None = 0.5 def generate(self): """Generates the Current Population Survey dataset for PolicyEngine US microsimulations. @@ -58,22 +58,8 @@ def generate(self): add_takeup(self) # Downsample - - if self.downsample_by_half: - self.downsample(fraction=0.5) - - # def downsample(self, fraction: float = 0.5): - # from policyengine_us import Microsimulation - - # sim = Microsimulation(dataset=self) - # sim.subsample(frac=fraction) - # original_data: dict = self.load_dataset() - # for key in original_data: - # if key not in sim.tax_benefit_system.variables: - # continue - # original_data[key] = sim.calculate(key).values - - # self.save_dataset(original_data) + if self.downsample_frac is not None and self.downsample_frac < 1.0: + self.downsample(fraction=self.downsample_frac) def downsample(self, fraction: float = 0.5): from policyengine_us import Microsimulation @@ -714,34 +700,34 @@ class CPS_2024(CPS): # The below datasets are a very naïve way of preventing downsampling in the # Pooled 3-Year CPS. They should be replaced by a more sustainable approach. # If these are still here on July 1, 2025, please open an issue and raise at standup. -class CPS_2021_Not_Downsampled(CPS): - name = "cps_2021_not_downsampled" - label = "CPS 2021 (not downsampled)" +class CPS_2021_Full(CPS): + name = "cps_2021_full" + label = "CPS 2021 (full)" raw_cps = CensusCPS_2021 previous_year_raw_cps = CensusCPS_2020 - file_path = STORAGE_FOLDER / "cps_2021_not_downsampled.h5" + file_path = STORAGE_FOLDER / "cps_2021_full.h5" time_period = 2021 - downsample_by_half = False + downsample_frac = None -class CPS_2022_Not_Downsampled(CPS): - name = "cps_2022_not_downsampled" - label = "CPS 2022 (not downsampled)" +class CPS_2022_Full(CPS): + name = "cps_2022_full" + label = "CPS 2022 (full)" raw_cps = CensusCPS_2022 previous_year_raw_cps = CensusCPS_2021 - file_path = STORAGE_FOLDER / "cps_2022_not_downsampled.h5" + file_path = STORAGE_FOLDER / "cps_2022_full.h5" time_period = 2022 - downsample_by_half = False + downsample_frac = None -class CPS_2023_Not_Downsampled(CPS): - name = "cps_2023_not_downsampled" - label = "CPS 2023 (not downsampled)" +class CPS_2023_Full(CPS): + name = "cps_2023_full" + label = "CPS 2023 (full)" raw_cps = CensusCPS_2023 previous_year_raw_cps = CensusCPS_2022 - file_path = STORAGE_FOLDER / "cps_2023_not_downsampled.h5" + file_path = STORAGE_FOLDER / "cps_2023_full.h5" time_period = 2023 - downsample_by_half = False + downsample_frac = None class PooledCPS(Dataset): @@ -795,9 +781,9 @@ class Pooled_3_Year_CPS_2023(PooledCPS): name = "pooled_3_year_cps_2023" file_path = STORAGE_FOLDER / "pooled_3_year_cps_2023.h5" input_datasets = [ - CPS_2021_Not_Downsampled, - CPS_2022_Not_Downsampled, - CPS_2023_Not_Downsampled, + CPS_2021_Full, + CPS_2022_Full, + CPS_2023_Full, ] time_period = 2023 url = "hf://policyengine/policyengine-us-data/pooled_3_year_cps_2023.h5" @@ -808,7 +794,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS): CPS_2022().generate() CPS_2023().generate() CPS_2024().generate() - CPS_2021_Not_Downsampled().generate() - CPS_2022_Not_Downsampled().generate() - CPS_2023_Not_Downsampled().generate() + CPS_2021_Full().generate() + CPS_2022_Full().generate() + CPS_2023_Full().generate() Pooled_3_Year_CPS_2023().generate() From 95ca793085c12e28754de5a386051eddeb00cdba Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Wed, 16 Apr 2025 15:15:27 -0400 Subject: [PATCH 5/5] fix: Rename downsampling_fraction to frac, apply 1 by default --- changelog_entry.yaml | 2 +- policyengine_us_data/datasets/cps/cps.py | 23 +++++++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 2b3377bc..6ff4a197 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,8 +1,8 @@ - bump: patch changes: added: - - A method to disable downsampling within the base CPS dataset generation class - Non-downsampled versions of the 2021, 2022, and 2023 CPS datasets changed: + - Modified downsampling method within CPS base dataset class - Pooled 3-Year CPS generation uses the non-downsampled versions of the 2021, 2022, and 2023 CPS datasets - Downsampling method attempts to preserve original dtype values \ No newline at end of file diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 5f7d0a94..26579b82 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -20,11 +20,15 @@ class CPS(Dataset): raw_cps: Type[CensusCPS] = None previous_year_raw_cps: Type[CensusCPS] = None data_format = Dataset.ARRAYS - downsample_frac: float | None = 0.5 + frac: float | None = 1 def generate(self): """Generates the Current Population Survey dataset for PolicyEngine US microsimulations. Technical documentation and codebook here: https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar21.pdf + + Args: + frac (float, optional): Fraction of the dataset to keep. Defaults to 1. Example: To downsample to 25% of dataset, + set frac=0.25. """ if self.raw_cps is None: @@ -58,10 +62,10 @@ def generate(self): add_takeup(self) # Downsample - if self.downsample_frac is not None and self.downsample_frac < 1.0: - self.downsample(fraction=self.downsample_frac) + if self.frac is not None and self.frac < 1.0: + self.downsample(frac=self.frac) - def downsample(self, fraction: float = 0.5): + def downsample(self, frac: float): from policyengine_us import Microsimulation # Store original dtypes before modifying @@ -71,7 +75,7 @@ def downsample(self, fraction: float = 0.5): } sim = Microsimulation(dataset=self) - sim.subsample(frac=fraction) + sim.subsample(frac=frac) for key in original_data: if key not in sim.tax_benefit_system.variables: @@ -651,6 +655,7 @@ class CPS_2019(CPS): previous_year_raw_cps = CensusCPS_2018 file_path = STORAGE_FOLDER / "cps_2019.h5" time_period = 2019 + frac = 0.5 class CPS_2020(CPS): @@ -660,6 +665,7 @@ class CPS_2020(CPS): previous_year_raw_cps = CensusCPS_2019 file_path = STORAGE_FOLDER / "cps_2020.h5" time_period = 2020 + frac = 0.5 class CPS_2021(CPS): @@ -669,6 +675,7 @@ class CPS_2021(CPS): previous_year_raw_cps = CensusCPS_2020 file_path = STORAGE_FOLDER / "cps_2021_v1_6_1.h5" time_period = 2021 + frac = 0.5 class CPS_2022(CPS): @@ -678,6 +685,7 @@ class CPS_2022(CPS): previous_year_raw_cps = CensusCPS_2021 file_path = STORAGE_FOLDER / "cps_2022_v1_6_1.h5" time_period = 2022 + frac = 0.5 class CPS_2023(CPS): @@ -687,6 +695,7 @@ class CPS_2023(CPS): previous_year_raw_cps = CensusCPS_2022 file_path = STORAGE_FOLDER / "cps_2023.h5" time_period = 2023 + frac = 0.5 class CPS_2024(CPS): @@ -695,6 +704,7 @@ class CPS_2024(CPS): file_path = STORAGE_FOLDER / "cps_2024.h5" time_period = 2024 url = "release://policyengine/policyengine-us-data/1.13.0/cps_2024.h5" + frac = 0.5 # The below datasets are a very naïve way of preventing downsampling in the @@ -707,7 +717,6 @@ class CPS_2021_Full(CPS): previous_year_raw_cps = CensusCPS_2020 file_path = STORAGE_FOLDER / "cps_2021_full.h5" time_period = 2021 - downsample_frac = None class CPS_2022_Full(CPS): @@ -717,7 +726,6 @@ class CPS_2022_Full(CPS): previous_year_raw_cps = CensusCPS_2021 file_path = STORAGE_FOLDER / "cps_2022_full.h5" time_period = 2022 - downsample_frac = None class CPS_2023_Full(CPS): @@ -727,7 +735,6 @@ class CPS_2023_Full(CPS): previous_year_raw_cps = CensusCPS_2022 file_path = STORAGE_FOLDER / "cps_2023_full.h5" time_period = 2023 - downsample_frac = None class PooledCPS(Dataset):