diff --git a/.gitignore b/.gitignore index a63f8215..0d209ad9 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ **/_build !population_by_state.csv **/*.pkl +**/*.dta diff --git a/CHANGELOG.md b/CHANGELOG.md index a7bc8c0d..577bfe43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.21.1] - 2025-05-14 13:31:21 + +### Fixed + +- Data downloads for Census datasets disabled. +- Warning added for downsampling non-existent policyengine-[country] variables. + ## [1.21.0] - 2025-05-13 13:29:57 ### Added @@ -266,6 +273,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.21.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.21.0...1.21.1 [1.21.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.20.0...1.21.0 [1.20.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.19.2...1.20.0 [1.19.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.19.1...1.19.2 diff --git a/changelog.yaml b/changelog.yaml index 87782c33..7e33f91f 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -218,3 +218,9 @@ added: - Calibration of the QBID tax expenditure. date: 2025-05-13 13:29:57 +- bump: patch + changes: + fixed: + - Data downloads for Census datasets disabled. + - Warning added for downsampling non-existent policyengine-[country] variables. 
+ date: 2025-05-14 13:31:21 diff --git a/policyengine_us_data/datasets/acs/census_acs.py b/policyengine_us_data/datasets/acs/census_acs.py index f6ec8f4b..842af627 100644 --- a/policyengine_us_data/datasets/acs/census_acs.py +++ b/policyengine_us_data/datasets/acs/census_acs.py @@ -206,4 +206,3 @@ class CensusACS_2022(CensusACS): name = "census_acs_2022.h5" file_path = STORAGE_FOLDER / "census_acs_2022.h5" time_period = 2022 - url = "hf://policyengine/policyengine-us-data/census_acs_2022.h5" diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py index 287f54f5..896140af 100644 --- a/policyengine_us_data/datasets/cps/census_cps.py +++ b/policyengine_us_data/datasets/cps/census_cps.py @@ -124,7 +124,6 @@ class CensusCPS_2023(CensusCPS): name = "census_cps_2023" file_path = STORAGE_FOLDER / "census_cps_2023.h5" data_format = Dataset.TABLES - url = "hf://policyengine/policyengine-us-data/census_cps_2023.h5" class CensusCPS_2022(CensusCPS): @@ -133,7 +132,6 @@ class CensusCPS_2022(CensusCPS): name = "census_cps_2022" file_path = STORAGE_FOLDER / "census_cps_2022.h5" data_format = Dataset.TABLES - url = "hf://policyengine/policyengine-us-data/census_cps_2022.h5" class CensusCPS_2021(CensusCPS): @@ -142,7 +140,6 @@ class CensusCPS_2021(CensusCPS): name = "census_cps_2021" file_path = STORAGE_FOLDER / "census_cps_2021.h5" data_format = Dataset.TABLES - url = "hf://policyengine/policyengine-us-data/census_cps_2021.h5" class CensusCPS_2020(CensusCPS): @@ -151,7 +148,6 @@ class CensusCPS_2020(CensusCPS): name = "census_cps_2020" file_path = STORAGE_FOLDER / "census_cps_2020.h5" data_format = Dataset.TABLES - url = "hf://policyengine/policyengine-us-data/census_cps_2020.h5" class CensusCPS_2019(CensusCPS): @@ -303,4 +299,5 @@ class CensusCPS_2018(CensusCPS): "POTC_VAL", "PMED_VAL", "PEMCPREM", + "NOW_GRP", ] diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 
eefbd2e6..666151ac 100644
--- a/policyengine_us_data/datasets/cps/cps.py
+++ b/policyengine_us_data/datasets/cps/cps.py
@@ -12,6 +12,7 @@
     create_policyengine_uprating_factors_table,
 )
 from policyengine_us_data.utils import QRF
+import logging
 
 
 class CPS(Dataset):
@@ -80,6 +81,9 @@ def downsample(self, frac: float):
 
         for key in original_data:
             if key not in sim.tax_benefit_system.variables:
+                logging.warning(
+                    f"Attempting to downsample the variable {key} but failing because it is not in the given country package."
+                )
                 continue
 
             values = sim.calculate(key).values
@@ -321,6 +325,8 @@ def children_per_parent(col: str) -> pd.DataFrame:
 
     cps["has_marketplace_health_coverage"] = person.MRK == 1
 
+    cps["has_esi"] = person.NOW_GRP == 1
+
     cps["cps_race"] = person.PRDTRACE
     cps["is_hispanic"] = person.PRDTHSP != 0
 
diff --git a/policyengine_us_data/datasets/cps/org.py b/policyengine_us_data/datasets/cps/org.py
new file mode 100644
index 00000000..a01c98d4
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/org.py
@@ -0,0 +1,72 @@
+from policyengine_core.data import Dataset
+import requests
+import zipfile
+import io
+import pandas as pd
+from policyengine_us_data.storage import STORAGE_FOLDER
+import h5py
+from tqdm import tqdm
+import huggingface_hub
+
+
+class CensusCPSOrg(Dataset):
+    """EPI CPS ORG microdata for 2024, stored as a single ``main`` table."""
+
+    file_path = STORAGE_FOLDER / "census_cps_org_2024.h5"
+    name = "census_cps_org_2024"
+    label = "Census CPS Org (2024)"
+    time_period = 2024
+    data_format = Dataset.TABLES
+
+    def generate(self):
+        """Read ``epi_cpsorg_2024.dta`` and store it under the ``main`` key.
+
+        Raises:
+            Exception: If the direct EPI download does not return HTTP 200.
+        """
+        # The 2024 Stata file is fetched from the Hugging Face repository
+        # unless this flag is switched on, in which case it is extracted
+        # from https://microdata.epi.org/epi_cpsorg_1979_2025.zip instead.
+        download_from_epi = False
+        if download_from_epi:
+            url = "https://microdata.epi.org/epi_cpsorg_1979_2025.zip"
+            with requests.get(url, stream=True) as response:
+                # Fail before streaming the body of an error response.
+                if response.status_code != 200:
+                    raise Exception(
+                        f"Failed to download file: {response.status_code}"
+                    )
+                total_size = int(response.headers.get("content-length", 0))
+                block_size = 8192
+                # Response.content is a read-only property, so the streamed
+                # bytes are collected in a local buffer instead.
+                buffer = io.BytesIO()
+                with tqdm(
+                    total=total_size,
+                    unit="iB",
+                    unit_scale=True,
+                    desc="Downloading CPS Org data",
+                ) as progress_bar:
+                    for data in response.iter_content(block_size):
+                        progress_bar.update(len(data))
+                        buffer.write(data)
+            with zipfile.ZipFile(buffer) as z:
+                with z.open("epi_cpsorg_2024.dta") as f:
+                    df = pd.read_stata(f)
+        else:
+            huggingface_hub.hf_hub_download(
+                repo_id="policyengine/policyengine-us-data",
+                filename="epi_cpsorg_2024.dta",
+                repo_type="model",
+                local_dir=STORAGE_FOLDER,
+            )
+            df = pd.read_stata(STORAGE_FOLDER / "epi_cpsorg_2024.dta")
+        for col in df.columns:
+            # Keep numeric columns as floats; fall back to strings for
+            # anything that cannot be converted.
+            try:
+                df[col] = df[col].astype(float)
+            except (ValueError, TypeError):
+                df[col] = df[col].astype(str)
+        with pd.HDFStore(self.file_path, "a") as f:
+            f.put("main", df)
diff --git a/policyengine_us_data/datasets/cps/overtime.py b/policyengine_us_data/datasets/cps/overtime.py
new file mode 100644
index 00000000..300b12c7
--- /dev/null
+++ b/policyengine_us_data/datasets/cps/overtime.py
@@ -0,0 +1,42 @@
+from microimpute.models.qrf import QRFResults
+from policyengine_us_data.datasets.cps.org import CensusCPSOrg
+from policyengine_us_data.storage import STORAGE_FOLDER
+import pickle
+
+
+def train_exemption_status_model() -> QRFResults:
+    """Train the exemption status model from the CPS ORG ``main`` table.
+
+    NOTE(review): still a stub -- the rule and training steps are
+    placeholders and the function currently returns ``...``.
+    """
+    org_df = CensusCPSOrg().load("main")
+
+    # Add exemption status using rules
+
+    ...
+
+    # Train the model
+
+    return ...
+
+
+def get_tip_model() -> QRFResults:
+    """Return the model cached at ``tips.pkl``, training it on first use.
+
+    NOTE(review): the cache is filled by ``train_exemption_status_model``
+    although the function and file are named for tips -- confirm intent.
+    """
+    model_path = STORAGE_FOLDER / "tips.pkl"
+
+    if not model_path.exists():
+        model = train_exemption_status_model()
+
+        with open(model_path, "wb") as f:
+            pickle.dump(model, f)
+    else:
+        # pickle.load executes arbitrary code; only load a trusted cache.
+        with open(model_path, "rb") as f:
+            model = pickle.load(f)
+
+    return model
diff --git a/pyproject.toml b/pyproject.toml
index bc053e53..fceb5138 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "policyengine_us_data"
-version = "1.21.0"
+version = "1.21.1"
 description = "A package to create representative microdata for the US."
 readme = "README.md"
 authors = [
@@ -23,6 +23,7 @@ dependencies = [
     "tqdm",
     "microdf_python>=0.4.3",
     "microimpute",
+    "pip-system-certs",
 ]
 
 [project.optional-dependencies]
diff --git a/test.ipynb b/test.ipynb
new file mode 100644
index 00000000..3b1991a1
--- /dev/null
+++ b/test.ipynb
@@ -0,0 +1,435 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "4a41b930",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from policyengine_us_data.datasets.cps.org import CensusCPSOrg\n",
+    "\n",
+    "\n",
+    "df = CensusCPSOrg().load(\"main\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "60b59227",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
| \n", + " | year | \n", + "month | \n", + "minsamp | \n", + "hrhhid | \n", + "hrhhid2 | \n", + "hrsample | \n", + "hrsersuf | \n", + "huhhnum | \n", + "pulineno | \n", + "unicon_recnum | \n", + "... | \n", + "differrands | \n", + "diffhear | \n", + "diffmemory | \n", + "diffphysical | \n", + "diffvision | \n", + "difficulty | \n", + "hoursu1i | \n", + "weekpay | \n", + "wage | \n", + "wageotc | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "2024.0 | \n", + "1.0 | \n", + "8.0 | \n", + "9.103430e+13 | \n", + "15011.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "1.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't not experience any diffulty | \n", + "40.0 | \n", + "876.919983 | \n", + "21.923 | \n", + "21.923000 | \n", + "
| 1 | \n", + "2024.0 | \n", + "1.0 | \n", + "4.0 | \n", + "3.906711e+14 | \n", + "16011.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "3.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't not experience any diffulty | \n", + "40.0 | \n", + "850.000000 | \n", + "20.000 | \n", + "21.250000 | \n", + "
| 2 | \n", + "2024.0 | \n", + "1.0 | \n", + "4.0 | \n", + "1.017104e+13 | \n", + "16111.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "2.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't not experience any diffulty | \n", + "44.0 | \n", + "1200.000000 | \n", + "17.500 | \n", + "17.500000 | \n", + "
| 3 | \n", + "2024.0 | \n", + "1.0 | \n", + "4.0 | \n", + "1.000010e+14 | \n", + "16011.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "1.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't not experience any diffulty | \n", + "40.0 | \n", + "640.000000 | \n", + "16.000 | \n", + "16.000000 | \n", + "
| 4 | \n", + "2024.0 | \n", + "1.0 | \n", + "4.0 | \n", + "2.314000e+14 | \n", + "16111.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "2.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't not experience any diffulty | \n", + "40.0 | \n", + "1350.000000 | \n", + "33.750 | \n", + "33.750000 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 243231 | \n", + "2024.0 | \n", + "12.0 | \n", + "4.0 | \n", + "1.004753e+14 | \n", + "18111.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "1.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't not experience any diffulty | \n", + "nan | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 243232 | \n", + "2024.0 | \n", + "12.0 | \n", + "8.0 | \n", + "2.610721e+14 | \n", + "16011.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "2.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Experiences difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Experiences difficulty | \n", + "Experiences one or more difficulty | \n", + "40.0 | \n", + "856.000000 | \n", + "21.400 | \n", + "21.400000 | \n", + "
| 243233 | \n", + "2024.0 | \n", + "12.0 | \n", + "4.0 | \n", + "1.106560e+14 | \n", + "18011.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "1.0 | \n", + "NaN | \n", + "... | \n", + "Experiences difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Experiences one or more difficulty | \n", + "24.0 | \n", + "360.000000 | \n", + "15.000 | \n", + "15.000000 | \n", + "
| 243234 | \n", + "2024.0 | \n", + "12.0 | \n", + "4.0 | \n", + "9.413007e+14 | \n", + "18011.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "1.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't not experience any diffulty | \n", + "30.0 | \n", + "540.000000 | \n", + "18.000 | \n", + "18.000000 | \n", + "
| 243235 | \n", + "2024.0 | \n", + "12.0 | \n", + "8.0 | \n", + "3.690201e+14 | \n", + "16011.0 | \n", + "\n", + " | \n", + " | NaN | \n", + "1.0 | \n", + "NaN | \n", + "... | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't experience difficulty | \n", + "Doesn't not experience any diffulty | \n", + "50.0 | \n", + "1380.000000 | \n", + "25.000 | \n", + "34.400002 | \n", + "
243236 rows × 138 columns
\n", + "