diff --git a/.gitignore b/.gitignore index a63f8215..0d209ad9 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ **/_build !population_by_state.csv **/*.pkl +**/*.dta diff --git a/CHANGELOG.md b/CHANGELOG.md index a7bc8c0d..577bfe43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.21.1] - 2025-05-14 13:31:21 + +### Fixed + +- Data downloads for Census datasets disabled. +- Warning added for downsampling non-existent policyengine-[country] variables. + ## [1.21.0] - 2025-05-13 13:29:57 ### Added @@ -266,6 +273,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.21.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.21.0...1.21.1 [1.21.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.20.0...1.21.0 [1.20.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.19.2...1.20.0 [1.19.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.19.1...1.19.2 diff --git a/changelog.yaml b/changelog.yaml index 87782c33..7e33f91f 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -218,3 +218,9 @@ added: - Calibration of the QBID tax expenditure. date: 2025-05-13 13:29:57 +- bump: patch + changes: + fixed: + - Data downloads for Census datasets disabled. + - Warning added for downsampling non-existent policyengine-[country] variables. + date: 2025-05-14 13:31:21 diff --git a/policyengine_us_data/datasets/acs/census_acs.py b/policyengine_us_data/datasets/acs/census_acs.py index f6ec8f4b..842af627 100644 --- a/policyengine_us_data/datasets/acs/census_acs.py +++ b/policyengine_us_data/datasets/acs/census_acs.py @@ -206,4 +206,3 @@ class CensusACS_2022(CensusACS): name = "census_acs_2022.h5" file_path = STORAGE_FOLDER / "census_acs_2022.h5" time_period = 2022 - url = "hf://policyengine/policyengine-us-data/census_acs_2022.h5" diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py index 287f54f5..896140af 100644 --- a/policyengine_us_data/datasets/cps/census_cps.py +++ b/policyengine_us_data/datasets/cps/census_cps.py @@ -124,7 +124,6 @@ class CensusCPS_2023(CensusCPS): name = "census_cps_2023" file_path = STORAGE_FOLDER / "census_cps_2023.h5" data_format = Dataset.TABLES - url = "hf://policyengine/policyengine-us-data/census_cps_2023.h5" class CensusCPS_2022(CensusCPS): @@ -133,7 +132,6 @@ class CensusCPS_2022(CensusCPS): name = "census_cps_2022" file_path = STORAGE_FOLDER / "census_cps_2022.h5" data_format = Dataset.TABLES - url = "hf://policyengine/policyengine-us-data/census_cps_2022.h5" class CensusCPS_2021(CensusCPS): @@ -142,7 +140,6 @@ class CensusCPS_2021(CensusCPS): name = "census_cps_2021" file_path = STORAGE_FOLDER / "census_cps_2021.h5" data_format = Dataset.TABLES - url = "hf://policyengine/policyengine-us-data/census_cps_2021.h5" class CensusCPS_2020(CensusCPS): @@ -151,7 +148,6 @@ class CensusCPS_2020(CensusCPS): name = "census_cps_2020" file_path = STORAGE_FOLDER / "census_cps_2020.h5" data_format = Dataset.TABLES - url = "hf://policyengine/policyengine-us-data/census_cps_2020.h5" class CensusCPS_2019(CensusCPS): @@ -303,4 +299,5 @@ class CensusCPS_2018(CensusCPS): "POTC_VAL", "PMED_VAL", "PEMCPREM", + "NOW_GRP", ] diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index eefbd2e6..666151ac 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -12,6 +12,7 @@ create_policyengine_uprating_factors_table, ) from policyengine_us_data.utils import QRF +import logging class CPS(Dataset): @@ -80,6 +81,9 @@ def downsample(self, frac: float): for key in original_data: if key not in sim.tax_benefit_system.variables: + logging.warning( + f"Attempting to downsample the variable {key} but failing because it is not in the given country package." + ) continue values = sim.calculate(key).values @@ -321,6 +325,8 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["has_marketplace_health_coverage"] = person.MRK == 1 + cps["has_esi"] = person.NOW_GRP == 1 + cps["cps_race"] = person.PRDTRACE cps["is_hispanic"] = person.PRDTHSP != 0 diff --git a/policyengine_us_data/datasets/cps/org.py b/policyengine_us_data/datasets/cps/org.py new file mode 100644 index 00000000..a01c98d4 --- /dev/null +++ b/policyengine_us_data/datasets/cps/org.py @@ -0,0 +1,62 @@ +from policyengine_core.data import Dataset +import requests +import zipfile +import io +import pandas as pd +from policyengine_us_data.storage import STORAGE_FOLDER +import h5py +from tqdm import tqdm +import huggingface_hub + + +class CensusCPSOrg(Dataset): + file_path = STORAGE_FOLDER / "census_cps_org_2024.h5" + name = "census_cps_org_2024" + label = "Census CPS Org (2024)" + time_period = 2024 + data_format = Dataset.TABLES + + def generate(self): + + # Download from https://microdata.epi.org/epi_cpsorg_1979_2025.zip + # Extract the file and read the epi_cpsorg_2024.dta with pandas + DOWNLOAD_FROM_CENSUS = False + if DOWNLOAD_FROM_CENSUS: + url = "https://microdata.epi.org/epi_cpsorg_1979_2025.zip" + response = requests.get(url, stream=True) + total_size = int(response.headers.get("content-length", 0)) + block_size = 8192 + progress_bar = tqdm( + total=total_size, + unit="iB", + unit_scale=True, + desc="Downloading CPS Org data", + ) + content = b"" + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + content += data + progress_bar.close() + response.content = content + if response.status_code != 200: + raise Exception( + f"Failed to download file: {response.status_code}" + ) + with zipfile.ZipFile(io.BytesIO(response.content)) as z: + with z.open("epi_cpsorg_2024.dta") as f: + df = pd.read_stata(f) + else: + huggingface_hub.hf_hub_download( + repo_id="policyengine/policyengine-us-data", + filename="epi_cpsorg_2024.dta", + repo_type="model", + local_dir=STORAGE_FOLDER, + ) + df = pd.read_stata(STORAGE_FOLDER / "epi_cpsorg_2024.dta") + for col in df.columns: + try: + df[col] = df[col].astype(float) + except: + df[col] = df[col].astype(str) + with pd.HDFStore(self.file_path, "a") as f: + f.put("main", df) diff --git a/policyengine_us_data/datasets/cps/overtime.py b/policyengine_us_data/datasets/cps/overtime.py new file mode 100644 index 00000000..300b12c7 --- /dev/null +++ b/policyengine_us_data/datasets/cps/overtime.py @@ -0,0 +1,32 @@ +from microimpute.models.qrf import QRFResults +from policyengine_us_data.datasets.cps.org import CensusCPSOrg +from policyengine_us_data.storage import STORAGE_FOLDER +import pickle + + +def train_exemption_status_model() -> QRFResults: + + org_df = CensusCPSOrg().load("main") + + # Add exemption status using rules + + ... + + # Train the model + + return ... + + +def get_tip_model() -> QRFResults: + model_path = STORAGE_FOLDER / "tips.pkl" + + if not model_path.exists(): + model = train_exemption_status_model() + + with open(model_path, "wb") as f: + pickle.dump(model, f) + else: + with open(model_path, "rb") as f: + model = pickle.load(f) + + return model diff --git a/pyproject.toml b/pyproject.toml index bc053e53..fceb5138 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.21.0" +version = "1.21.1" description = "A package to create representative microdata for the US." readme = "README.md" authors = [ @@ -23,6 +23,7 @@ dependencies = [ "tqdm", "microdf_python>=0.4.3", "microimpute", + "pip-system-certs", ] [project.optional-dependencies] diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 00000000..3b1991a1 --- /dev/null +++ b/test.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 19, + "id": "4a41b930", + "metadata": {}, + "outputs": [], + "source": [ + "from policyengine_us_data.datasets.cps.org import CensusCPSOrg\n", + "\n", + "\n", + "df = CensusCPSOrg().load(\"main\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "60b59227", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearmonthminsamphrhhidhrhhid2hrsamplehrsersufhuhhnumpulinenounicon_recnum...differrandsdiffheardiffmemorydiffphysicaldiffvisiondifficultyhoursu1iweekpaywagewageotc
02024.01.08.09.103430e+1315011.0NaN1.0NaN...Doesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't not experience any diffulty40.0876.91998321.92321.923000
12024.01.04.03.906711e+1416011.0NaN3.0NaN...Doesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't not experience any diffulty40.0850.00000020.00021.250000
22024.01.04.01.017104e+1316111.0NaN2.0NaN...Doesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't not experience any diffulty44.01200.00000017.50017.500000
32024.01.04.01.000010e+1416011.0NaN1.0NaN...Doesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't not experience any diffulty40.0640.00000016.00016.000000
42024.01.04.02.314000e+1416111.0NaN2.0NaN...Doesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't not experience any diffulty40.01350.00000033.75033.750000
..................................................................
2432312024.012.04.01.004753e+1418111.0NaN1.0NaN...Doesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't not experience any diffultynanNaNNaNNaN
2432322024.012.08.02.610721e+1416011.0NaN2.0NaN...Doesn't experience difficultyExperiences difficultyDoesn't experience difficultyDoesn't experience difficultyExperiences difficultyExperiences one or more difficulty40.0856.00000021.40021.400000
2432332024.012.04.01.106560e+1418011.0NaN1.0NaN...Experiences difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyExperiences one or more difficulty24.0360.00000015.00015.000000
2432342024.012.04.09.413007e+1418011.0NaN1.0NaN...Doesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't not experience any diffulty30.0540.00000018.00018.000000
2432352024.012.08.03.690201e+1416011.0NaN1.0NaN...Doesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't experience difficultyDoesn't not experience any diffulty50.01380.00000025.00034.400002
\n", + "

243236 rows × 138 columns

\n", + "
" + ], + "text/plain": [ + " year month minsamp hrhhid hrhhid2 hrsample hrsersuf \\\n", + "0 2024.0 1.0 8.0 9.103430e+13 15011.0 \n", + "1 2024.0 1.0 4.0 3.906711e+14 16011.0 \n", + "2 2024.0 1.0 4.0 1.017104e+13 16111.0 \n", + "3 2024.0 1.0 4.0 1.000010e+14 16011.0 \n", + "4 2024.0 1.0 4.0 2.314000e+14 16111.0 \n", + "... ... ... ... ... ... ... ... \n", + "243231 2024.0 12.0 4.0 1.004753e+14 18111.0 \n", + "243232 2024.0 12.0 8.0 2.610721e+14 16011.0 \n", + "243233 2024.0 12.0 4.0 1.106560e+14 18011.0 \n", + "243234 2024.0 12.0 4.0 9.413007e+14 18011.0 \n", + "243235 2024.0 12.0 8.0 3.690201e+14 16011.0 \n", + "\n", + " huhhnum pulineno unicon_recnum ... differrands \\\n", + "0 NaN 1.0 NaN ... Doesn't experience difficulty \n", + "1 NaN 3.0 NaN ... Doesn't experience difficulty \n", + "2 NaN 2.0 NaN ... Doesn't experience difficulty \n", + "3 NaN 1.0 NaN ... Doesn't experience difficulty \n", + "4 NaN 2.0 NaN ... Doesn't experience difficulty \n", + "... ... ... ... ... ... \n", + "243231 NaN 1.0 NaN ... Doesn't experience difficulty \n", + "243232 NaN 2.0 NaN ... Doesn't experience difficulty \n", + "243233 NaN 1.0 NaN ... Experiences difficulty \n", + "243234 NaN 1.0 NaN ... Doesn't experience difficulty \n", + "243235 NaN 1.0 NaN ... Doesn't experience difficulty \n", + "\n", + " diffhear diffmemory \\\n", + "0 Doesn't experience difficulty Doesn't experience difficulty \n", + "1 Doesn't experience difficulty Doesn't experience difficulty \n", + "2 Doesn't experience difficulty Doesn't experience difficulty \n", + "3 Doesn't experience difficulty Doesn't experience difficulty \n", + "4 Doesn't experience difficulty Doesn't experience difficulty \n", + "... ... ... \n", + "243231 Doesn't experience difficulty Doesn't experience difficulty \n", + "243232 Experiences difficulty Doesn't experience difficulty \n", + "243233 Doesn't experience difficulty Doesn't experience difficulty \n", + "243234 Doesn't experience difficulty Doesn't experience difficulty \n", + "243235 Doesn't experience difficulty Doesn't experience difficulty \n", + "\n", + " diffphysical diffvision \\\n", + "0 Doesn't experience difficulty Doesn't experience difficulty \n", + "1 Doesn't experience difficulty Doesn't experience difficulty \n", + "2 Doesn't experience difficulty Doesn't experience difficulty \n", + "3 Doesn't experience difficulty Doesn't experience difficulty \n", + "4 Doesn't experience difficulty Doesn't experience difficulty \n", + "... ... ... \n", + "243231 Doesn't experience difficulty Doesn't experience difficulty \n", + "243232 Doesn't experience difficulty Experiences difficulty \n", + "243233 Doesn't experience difficulty Doesn't experience difficulty \n", + "243234 Doesn't experience difficulty Doesn't experience difficulty \n", + "243235 Doesn't experience difficulty Doesn't experience difficulty \n", + "\n", + " difficulty hoursu1i weekpay wage \\\n", + "0 Doesn't not experience any diffulty 40.0 876.919983 21.923 \n", + "1 Doesn't not experience any diffulty 40.0 850.000000 20.000 \n", + "2 Doesn't not experience any diffulty 44.0 1200.000000 17.500 \n", + "3 Doesn't not experience any diffulty 40.0 640.000000 16.000 \n", + "4 Doesn't not experience any diffulty 40.0 1350.000000 33.750 \n", + "... ... ... ... ... \n", + "243231 Doesn't not experience any diffulty nan NaN NaN \n", + "243232 Experiences one or more difficulty 40.0 856.000000 21.400 \n", + "243233 Experiences one or more difficulty 24.0 360.000000 15.000 \n", + "243234 Doesn't not experience any diffulty 30.0 540.000000 18.000 \n", + "243235 Doesn't not experience any diffulty 50.0 1380.000000 25.000 \n", + "\n", + " wageotc \n", + "0 21.923000 \n", + "1 21.250000 \n", + "2 17.500000 \n", + "3 16.000000 \n", + "4 33.750000 \n", + "... ... \n", + "243231 NaN \n", + "243232 21.400000 \n", + "243233 15.000000 \n", + "243234 18.000000 \n", + "243235 34.400002 \n", + "\n", + "[243236 rows x 138 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}