Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@
**/_build
!population_by_state.csv
**/*.pkl
**/*.dta
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.21.1] - 2025-05-14 13:31:21

### Fixed

- Data downloads for Census datasets disabled.
- Warning added for downsampling non-existent policyengine-[country] variables.

## [1.21.0] - 2025-05-13 13:29:57

### Added
Expand Down Expand Up @@ -266,6 +273,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[1.21.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.21.0...1.21.1
[1.21.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.20.0...1.21.0
[1.20.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.19.2...1.20.0
[1.19.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.19.1...1.19.2
Expand Down
6 changes: 6 additions & 0 deletions changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,9 @@
added:
- Calibration of the QBID tax expenditure.
date: 2025-05-13 13:29:57
- bump: patch
changes:
fixed:
- Data downloads for Census datasets disabled.
- Warning added for downsampling non-existent policyengine-[country] variables.
date: 2025-05-14 13:31:21
1 change: 0 additions & 1 deletion policyengine_us_data/datasets/acs/census_acs.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,3 @@ class CensusACS_2022(CensusACS):
name = "census_acs_2022.h5"
file_path = STORAGE_FOLDER / "census_acs_2022.h5"
time_period = 2022
url = "hf://policyengine/policyengine-us-data/census_acs_2022.h5"
5 changes: 1 addition & 4 deletions policyengine_us_data/datasets/cps/census_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ class CensusCPS_2023(CensusCPS):
name = "census_cps_2023"
file_path = STORAGE_FOLDER / "census_cps_2023.h5"
data_format = Dataset.TABLES
url = "hf://policyengine/policyengine-us-data/census_cps_2023.h5"


class CensusCPS_2022(CensusCPS):
Expand All @@ -133,7 +132,6 @@ class CensusCPS_2022(CensusCPS):
name = "census_cps_2022"
file_path = STORAGE_FOLDER / "census_cps_2022.h5"
data_format = Dataset.TABLES
url = "hf://policyengine/policyengine-us-data/census_cps_2022.h5"


class CensusCPS_2021(CensusCPS):
Expand All @@ -142,7 +140,6 @@ class CensusCPS_2021(CensusCPS):
name = "census_cps_2021"
file_path = STORAGE_FOLDER / "census_cps_2021.h5"
data_format = Dataset.TABLES
url = "hf://policyengine/policyengine-us-data/census_cps_2021.h5"


class CensusCPS_2020(CensusCPS):
Expand All @@ -151,7 +148,6 @@ class CensusCPS_2020(CensusCPS):
name = "census_cps_2020"
file_path = STORAGE_FOLDER / "census_cps_2020.h5"
data_format = Dataset.TABLES
url = "hf://policyengine/policyengine-us-data/census_cps_2020.h5"


class CensusCPS_2019(CensusCPS):
Expand Down Expand Up @@ -303,4 +299,5 @@ class CensusCPS_2018(CensusCPS):
"POTC_VAL",
"PMED_VAL",
"PEMCPREM",
"NOW_GRP",
]
6 changes: 6 additions & 0 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
create_policyengine_uprating_factors_table,
)
from policyengine_us_data.utils import QRF
import logging


class CPS(Dataset):
Expand Down Expand Up @@ -80,6 +81,9 @@ def downsample(self, frac: float):

for key in original_data:
if key not in sim.tax_benefit_system.variables:
logging.warning(
f"Attempting to downsample the variable {key} but failing because it is not in the given country package."
)
continue
values = sim.calculate(key).values

Expand Down Expand Up @@ -321,6 +325,8 @@ def children_per_parent(col: str) -> pd.DataFrame:

cps["has_marketplace_health_coverage"] = person.MRK == 1

cps["has_esi"] = person.NOW_GRP == 1

cps["cps_race"] = person.PRDTRACE
cps["is_hispanic"] = person.PRDTHSP != 0

Expand Down
62 changes: 62 additions & 0 deletions policyengine_us_data/datasets/cps/org.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from policyengine_core.data import Dataset
import requests
import zipfile
import io
import pandas as pd
from policyengine_us_data.storage import STORAGE_FOLDER
import h5py
from tqdm import tqdm
import huggingface_hub


class CensusCPSOrg(Dataset):
    """Raw 2024 CPS ORG extract stored as a single ``"main"`` HDF table.

    The source file is ``epi_cpsorg_2024.dta``, fetched either from the
    ``microdata.epi.org`` archive or from the
    ``policyengine/policyengine-us-data`` Hugging Face repository.
    """

    file_path = STORAGE_FOLDER / "census_cps_org_2024.h5"
    name = "census_cps_org_2024"
    label = "Census CPS Org (2024)"
    time_period = 2024
    data_format = Dataset.TABLES

    def generate(self):
        """Fetch the 2024 Stata file and write it to ``self.file_path``.

        Every column is coerced to ``float`` where possible and to ``str``
        otherwise, then the frame is stored under the ``"main"`` key.

        Raises:
            Exception: If the direct download is enabled and the server
                responds with a status code other than 200.
        """
        # Download from https://microdata.epi.org/epi_cpsorg_1979_2025.zip
        # Extract the file and read the epi_cpsorg_2024.dta with pandas
        DOWNLOAD_FROM_CENSUS = False
        if DOWNLOAD_FROM_CENSUS:
            url = "https://microdata.epi.org/epi_cpsorg_1979_2025.zip"
            response = requests.get(url, stream=True)
            # Fail fast: check the status before streaming the body rather
            # than after downloading what may be an error page.
            if response.status_code != 200:
                raise Exception(
                    f"Failed to download file: {response.status_code}"
                )
            total_size = int(response.headers.get("content-length", 0))
            block_size = 8192
            # Accumulate chunks in a BytesIO buffer. `Response.content` is a
            # read-only property, so it cannot be assigned to, and repeated
            # `bytes +=` would copy the growing payload on every chunk.
            buffer = io.BytesIO()
            with tqdm(
                total=total_size,
                unit="iB",
                unit_scale=True,
                desc="Downloading CPS Org data",
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    buffer.write(data)
            with zipfile.ZipFile(buffer) as z:
                with z.open("epi_cpsorg_2024.dta") as f:
                    df = pd.read_stata(f)
        else:
            huggingface_hub.hf_hub_download(
                repo_id="policyengine/policyengine-us-data",
                filename="epi_cpsorg_2024.dta",
                repo_type="model",
                local_dir=STORAGE_FOLDER,
            )
            df = pd.read_stata(STORAGE_FOLDER / "epi_cpsorg_2024.dta")
        for col in df.columns:
            try:
                df[col] = df[col].astype(float)
            except (ValueError, TypeError):
                # Non-numeric column (e.g. labelled categories): keep it as
                # text instead of aborting the whole conversion.
                df[col] = df[col].astype(str)
        with pd.HDFStore(self.file_path, "a") as f:
            f.put("main", df)
32 changes: 32 additions & 0 deletions policyengine_us_data/datasets/cps/overtime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from microimpute.models.qrf import QRFResults
from policyengine_us_data.datasets.cps.org import CensusCPSOrg
from policyengine_us_data.storage import STORAGE_FOLDER
import pickle


def train_exemption_status_model() -> QRFResults:
    """Train the exemption-status model (placeholder, not yet implemented).

    At present this only loads the ``"main"`` table of ``CensusCPSOrg``.
    The rule-based labelling and the model fitting are still stubs, so the
    value returned is ``Ellipsis`` rather than a fitted ``QRFResults``.
    """
    # Loaded for the steps below; unused until they are written.
    org_table = CensusCPSOrg().load("main")

    # TODO: add exemption status using rules.

    # TODO: train the model and return it instead of the placeholder.
    return Ellipsis


def get_tip_model() -> QRFResults:
    """Return the model cached at ``STORAGE_FOLDER / "tips.pkl"``.

    If the pickle file exists it is loaded and returned. Otherwise the model
    is produced by ``train_exemption_status_model()``, pickled to that path,
    and returned.
    """
    cache_file = STORAGE_FOLDER / "tips.pkl"

    if cache_file.exists():
        # NOTE(review): pickle.load can execute arbitrary code; this relies
        # on the storage folder only containing trusted files.
        with open(cache_file, "rb") as handle:
            return pickle.load(handle)

    # NOTE(review): the function name and "tips.pkl" look inherited from a
    # tips module, while the model trained here is the exemption-status
    # one - confirm the cache path is not shared with another model.
    trained = train_exemption_status_model()
    with open(cache_file, "wb") as handle:
        pickle.dump(trained, handle)
    return trained
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "policyengine_us_data"
version = "1.21.0"
version = "1.21.1"
description = "A package to create representative microdata for the US."
readme = "README.md"
authors = [
Expand All @@ -23,6 +23,7 @@ dependencies = [
"tqdm",
"microdf_python>=0.4.3",
"microimpute",
"pip-system-certs",
]

[project.optional-dependencies]
Expand Down
Loading