PolicyEngine · nikhilwoodruff · May 14, 2025 · May 14, 2025 · May 14, 2025 · May 13, 2025
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@
 **/_build
 !population_by_state.csv
 **/*.pkl
+**/*.dta
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.21.1] - 2025-05-14 13:31:21
+
+### Fixed
+
+- Data downloads for Census datasets disabled.
+- Warning added for downsampling non-existent policyengine-[country] variables.
+
 ## [1.21.0] - 2025-05-13 13:29:57
 
 ### Added
@@ -266,6 +273,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[1.21.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.21.0...1.21.1
 [1.21.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.20.0...1.21.0
 [1.20.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.19.2...1.20.0
 [1.19.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.19.1...1.19.2

diff --git a/changelog.yaml b/changelog.yaml
@@ -218,3 +218,9 @@
     added:
     - Calibration of the QBID tax expenditure.
   date: 2025-05-13 13:29:57
+- bump: patch
+  changes:
+    fixed:
+    - Data downloads for Census datasets disabled.
+    - Warning added for downsampling non-existent policyengine-[country] variables.
+  date: 2025-05-14 13:31:21
diff --git a/policyengine_us_data/datasets/acs/census_acs.py b/policyengine_us_data/datasets/acs/census_acs.py
@@ -206,4 +206,3 @@ class CensusACS_2022(CensusACS):
     name = "census_acs_2022.h5"
     file_path = STORAGE_FOLDER / "census_acs_2022.h5"
     time_period = 2022
-    url = "hf://policyengine/policyengine-us-data/census_acs_2022.h5"
diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py
@@ -124,7 +124,6 @@ class CensusCPS_2023(CensusCPS):
     name = "census_cps_2023"
     file_path = STORAGE_FOLDER / "census_cps_2023.h5"
     data_format = Dataset.TABLES
-    url = "hf://policyengine/policyengine-us-data/census_cps_2023.h5"
 
 
 class CensusCPS_2022(CensusCPS):
@@ -133,7 +132,6 @@ class CensusCPS_2022(CensusCPS):
     name = "census_cps_2022"
     file_path = STORAGE_FOLDER / "census_cps_2022.h5"
     data_format = Dataset.TABLES
-    url = "hf://policyengine/policyengine-us-data/census_cps_2022.h5"
 
 
 class CensusCPS_2021(CensusCPS):
@@ -142,7 +140,6 @@ class CensusCPS_2021(CensusCPS):
     name = "census_cps_2021"
     file_path = STORAGE_FOLDER / "census_cps_2021.h5"
     data_format = Dataset.TABLES
-    url = "hf://policyengine/policyengine-us-data/census_cps_2021.h5"
 
 
 class CensusCPS_2020(CensusCPS):
@@ -151,7 +148,6 @@ class CensusCPS_2020(CensusCPS):
     name = "census_cps_2020"
     file_path = STORAGE_FOLDER / "census_cps_2020.h5"
     data_format = Dataset.TABLES
-    url = "hf://policyengine/policyengine-us-data/census_cps_2020.h5"
 
 
 class CensusCPS_2019(CensusCPS):
@@ -303,4 +299,5 @@ class CensusCPS_2018(CensusCPS):
     "POTC_VAL",
     "PMED_VAL",
     "PEMCPREM",
+    "NOW_GRP",
 ]
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -12,6 +12,7 @@
     create_policyengine_uprating_factors_table,
 )
 from policyengine_us_data.utils import QRF
+import logging
 
 
 class CPS(Dataset):
@@ -80,6 +81,9 @@ def downsample(self, frac: float):
 
         for key in original_data:
             if key not in sim.tax_benefit_system.variables:
+                logging.warning(
+                    f"Attempting to downsample the variable {key} but failing because it is not in the given country package."
+                )
                 continue
             values = sim.calculate(key).values
 
@@ -321,6 +325,8 @@ def children_per_parent(col: str) -> pd.DataFrame:
 
     cps["has_marketplace_health_coverage"] = person.MRK == 1
 
+    cps["has_esi"] = person.NOW_GRP == 1
+
     cps["cps_race"] = person.PRDTRACE
     cps["is_hispanic"] = person.PRDTHSP != 0
 

diff --git a/policyengine_us_data/datasets/cps/org.py b/policyengine_us_data/datasets/cps/org.py
@@ -0,0 +1,75 @@
+from policyengine_core.data import Dataset
+import requests
+import zipfile
+import io
+import pandas as pd
+from policyengine_us_data.storage import STORAGE_FOLDER
+import h5py
+from tqdm import tqdm
+import huggingface_hub
+
+
+class CensusCPSOrg(Dataset):
+    """2024 CPS ORG microdata from the EPI extracts, stored as one table.
+
+    ``generate`` reads ``epi_cpsorg_2024.dta`` and writes it to ``file_path``
+    under the key ``"main"``. ("ORG" is presumably the CPS Outgoing Rotation
+    Group -- confirm against the EPI microdata documentation.)
+    """
+
+    file_path = STORAGE_FOLDER / "census_cps_org_2024.h5"
+    name = "census_cps_org_2024"
+    label = "Census CPS Org (2024)"
+    time_period = 2024
+    data_format = Dataset.TABLES
+
+    def generate(self):
+        """Fetch the 2024 Stata file, normalise column dtypes and save it.
+
+        Raises:
+            Exception: If the direct EPI download returns a non-200 status.
+        """
+        # By default the file is fetched from the Hugging Face Hub; flip this
+        # flag to pull the full 1979-2025 archive straight from EPI instead.
+        DOWNLOAD_FROM_EPI = False
+        if DOWNLOAD_FROM_EPI:
+            url = "https://microdata.epi.org/epi_cpsorg_1979_2025.zip"
+            with requests.get(url, stream=True) as response:
+                # Fail before streaming the body of an error response.
+                if response.status_code != 200:
+                    raise Exception(
+                        f"Failed to download file: {response.status_code}"
+                    )
+                total_size = int(response.headers.get("content-length", 0))
+                # Collect chunks in a buffer: Response.content is a read-only
+                # property, and repeated bytes concatenation re-copies data.
+                buffer = io.BytesIO()
+                with tqdm(
+                    total=total_size,
+                    unit="iB",
+                    unit_scale=True,
+                    desc="Downloading CPS Org data",
+                ) as progress_bar:
+                    for chunk in response.iter_content(8192):
+                        progress_bar.update(len(chunk))
+                        buffer.write(chunk)
+            with zipfile.ZipFile(buffer) as z:
+                with z.open("epi_cpsorg_2024.dta") as f:
+                    df = pd.read_stata(f)
+        else:
+            dta_path = huggingface_hub.hf_hub_download(
+                repo_id="policyengine/policyengine-us-data",
+                filename="epi_cpsorg_2024.dta",
+                repo_type="model",
+                local_dir=STORAGE_FOLDER,
+            )
+            df = pd.read_stata(dta_path)
+        # Keep numeric columns as floats; any column that cannot be
+        # converted is stored as strings instead.
+        for col in df.columns:
+            try:
+                df[col] = df[col].astype(float)
+            except (TypeError, ValueError):
+                df[col] = df[col].astype(str)
+        with pd.HDFStore(self.file_path, "a") as f:
+            f.put("main", df)
diff --git a/policyengine_us_data/datasets/cps/overtime.py b/policyengine_us_data/datasets/cps/overtime.py
@@ -0,0 +1,41 @@
+from microimpute.models.qrf import QRFResults
+from policyengine_us_data.datasets.cps.org import CensusCPSOrg
+from policyengine_us_data.storage import STORAGE_FOLDER
+import pickle
+
+
+def train_exemption_status_model() -> QRFResults:
+    """Train the exemption-status model (not implemented yet).
+
+    Raises:
+        NotImplementedError: Always, until the labelling rules and the
+            training step are written.
+    """
+    # TODO: load the table with CensusCPSOrg().load("main"), add exemption
+    # status using rules, then train the model.
+    #
+    # Fail loudly rather than return a placeholder: get_tip_model() pickles
+    # whatever this returns and would keep serving it from tips.pkl.
+    raise NotImplementedError(
+        "train_exemption_status_model is not implemented yet."
+    )
+
+
+def get_tip_model() -> QRFResults:
+    """Return the model cached in ``tips.pkl``, training it on first use.
+
+    NOTE(review): the cache is named "tips.pkl" but is filled by
+    train_exemption_status_model() -- confirm which model is intended.
+    """
+    model_path = STORAGE_FOLDER / "tips.pkl"
+
+    if model_path.exists():
+        # NOTE: pickle.load can execute arbitrary code; this assumes
+        # tips.pkl was written by this function and not by a third party.
+        with open(model_path, "rb") as f:
+            return pickle.load(f)
+
+    model = train_exemption_status_model()
+    with open(model_path, "wb") as f:
+        pickle.dump(model, f)
+    return model
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "policyengine_us_data"
-version = "1.21.0"
+version = "1.21.1"
 description = "A package to create representative microdata for the US."
 readme = "README.md"
 authors = [
@@ -23,6 +23,7 @@ dependencies = [
     "tqdm",
     "microdf_python>=0.4.3",
     "microimpute",
+    "pip-system-certs",
 ]
 
 [project.optional-dependencies]