Skip to content
Merged
9 changes: 9 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
- bump: patch
changes:
added:
- Conversion of county FIPS codes to county enum items
- Helper function to convert string county names to enum keys
- Function to download and parse county FIPS dataset from Hugging Face
changed:
- Modified county variable to depend on FIPS input, then on ZIP code
- Modified county variable to use helper function for conversion from county names to enum keys
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ metadata:
href: https://legislature.idaho.gov/sessioninfo/2025/legislation/h0231/
- title: House Bill 231 (Bill Text)
href: https://legislature.idaho.gov/wp-content/uploads/sessioninfo/2025/legislation/H0231.pdf
# Recently passed law not yet added to statute.
# Recently passed law not yet added to statute.
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ metadata:

values:
2021-01-01: true
2025-01-01: false
2025-01-01: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
- name: County based on county FIPS for Nassau County, New York state
period: 2025
input:
county_fips: "36059"
output:
county: NASSAU_COUNTY_NY

- name: County derived from county FIPS for Philadelphia, Pennsylvania state
period: 2025
input:
county_fips: "42101"
output:
county: PHILADELPHIA_COUNTY_PA

- name: County derived from county FIPS for Los Angeles County, California
period: 2025
input:
county_fips: "06037"
output:
county: LOS_ANGELES_COUNTY_CA

- name: County equivalent for District of Columbia
period: 2025
input:
county_fips: "11001"
output:
county: DISTRICT_OF_COLUMBIA_DC

- name: FIPS for Mayagüez, Puerto Rico
period: 2025
input:
county_fips: "72097"
output:
county: MAYAGÜEZ_MUNICIPIO_PR

- name: County FIPS for vectorized input
period: 2025
input:
county_fips: ["36059", "06037", "26163", "32003"]
output:
county: [NASSAU_COUNTY_NY, LOS_ANGELES_COUNTY_CA, WAYNE_COUNTY_MI, CLARK_COUNTY_NV]
150 changes: 150 additions & 0 deletions policyengine_us/tests/utilities/test_load_county_fips_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from policyengine_core.tools.hugging_face import download_huggingface_dataset
from policyengine_us.tools.geography.county_helpers import (
load_county_fips_dataset,
)
from pathlib import Path
import pytest
import pandas as pd
import gzip


@pytest.fixture
def tmp_fips_dir(tmp_path) -> Path:
    """
    Provide a fresh directory in which to place the FIPS dataset.

    The directory is named "county_fips_dataset", is created beneath the
    ``tmp_path`` fixture value, and is returned as a Path object.
    """
    fips_dir: Path = tmp_path / "county_fips_dataset"
    fips_dir.mkdir()
    return fips_dir


@pytest.fixture
def mock_dataset_file(tmp_fips_dir) -> Path:
    """
    Write a three-row gzipped CSV shaped like the county FIPS dataset.

    The file is created inside ``tmp_fips_dir`` under the name
    "county_fips_2020.csv.gz" and its path is returned.
    """
    columns = {
        "county_fips": ["01001", "02002", "03003"],
        "county_name": ["Test County 1", "Test County 2", "Test County 3"],
        "state": ["AL", "AK", "AZ"],
    }
    destination = tmp_fips_dir / "county_fips_2020.csv.gz"

    # Stream the CSV through a binary gzip handle so the file on disk is
    # compressed.
    with gzip.open(destination, "wb") as archive:
        pd.DataFrame(columns).to_csv(archive, index=False, encoding="utf-8")

    return destination


def mock_download_huggingface_dataset_success(filepath):
    """
    Build a stand-in for download_huggingface_dataset that always succeeds.

    The returned callable accepts any positional or keyword arguments,
    ignores them, and returns ``filepath``.
    """
    return lambda *args, **kwargs: filepath


def mock_download_huggingface_dataset_failure(filepath):
    """
    Build a stand-in for download_huggingface_dataset that always fails.

    The returned callable accepts any positional or keyword arguments and
    raises ``Exception("Download failed")``. ``filepath`` is accepted but
    not used.
    """

    def _always_fail(*_args, **_kwargs):
        raise Exception("Download failed")

    return _always_fail


class TestCountyFIPSDatasetFile:
    """
    Check that the county FIPS dataset file can be downloaded and that the
    downloaded file parses with the expected column.
    """

    HUGGINGFACE_REPO = "policyengine/policyengine-us-data"
    COUNTY_FIPS_DATASET_FILENAME = "county_fips_2020.csv.gz"

    def _download_into(self, target_dir) -> Path:
        """Download the dataset into ``target_dir``; return the expected file path."""
        download_huggingface_dataset(
            repo=self.HUGGINGFACE_REPO,
            repo_filename=self.COUNTY_FIPS_DATASET_FILENAME,
            version=None,
            local_dir=target_dir,
        )
        return target_dir / self.COUNTY_FIPS_DATASET_FILENAME

    def test_when_downloading_county_fips__download_is_successful(
        self, tmp_fips_dir
    ):
        downloaded = self._download_into(tmp_fips_dir)

        assert downloaded.is_file()

    def test_when_downloading_and_parsing_county_fips__result_is_correct(
        self, tmp_fips_dir
    ):
        downloaded = self._download_into(tmp_fips_dir)

        # A handful of rows is enough to inspect the columns and value types
        sample = pd.read_csv(
            downloaded,
            nrows=5,
            dtype={"county_fips": str},
            compression="gzip",
            encoding="utf-8",
        )

        assert "county_fips" in sample.columns
        assert len(sample) > 0

        # FIPS codes must come back as strings
        assert all(isinstance(code, str) for code in sample["county_fips"])


class TestLoadCountyFIPSDataset:
    """
    Exercise load_county_fips_dataset with the Hugging Face download
    replaced by local stand-ins.
    """

    # Dotted path of the download function as referenced by the module under test
    DOWNLOAD_TARGET = "policyengine_us.tools.geography.county_helpers.download_huggingface_dataset"

    def test_when_func_is_run__correctly__returns_dataframe(
        self, mock_dataset_file, monkeypatch
    ):
        """
        With the download stubbed to return the mock file, the loader should
        return a three-row DataFrame that contains the FIPS code "01001".
        """
        fake_download = mock_download_huggingface_dataset_success(
            mock_dataset_file
        )
        monkeypatch.setattr(self.DOWNLOAD_TARGET, fake_download)

        loaded = load_county_fips_dataset()

        assert isinstance(loaded, pd.DataFrame)
        assert len(loaded) == 3
        # Matching the string "01001" shows FIPS codes are preserved as strings
        assert "01001" in loaded.values

    def test_when_func_is_run__download_fails__raises_exception(
        self, mock_dataset_file, monkeypatch
    ):
        """
        With the download stubbed to raise, the loader should raise an
        exception whose message mentions "Error downloading".
        """
        failing_download = mock_download_huggingface_dataset_failure(
            mock_dataset_file
        )
        monkeypatch.setattr(self.DOWNLOAD_TARGET, failing_download)

        with pytest.raises(Exception, match="Error downloading"):
            load_county_fips_dataset()
63 changes: 63 additions & 0 deletions policyengine_us/tools/geography/county_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pandas as pd
import numpy as np
from policyengine_us.variables.household.demographic.geographic.county.county_enum import (
County,
)
from pathlib import Path
from policyengine_core.tools.hugging_face import download_huggingface_dataset


def load_county_fips_dataset() -> pd.DataFrame:
    """
    Download the county FIPS dataset from Hugging Face and load it into a
    pandas DataFrame.

    The file "county_fips_2020.csv.gz" is requested from the
    "policyengine/policyengine-us-data" repo with the relative folder "data"
    as ``local_dir``; whatever ``download_huggingface_dataset`` returns is
    then read with ``pd.read_csv``.

    NOTE(review): the original docstring states that an up-to-date local
    copy in the 'data' folder is reused instead of re-downloaded. That
    depends on ``download_huggingface_dataset`` and is not visible here.

    Returns:
        The parsed dataset, with the "county_fips" column read as strings.

    Raises:
        Exception: If the download fails ("Error downloading ...") or the
            downloaded file cannot be read ("Error reading ..."). The
            underlying error is chained as ``__cause__``.
    """
    data_folder = Path("data")
    huggingface_repo = "policyengine/policyengine-us-data"
    dataset_filename = "county_fips_2020.csv.gz"

    # Download and parse are guarded separately so that the error message
    # names the step that actually failed.
    try:
        raw_dataset = download_huggingface_dataset(
            repo=huggingface_repo,
            repo_filename=dataset_filename,
            version=None,
            local_dir=data_folder,
        )
    except Exception as e:
        raise Exception(
            f"Error downloading {dataset_filename} from {huggingface_repo}: {e}"
        ) from e

    try:
        # County FIPS MUST be read as string, else pandas reads it as int
        # and drops leading zeros
        return pd.read_csv(
            raw_dataset,
            compression="gzip",
            dtype={"county_fips": str},
            encoding="utf-8",
        )
    except Exception as e:
        raise Exception(f"Error reading {dataset_filename}: {e}") from e


# Character mapping applied to county names when building enum keys:
# spaces, hyphens and apostrophes become underscores; periods are dropped.
_COUNTY_KEY_TRANSLATION = str.maketrans(
    {" ": "_", "-": "_", "'": "_", ".": None}
)


def _county_name_to_key(name: str) -> str:
    """Turn one county name into the upper-case, underscore-separated key form."""
    # Strip BEFORE translating: once spaces have become underscores,
    # surrounding padding can no longer be removed. The second strip keeps
    # the previous behaviour of trimming whitespace left at the ends after
    # the character substitutions.
    return name.strip().translate(_COUNTY_KEY_TRANSLATION).strip().upper()


def map_county_string_to_enum(
    county_name: "pd.Series[str]", state_code: "pd.Series[str]"
) -> "pd.Series[int]":
    """
    Helper function to map county name and state code to County enum value.

    Each county name is normalised into key form and joined to its state code
    with an underscore (e.g. "Los Angeles County" + "CA" ->
    "LOS_ANGELES_COUNTY_CA"). The combined key is then looked up among the
    member names of ``County``.

    Args:
        county_name: County names, one per row.
        state_code: State codes, concatenated element-wise to the county keys.

    Returns:
        A Series holding, for each row, the position of the matching name in
        ``County._member_names_``, labelled by the combined key.

    Raises:
        KeyError: If a combined key is not a ``County`` member name.
    """
    county_key = county_name.apply(_county_name_to_key)
    county_state = county_key.str.cat(state_code, sep="_")
    # Lookup table: enum member name -> its position in the enum
    member_positions = pd.Series(
        np.arange(len(County._member_names_)),
        index=County._member_names_,
    )
    return member_positions[county_state]
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from policyengine_us.model_api import *
from policyengine_us.tools.geography.county_helpers import (
map_county_string_to_enum,
)
from policyengine_us.variables.household.demographic.geographic.county.county_enum import (
County,
)
from policyengine_us_data import ZIP_CODE_DATASET
from policyengine_us.tools.geography.county_helpers import (
load_county_fips_dataset,
)


class county(Variable):
Expand All @@ -14,21 +20,22 @@ class county(Variable):
definition_period = YEAR

def formula(household, period, parameters):
    """
    Determine the household county as a position in the County enum.

    The county FIPS input is used when ``county_fips.all()`` is truthy;
    otherwise the county is looked up from the household ZIP code. In both
    cases the county name and state code are converted by
    ``map_county_string_to_enum``.
    """
    # First look if county FIPS is provided; if so, map to county name
    county_fips = household("county_fips", period)

    if county_fips.all():
        # Decode FIPS codes
        county_fips_codes = load_county_fips_dataset().set_index(
            "county_fips"
        )
        county_name = county_fips_codes.loc[county_fips, "county_name"]
        state_code = county_fips_codes.loc[county_fips, "state"]
        return map_county_string_to_enum(county_name, state_code)

    # Attempt to look up from ZIP code
    zip_code = household("zip_code", period).astype(int)
    zip_codes = ZIP_CODE_DATASET.set_index("zip_code")
    county_name = zip_codes.county[zip_code]
    state_code = zip_codes.state[zip_code]
    # Same name -> enum conversion as the FIPS branch, via the shared helper
    return map_county_string_to_enum(county_name, state_code)
2 changes: 1 addition & 1 deletion policyengine_us/variables/input/geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def formula(household, period, parameters):


class county_fips(Variable):
value_type = int
value_type = str
label = "County FIPS code"
entity = Household
definition_period = YEAR
Expand Down