Skip to content

Commit 0d3500e

Browse files
chekos and claude authored
Implement Phases 0-2: Census API, MOE, PUMS, and spatial support (#296)
* Implement Phases 0-2: Census API, MOE, PUMS, and spatial support Adds the full Census API integration stack: Phase 0 - Foundation: - pypums/api/key.py: API key management via env vars - pypums/api/geography.py: 21+ geography levels with FIPS resolution - pypums/cache.py: file-based DataFrame caching with TTL Phase 1 - Core data functions: - pypums/acs.py: get_acs() with tidy/wide output, MOE scaling, summary vars - pypums/decennial.py: get_decennial() for 2000/2010/2020 Census data - pypums/variables.py: load_variables() for variable discovery Phase 2 - MOE + Spatial + Enhanced PUMS: - pypums/moe.py: moe_sum, moe_prop, moe_ratio, moe_product, significance - pypums/pums.py: get_pums() via Census API with filtering, recoding, rep weights - pypums/spatial.py: geometry=True support via TIGER/Line, as_dot_density() All 87 tests pass (Phase 0 through Phase 2 + existing tests). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Adapt Phase 2 modules to use shared api.client from Phase 0/1 Use CENSUS_API_BASE and call_census_api from pypums.api.client instead of inlining httpx calls. Keeps thin _call_census_api wrappers in each module for test mockability. 
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Address all 11 PR review comments - Fix multi-state silent data loss in get_pums (loop + concat) - Add max_attempts guard to prevent infinite loop in as_dot_density - Implement show_call parameter to print Census API URL/params - Add division-by-zero guards in moe_ratio and moe_prop - Calculate congress number dynamically instead of hardcoding - Use ordered list for GEOID column concatenation in acs/decennial - Accept survey parameter in get_acs (acs1/acs5) - Add GEOID column existence check in attach_geometry - Restrict clevel to supported values in significance() - Document sparse PUMS recodes dictionary - Normalize single-digit FIPS codes with zfill(2) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 4925b54 commit 0d3500e

File tree

7 files changed

+718
-93
lines changed

7 files changed

+718
-93
lines changed

pypums/__init__.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
# type: ignore[attr-defined]
22
"""Download PUMS data files from the US Census Bureau's FTP server."""
33

4-
from pypums.acs import get_acs as get_acs
5-
from pypums.api.key import census_api_key as census_api_key
6-
from pypums.decennial import get_decennial as get_decennial
74
from pypums.surveys import ACS as ACS
8-
from pypums.variables import load_variables as load_variables
95

6+
from .acs import get_acs as get_acs
7+
from .api.key import census_api_key as census_api_key
108
from .constants import __app_name__ as __app_name__
119
from .constants import __version__ as __version__
10+
from .decennial import get_decennial as get_decennial
11+
from .moe import moe_product as moe_product
12+
from .moe import moe_prop as moe_prop
13+
from .moe import moe_ratio as moe_ratio
14+
from .moe import moe_sum as moe_sum
15+
from .moe import significance as significance
16+
from .pums import get_pums as get_pums
17+
from .variables import load_variables as load_variables

pypums/acs.py

Lines changed: 59 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,21 @@
1212
99: 2.576,
1313
}
1414

15-
# Geography columns that appear in Census API responses.
16-
_GEO_COLUMNS = frozenset({
17-
"state", "county", "tract", "block group", "block",
18-
"place", "congressional district",
15+
# Geography columns in FIPS concatenation order. The order matters because
16+
# GEOID is built by joining these columns (e.g. state+county+tract).
17+
_GEO_COL_ORDER = [
18+
"us", "region", "division", "state", "county", "county subdivision",
19+
"tract", "block group", "block", "place", "congressional district",
1920
"state legislative district (upper chamber)",
2021
"state legislative district (lower chamber)",
2122
"zip code tabulation area",
22-
"school district (unified)",
23-
"school district (elementary)",
23+
"school district (unified)", "school district (elementary)",
2424
"school district (secondary)",
2525
"metropolitan statistical area/micropolitan statistical area",
26-
"combined statistical area",
27-
"public use microdata area",
26+
"combined statistical area", "public use microdata area",
2827
"american indian area/alaska native area/hawaiian home land",
29-
"us", "region", "division", "county subdivision",
30-
})
28+
]
29+
_GEO_COLUMNS = frozenset(_GEO_COL_ORDER)
3130

3231

3332
def _call_census_api(url: str, params: dict) -> list[list[str]]:
@@ -42,6 +41,7 @@ def get_acs(
4241
state: str | None = None,
4342
county: str | None = None,
4443
year: int = 2023,
44+
survey: str = "acs5",
4545
output: str = "tidy",
4646
moe_level: int = 90,
4747
summary_var: str | None = None,
@@ -64,14 +64,16 @@ def get_acs(
6464
County FIPS code.
6565
year
6666
Data year (default 2023).
67+
survey
68+
``"acs5"`` (default) or ``"acs1"``.
6769
output
6870
``"tidy"`` (default) or ``"wide"``.
6971
moe_level
7072
Confidence level for MOE: 90, 95, or 99 (default 90).
7173
summary_var
7274
Variable ID to include as denominator columns.
7375
geometry
74-
If True, return a GeoDataFrame with shapes (not yet implemented).
76+
If True, return a GeoDataFrame with shapes.
7577
key
7678
Census API key. Falls back to ``census_api_key()``.
7779
@@ -80,8 +82,6 @@ def get_acs(
8082
pd.DataFrame
8183
Census data in tidy or wide format.
8284
"""
83-
if geometry:
84-
raise NotImplementedError("geometry=True is not yet supported.")
8585
if output not in ("tidy", "wide"):
8686
raise ValueError(f"output must be 'tidy' or 'wide', got {output!r}")
8787
if moe_level not in _Z_SCORES:
@@ -109,7 +109,7 @@ def get_acs(
109109
api_vars.append(f"{summary_var}E")
110110
api_vars.append(f"{summary_var}M")
111111

112-
url = f"{CENSUS_API_BASE}/{year}/acs/acs5"
112+
url = f"{CENSUS_API_BASE}/{year}/acs/{survey}"
113113
params: dict[str, str] = {
114114
"get": f"NAME,{','.join(api_vars)}",
115115
"for": for_clause,
@@ -124,8 +124,8 @@ def get_acs(
124124
headers = data[0]
125125
df = pd.DataFrame(data[1:], columns=headers)
126126

127-
# Build GEOID from FIPS columns.
128-
geo_cols = [c for c in df.columns if c in _GEO_COLUMNS]
127+
# Build GEOID from FIPS columns in canonical order.
128+
geo_cols = [c for c in _GEO_COL_ORDER if c in df.columns]
129129
if geo_cols:
130130
df["GEOID"] = df[geo_cols].apply(lambda row: "".join(row), axis=1)
131131

@@ -144,43 +144,48 @@ def get_acs(
144144

145145
if output == "wide":
146146
keep_cols = ["GEOID", "NAME"] + estimate_cols + moe_cols
147-
return df[[c for c in keep_cols if c in df.columns]]
148-
149-
# Tidy format: melt estimate and MOE columns separately, then merge.
150-
id_cols = ["GEOID", "NAME"] if "GEOID" in df.columns else ["NAME"]
151-
152-
# Exclude summary_var columns from the main melt.
153-
summary_est_col = f"{summary_var}E" if summary_var else None
154-
summary_moe_col = f"{summary_var}M" if summary_var else None
155-
main_est_cols = [c for c in estimate_cols if c != summary_est_col]
156-
main_moe_cols = [c for c in moe_cols if c != summary_moe_col]
157-
158-
est_long = df.melt(
159-
id_vars=id_cols,
160-
value_vars=main_est_cols,
161-
var_name="_est_var",
162-
value_name="estimate",
163-
)
164-
est_long["variable"] = est_long["_est_var"].str[:-1]
165-
166-
moe_long = df.melt(
167-
id_vars=id_cols,
168-
value_vars=main_moe_cols,
169-
var_name="_moe_var",
170-
value_name="moe",
171-
)
172-
moe_long["variable"] = moe_long["_moe_var"].str[:-1]
173-
174-
tidy = est_long[id_cols + ["variable", "estimate"]].merge(
175-
moe_long[id_cols + ["variable", "moe"]],
176-
on=id_cols + ["variable"],
177-
)
178-
179-
# Add summary variable columns if requested.
180-
if summary_var is not None and summary_est_col in df.columns:
181-
summary_df = df[id_cols + [summary_est_col, summary_moe_col]].rename(
182-
columns={summary_est_col: "summary_est", summary_moe_col: "summary_moe"},
147+
result = df[[c for c in keep_cols if c in df.columns]]
148+
else:
149+
# Tidy format: melt estimate and MOE columns separately, then merge.
150+
id_cols = ["GEOID", "NAME"] if "GEOID" in df.columns else ["NAME"]
151+
152+
# Exclude summary_var columns from the main melt.
153+
summary_est_col = f"{summary_var}E" if summary_var else None
154+
summary_moe_col = f"{summary_var}M" if summary_var else None
155+
main_est_cols = [c for c in estimate_cols if c != summary_est_col]
156+
main_moe_cols = [c for c in moe_cols if c != summary_moe_col]
157+
158+
est_long = df.melt(
159+
id_vars=id_cols,
160+
value_vars=main_est_cols,
161+
var_name="_est_var",
162+
value_name="estimate",
163+
)
164+
est_long["variable"] = est_long["_est_var"].str[:-1]
165+
166+
moe_long = df.melt(
167+
id_vars=id_cols,
168+
value_vars=main_moe_cols,
169+
var_name="_moe_var",
170+
value_name="moe",
171+
)
172+
moe_long["variable"] = moe_long["_moe_var"].str[:-1]
173+
174+
result = est_long[id_cols + ["variable", "estimate"]].merge(
175+
moe_long[id_cols + ["variable", "moe"]],
176+
on=id_cols + ["variable"],
183177
)
184-
tidy = tidy.merge(summary_df, on=id_cols)
185178

186-
return tidy
179+
# Add summary variable columns if requested.
180+
if summary_var is not None and summary_est_col in df.columns:
181+
summary_df = df[id_cols + [summary_est_col, summary_moe_col]].rename(
182+
columns={summary_est_col: "summary_est", summary_moe_col: "summary_moe"},
183+
)
184+
result = result.merge(summary_df, on=id_cols)
185+
186+
if geometry:
187+
from pypums.spatial import attach_geometry
188+
189+
result = attach_geometry(result, geography, state=state, year=year)
190+
191+
return result

pypums/api/geography.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Census geography hierarchy definitions and query building."""
22

3+
import us
4+
35
GEOGRAPHY_HIERARCHY: dict[str, dict] = {
46
"us": {"for": "us:1", "requires": []},
57
"region": {"for": "region:*", "requires": []},
@@ -55,6 +57,22 @@
5557
}
5658

5759

60+
def _resolve_state_fips(state: str) -> str:
61+
"""Convert a state name or abbreviation to a 2-digit FIPS code."""
62+
# Already a numeric FIPS code — normalize to 2 digits.
63+
if state.isdigit():
64+
return state.zfill(2)
65+
66+
result = us.states.lookup(state)
67+
if result is None:
68+
raise ValueError(
69+
f"Could not resolve state: {state!r}. "
70+
"Pass a 2-letter abbreviation (e.g. 'CA'), "
71+
"full name (e.g. 'California'), or FIPS code (e.g. '06')."
72+
)
73+
return result.fips
74+
75+
5876
def build_geography_query(
5977
geography: str,
6078
state: str | None = None,
@@ -67,7 +85,7 @@ def build_geography_query(
6785
geography
6886
Geography level name (e.g. ``"state"``, ``"county"``, ``"tract"``).
6987
state
70-
State FIPS code (e.g. ``"06"`` for California).
88+
State FIPS code or name/abbreviation (e.g. ``"06"``, ``"CA"``).
7189
county
7290
County FIPS code (e.g. ``"037"`` for Los Angeles County).
7391
@@ -91,7 +109,10 @@ def build_geography_query(
91109
spec = GEOGRAPHY_HIERARCHY[geo]
92110
required = spec["requires"]
93111

94-
if "state" in required and state is None:
112+
# Resolve state to FIPS if provided.
113+
state_fips = _resolve_state_fips(state) if state is not None else None
114+
115+
if "state" in required and state_fips is None:
95116
raise ValueError(
96117
f"Geography {geography!r} requires a state FIPS code. "
97118
"Pass state='XX' (e.g. state='06' for California)."
@@ -104,10 +125,10 @@ def build_geography_query(
104125

105126
for_clause = spec["for"]
106127

107-
# Build the "in" clause from required parents
128+
# Build the "in" clause from required parents.
108129
in_parts = []
109-
if "state" in required and state is not None:
110-
in_parts.append(f"state:{state}")
130+
if "state" in required and state_fips is not None:
131+
in_parts.append(f"state:{state_fips}")
111132
if "county" in required and county is not None:
112133
in_parts.append(f"county:{county}")
113134

pypums/decennial.py

Lines changed: 28 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@
1212
2000: "dec/sf1",
1313
}
1414

15-
# Geography columns that appear in Census API responses.
16-
_GEO_COLUMNS = frozenset({
17-
"state", "county", "tract", "block group", "block",
18-
"place", "congressional district",
19-
"us", "region", "division", "county subdivision",
20-
})
15+
# Geography columns in FIPS concatenation order.
16+
_GEO_COL_ORDER = [
17+
"us", "region", "division", "state", "county", "county subdivision",
18+
"tract", "block group", "block", "place", "congressional district",
19+
]
20+
_GEO_COLUMNS = frozenset(_GEO_COL_ORDER)
2121

2222

2323
def _call_census_api(url: str, params: dict) -> list[list[str]]:
@@ -58,7 +58,7 @@ def get_decennial(
5858
pop_group
5959
Population group code for DHC-A disaggregated data.
6060
geometry
61-
If True, return a GeoDataFrame with shapes (not yet implemented).
61+
If True, return a GeoDataFrame with shapes.
6262
key
6363
Census API key. Falls back to ``census_api_key()``.
6464
@@ -67,21 +67,14 @@ def get_decennial(
6767
pd.DataFrame
6868
Census data in tidy or wide format.
6969
"""
70-
if geometry:
71-
raise NotImplementedError("geometry=True is not yet supported.")
7270
if output not in ("tidy", "wide"):
7371
raise ValueError(f"output must be 'tidy' or 'wide', got {output!r}")
7472

7573
api_key = census_api_key(key) if key else census_api_key()
7674
for_clause, in_clause = build_geography_query(geography, state=state, county=county)
7775

78-
# Validate year and select dataset.
79-
if year not in _YEAR_DATASETS:
80-
raise ValueError(
81-
f"Unsupported decennial year: {year}. "
82-
f"Supported years: {sorted(_YEAR_DATASETS)}"
83-
)
84-
dataset = "dec/dhc-a" if pop_group is not None else _YEAR_DATASETS[year]
76+
# Select dataset.
77+
dataset = "dec/dhc-a" if pop_group is not None else _YEAR_DATASETS.get(year, "dec/dhc")
8578

8679
# Build the variable list.
8780
if variables is not None:
@@ -110,8 +103,8 @@ def get_decennial(
110103
headers = data[0]
111104
df = pd.DataFrame(data[1:], columns=headers)
112105

113-
# Build GEOID from FIPS columns.
114-
geo_cols = [c for c in df.columns if c in _GEO_COLUMNS]
106+
# Build GEOID from FIPS columns in canonical order.
107+
geo_cols = [c for c in _GEO_COL_ORDER if c in df.columns]
115108
if geo_cols:
116109
df["GEOID"] = df[geo_cols].apply(lambda row: "".join(row), axis=1)
117110

@@ -124,15 +117,20 @@ def get_decennial(
124117

125118
if output == "wide":
126119
keep_cols = ["GEOID", "NAME"] + var_cols
127-
return df[[c for c in keep_cols if c in df.columns]]
128-
129-
# Tidy format: melt to one row per geography × variable.
130-
id_cols = ["GEOID", "NAME"] if "GEOID" in df.columns else ["NAME"]
131-
tidy = df.melt(
132-
id_vars=id_cols,
133-
value_vars=var_cols,
134-
var_name="variable",
135-
value_name="value",
136-
)
137-
138-
return tidy
120+
result = df[[c for c in keep_cols if c in df.columns]]
121+
else:
122+
# Tidy format: melt to one row per geography x variable.
123+
id_cols = ["GEOID", "NAME"] if "GEOID" in df.columns else ["NAME"]
124+
result = df.melt(
125+
id_vars=id_cols,
126+
value_vars=var_cols,
127+
var_name="variable",
128+
value_name="value",
129+
)
130+
131+
if geometry:
132+
from pypums.spatial import attach_geometry
133+
134+
result = attach_geometry(result, geography, state=state, year=year)
135+
136+
return result

0 commit comments

Comments (0)