Skip to content

Commit 3ac7d5c

Browse files
committed
Add national geography support and profile layout updates
1 parent e00b068 commit 3ac7d5c

File tree

5 files changed

+271
-9
lines changed

5 files changed

+271
-9
lines changed

geocompare/database/Database.py

Lines changed: 53 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1191,6 +1191,7 @@ def __init__(self, path, progress_callback=None):
11911191
rows = self.get_geo_csv_rows()
11921192

11931193
# Filter for summary levels
1194+
# 010 = United States
11941195
# 040 = State
11951196
# 050 = State-County
11961197
# 160 = State-Place
@@ -1200,7 +1201,7 @@ def __init__(self, path, progress_callback=None):
12001201
rows = [
12011202
row
12021203
for row in rows
1203-
if row[1] in {"160", "050", "040", "860", "310", "400"}
1204+
if row[1] in {"010", "160", "050", "040", "860", "310", "400"}
12041205
and len(row[3]) >= 5
12051206
and row[3][3:5] == "00"
12061207
]
@@ -1357,6 +1358,57 @@ def complete_geoids(sumlev_code, rows):
13571358
complete_geoids("400", ua_rows)
13581359
complete_geoids("860", z_rows)
13591360

1361+
# Add a national geoheader row (010) so the ACS national geography row
1362+
# can join through to geocompare_data.
1363+
state_rows_for_agg = [
1364+
row for row in s_rows if len(row) >= 13 and row[4] not in {"NAME", "United States"}
1365+
]
1366+
if state_rows_for_agg:
1367+
total_aland = 0.0
1368+
total_awater = 0.0
1369+
total_aland_sqmi = 0.0
1370+
total_awater_sqmi = 0.0
1371+
lat_weighted_sum = 0.0
1372+
lon_weighted_sum = 0.0
1373+
1374+
for row in state_rows_for_agg:
1375+
try:
1376+
aland = float(row[7])
1377+
awater = float(row[8])
1378+
aland_sqmi = float(row[9])
1379+
awater_sqmi = float(row[10])
1380+
lat = float(row[11])
1381+
lon = float(row[12])
1382+
except (TypeError, ValueError):
1383+
continue
1384+
1385+
total_aland += aland
1386+
total_awater += awater
1387+
total_aland_sqmi += aland_sqmi
1388+
total_awater_sqmi += awater_sqmi
1389+
lat_weighted_sum += lat * aland
1390+
lon_weighted_sum += lon * aland
1391+
1392+
us_lat = lat_weighted_sum / total_aland if total_aland else 0.0
1393+
us_lon = lon_weighted_sum / total_aland if total_aland else 0.0
1394+
us_geoid = "0100000US"
1395+
us_row = [
1396+
"US",
1397+
us_geoid,
1398+
us_geoid,
1399+
"",
1400+
"United States",
1401+
"",
1402+
"",
1403+
f"{total_aland:.0f}",
1404+
f"{total_awater:.0f}",
1405+
f"{total_aland_sqmi:.1f}",
1406+
f"{total_awater_sqmi:.1f}",
1407+
f"{us_lat:.6f}",
1408+
f"{us_lon:.6f}",
1409+
]
1410+
rows.append(us_row)
1411+
13601412
# Merge rows together
13611413
rows = rows + c_rows + s_rows + z_rows + ua_rows + cbsa_rows
13621414

geocompare/engine.py

Lines changed: 206 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import copy
12
import csv
23
import difflib
34
import heapq
@@ -43,6 +44,7 @@ def __init__(self):
4344
self._gv_by_name = {}
4445
self._data_identifier_index = {}
4546
self.identity_index = None
47+
self._us_dp_cache = None
4648

4749
def create_data_products(self, data_path):
4850
"""Generate and save data products."""
@@ -107,6 +109,201 @@ def _set_data_products(self, data_products):
107109
self._gv_by_name = {gv.name: gv for gv in geovectors}
108110
self._data_identifier_index = self._build_data_identifier_index(demographicprofiles)
109111
self.identity_index = PlaceIdentityIndex.from_demographic_profiles(demographicprofiles)
112+
self._us_dp_cache = None
113+
114+
def _weighted_mean(self, values, weights):
115+
total_weight = sum(weights)
116+
if total_weight <= 0:
117+
return float(sum(values)) / float(len(values)) if values else 0.0
118+
return sum(v * w for v, w in zip(values, weights)) / total_weight
119+
120+
def _format_profile_component(self, key, value):
121+
if key == "land_area":
122+
return f"{value:,.1f} sqmi"
123+
if key in {
124+
"per_capita_income",
125+
"median_household_income",
126+
"median_value",
127+
"median_rent",
128+
}:
129+
if key == "median_household_income" and int(round(value)) == 250001:
130+
return "$250,000+"
131+
return "$" + f"{int(round(value)):,}"
132+
if key == "median_year_structure_built":
133+
return str(int(round(value)))
134+
if key in {"median_age", "average_household_size"}:
135+
return f"{value:,.1f}"
136+
if float(value).is_integer():
137+
return f"{int(round(value)):,}"
138+
return f"{value:,.3f}"
139+
140+
def _recompute_compounds(self, dp):
141+
rc = dp.rc
142+
c = {}
143+
144+
population = rc.get("population", 0) or 0
145+
land_area = rc.get("land_area", 0) or 0
146+
if land_area:
147+
c["population_density"] = population / land_area
148+
else:
149+
c["population_density"] = 0.0
150+
151+
if population:
152+
for key in [
153+
"white_alone",
154+
"black_alone",
155+
"asian_alone",
156+
"other_race",
157+
"hispanic_or_latino",
158+
"white_alone_not_hispanic_or_latino",
159+
"italian_alone",
160+
"under_18",
161+
"population_18_to_64",
162+
"age_65_plus",
163+
]:
164+
if key in rc:
165+
c[key] = rc[key] / population * 100.0
166+
for key, value in rc.items():
167+
if key.endswith("_count"):
168+
c[key] = value / population * 100000.0
169+
else:
170+
for key in [
171+
"white_alone",
172+
"black_alone",
173+
"asian_alone",
174+
"other_race",
175+
"hispanic_or_latino",
176+
"white_alone_not_hispanic_or_latino",
177+
"italian_alone",
178+
"under_18",
179+
"population_18_to_64",
180+
"age_65_plus",
181+
]:
182+
if key in rc:
183+
c[key] = 0.0
184+
185+
pop_25 = rc.get("population_25_years_and_older", 0) or 0
186+
if pop_25 and population:
187+
c["population_25_years_and_older"] = pop_25 / population * 100.0
188+
if "bachelors_degree_or_higher" in rc:
189+
c["bachelors_degree_or_higher"] = rc["bachelors_degree_or_higher"] / pop_25 * 100.0
190+
if "graduate_degree_or_higher" in rc:
191+
c["graduate_degree_or_higher"] = rc["graduate_degree_or_higher"] / pop_25 * 100.0
192+
else:
193+
if "population_25_years_and_older" in rc:
194+
c["population_25_years_and_older"] = 0.0
195+
if "bachelors_degree_or_higher" in rc:
196+
c["bachelors_degree_or_higher"] = 0.0
197+
if "graduate_degree_or_higher" in rc:
198+
c["graduate_degree_or_higher"] = 0.0
199+
200+
poverty_universe = rc.get("poverty_universe", 0) or 0
201+
if poverty_universe and "population_below_poverty_level" in rc:
202+
c["population_below_poverty_level"] = (
203+
rc["population_below_poverty_level"] / poverty_universe * 100.0
204+
)
205+
elif "population_below_poverty_level" in rc:
206+
c["population_below_poverty_level"] = 0.0
207+
208+
labor_force = rc.get("labor_force", 0) or 0
209+
if labor_force and "unemployed_population" in rc:
210+
c["unemployed_population"] = rc["unemployed_population"] / labor_force * 100.0
211+
elif "unemployed_population" in rc:
212+
c["unemployed_population"] = 0.0
213+
214+
occupied = rc.get("occupied_housing_units", 0) or 0
215+
if occupied and "homeowner_occupied_housing_units" in rc:
216+
c["homeowner_occupied_housing_units"] = (
217+
rc["homeowner_occupied_housing_units"] / occupied * 100.0
218+
)
219+
elif "homeowner_occupied_housing_units" in rc:
220+
c["homeowner_occupied_housing_units"] = 0.0
221+
222+
registered = rc.get("registered_voters", 0) or 0
223+
if population and "registered_voters" in rc:
224+
c["registered_voters"] = rc["registered_voters"] / population * 100.0
225+
if registered:
226+
for key in ("democratic_voters", "republican_voters", "other_voters"):
227+
if key in rc:
228+
c[key] = rc[key] / registered * 100.0
229+
230+
dp.c = c
231+
fcd = {}
232+
for key, value in c.items():
233+
if key == "population_density":
234+
fcd[key] = f"{value:,.1f}/sqmi"
235+
elif key.endswith("_count"):
236+
fcd[key] = f"{value:,.1f}/100k"
237+
else:
238+
fcd[key] = f"{value:,.1f}%"
239+
dp.fcd = fcd
240+
241+
def _build_united_states_profile(self):
242+
if self._us_dp_cache is not None:
243+
return self._us_dp_cache
244+
245+
d = self.get_data_products()
246+
states = [dp for dp in d["demographicprofiles"] if dp.sumlevel == "040"]
247+
if not states:
248+
raise ValueError("No state-level profiles available to synthesize United States.")
249+
250+
us_dp = copy.deepcopy(states[0])
251+
us_dp.name = "United States"
252+
us_dp.state = "US"
253+
us_dp.sumlevel = "040"
254+
us_dp.geoid = "04000US00"
255+
us_dp.counties = []
256+
us_dp.counties_display = []
257+
258+
population_weights = [float(dp.rc.get("population", 0) or 0) for dp in states]
259+
household_weights = [float(dp.rc.get("households", 0) or 0) for dp in states]
260+
housing_weights = [float(dp.rc.get("occupied_housing_units", 0) or 0) for dp in states]
261+
262+
weighted_by_population = {
263+
"median_age",
264+
"per_capita_income",
265+
"latitude",
266+
"longitude",
267+
"social_ai_score",
268+
"social_acs_score",
269+
"social_overlap_coverage_pct",
270+
}
271+
weighted_by_households = {"median_household_income", "average_household_size"}
272+
weighted_by_housing = {
273+
"median_year_structure_built",
274+
"median_rooms",
275+
"median_value",
276+
"median_rent",
277+
}
278+
279+
keys = sorted({key for dp in states for key in dp.rc.keys()})
280+
aggregated = {}
281+
for key in keys:
282+
values = [float(dp.rc.get(key, 0) or 0) for dp in states]
283+
if key == "land_area":
284+
aggregated[key] = sum(values)
285+
elif key in weighted_by_population or key.endswith("_score") or key.endswith("_pct"):
286+
aggregated[key] = self._weighted_mean(values, population_weights)
287+
elif key in weighted_by_households:
288+
aggregated[key] = self._weighted_mean(values, household_weights)
289+
elif key in weighted_by_housing:
290+
aggregated[key] = self._weighted_mean(values, housing_weights)
291+
else:
292+
aggregated[key] = sum(values)
293+
294+
if "population" in aggregated and "under_18" in aggregated and "age_65_plus" in aggregated:
295+
aggregated["population_18_to_64"] = (
296+
aggregated["population"] - aggregated["under_18"] - aggregated["age_65_plus"]
297+
)
298+
299+
us_dp.rc = aggregated
300+
us_dp.fc = {
301+
key: self._format_profile_component(key, value) for key, value in us_dp.rc.items()
302+
}
303+
self._recompute_compounds(us_dp)
304+
305+
self._us_dp_cache = us_dp
306+
return us_dp
110307

111308
def _build_data_identifier_index(self, demographicprofiles):
112309
index = {}
@@ -160,9 +357,12 @@ def get_data_products(self):
160357

161358
def _lookup_dp(self, display_label):
162359
dp = self._dp_by_name.get(display_label)
163-
if dp is None:
164-
raise ValueError(f"No geography found for display label: {display_label}")
165-
return dp
360+
if dp is not None:
361+
return dp
362+
normalized = str(display_label or "").strip().lower()
363+
if normalized in {"united states", "united states of america", "us", "u.s."}:
364+
return self._build_united_states_profile()
365+
raise ValueError(f"No geography found for display label: {display_label}")
166366

167367
def resolve_geography(self, query, state=None, sumlevel=None, population=None, n=5, **kwargs):
168368
"""Resolve an input geography string to likely canonical matches."""
@@ -380,7 +580,9 @@ def get_dp(self, display_label, **kwargs):
380580

381581
return [self._lookup_dp(display_label)]
382582

383-
def extreme_values(self, data_identifier, context="", geofilter="", n=10, lowest=False, **kwargs):
583+
def extreme_values(
584+
self, data_identifier, context="", geofilter="", n=10, lowest=False, **kwargs
585+
):
384586
"""Get highest and lowest values."""
385587
d = self.get_data_products()
386588

geocompare/interfaces/cli.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -546,7 +546,9 @@ def divider(dpi):
546546
return "-" * (68 if key == "population" else 89)
547547

548548
def ev_print_headers(comp, universe_sl, group_sl, group):
549-
if universe_sl == "040":
549+
if universe_sl == "010":
550+
universe = "Nation"
551+
elif universe_sl == "040":
550552
universe = "State"
551553
elif universe_sl == "050":
552554
universe = "County"
@@ -622,7 +624,9 @@ def divider():
622624
return "-" * 68
623625

624626
def cg_print_headers(universe_sl, group_sl, group):
625-
if universe_sl == "040":
627+
if universe_sl == "010":
628+
universe = "Nation"
629+
elif universe_sl == "040":
626630
universe = "State"
627631
elif universe_sl == "050":
628632
universe = "County"

geocompare/models/demographic_profile.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ def __init__(self, db_row):
126126
self.ind['median_household_income'] = 0
127127
self.ind['population_below_poverty_level'] = 0
128128
self.ind['labor_force'] = 0
129-
self.ind['unemployed_population'] = 2
129+
self.ind['unemployed_population'] = 0
130130
self.ind['households'] = 0
131131

132132
# Housing category
@@ -178,9 +178,9 @@ def __init__(self, db_row):
178178
('std', 'population_below_poverty_level'),
179179
('std', 'labor_force'),
180180
('std', 'unemployed_population'),
181-
('nc', 'households'),
182181
]),
183182
('HOUSING', [
183+
('nc', 'households'),
184184
('nc', 'average_household_size'),
185185
('nc', 'occupied_housing_units'),
186186
('std', 'homeowner_occupied_housing_units'),

geocompare/tools/summary_level_parser.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,9 @@ class SummaryLevelParser:
99

1010
def __init__(self):
1111
self.keyword_to_code = {
12+
"nations": "010",
13+
"nation": "010",
14+
"n": "010",
1215
"states": "040",
1316
"s": "040",
1417
"counties": "050",
@@ -23,6 +26,7 @@ def __init__(self):
2326
"z": "860",
2427
}
2528
self.code_to_keyword = {
29+
"010": "nations",
2630
"050": "counties",
2731
"040": "states",
2832
"160": "places",

0 commit comments

Comments (0)