|
| 1 | +import copy |
1 | 2 | import csv |
2 | 3 | import difflib |
3 | 4 | import heapq |
@@ -43,6 +44,7 @@ def __init__(self): |
43 | 44 | self._gv_by_name = {} |
44 | 45 | self._data_identifier_index = {} |
45 | 46 | self.identity_index = None |
| 47 | + self._us_dp_cache = None |
46 | 48 |
|
47 | 49 | def create_data_products(self, data_path): |
48 | 50 | """Generate and save data products.""" |
@@ -107,6 +109,201 @@ def _set_data_products(self, data_products): |
107 | 109 | self._gv_by_name = {gv.name: gv for gv in geovectors} |
108 | 110 | self._data_identifier_index = self._build_data_identifier_index(demographicprofiles) |
109 | 111 | self.identity_index = PlaceIdentityIndex.from_demographic_profiles(demographicprofiles) |
| 112 | + self._us_dp_cache = None |
| 113 | + |
| 114 | + def _weighted_mean(self, values, weights): |
| 115 | + total_weight = sum(weights) |
| 116 | + if total_weight <= 0: |
| 117 | + return float(sum(values)) / float(len(values)) if values else 0.0 |
| 118 | + return sum(v * w for v, w in zip(values, weights)) / total_weight |
| 119 | + |
| 120 | + def _format_profile_component(self, key, value): |
| 121 | + if key == "land_area": |
| 122 | + return f"{value:,.1f} sqmi" |
| 123 | + if key in { |
| 124 | + "per_capita_income", |
| 125 | + "median_household_income", |
| 126 | + "median_value", |
| 127 | + "median_rent", |
| 128 | + }: |
| 129 | + if key == "median_household_income" and int(round(value)) == 250001: |
| 130 | + return "$250,000+" |
| 131 | + return "$" + f"{int(round(value)):,}" |
| 132 | + if key == "median_year_structure_built": |
| 133 | + return str(int(round(value))) |
| 134 | + if key in {"median_age", "average_household_size"}: |
| 135 | + return f"{value:,.1f}" |
| 136 | + if float(value).is_integer(): |
| 137 | + return f"{int(round(value)):,}" |
| 138 | + return f"{value:,.3f}" |
| 139 | + |
| 140 | + def _recompute_compounds(self, dp): |
| 141 | + rc = dp.rc |
| 142 | + c = {} |
| 143 | + |
| 144 | + population = rc.get("population", 0) or 0 |
| 145 | + land_area = rc.get("land_area", 0) or 0 |
| 146 | + if land_area: |
| 147 | + c["population_density"] = population / land_area |
| 148 | + else: |
| 149 | + c["population_density"] = 0.0 |
| 150 | + |
| 151 | + if population: |
| 152 | + for key in [ |
| 153 | + "white_alone", |
| 154 | + "black_alone", |
| 155 | + "asian_alone", |
| 156 | + "other_race", |
| 157 | + "hispanic_or_latino", |
| 158 | + "white_alone_not_hispanic_or_latino", |
| 159 | + "italian_alone", |
| 160 | + "under_18", |
| 161 | + "population_18_to_64", |
| 162 | + "age_65_plus", |
| 163 | + ]: |
| 164 | + if key in rc: |
| 165 | + c[key] = rc[key] / population * 100.0 |
| 166 | + for key, value in rc.items(): |
| 167 | + if key.endswith("_count"): |
| 168 | + c[key] = value / population * 100000.0 |
| 169 | + else: |
| 170 | + for key in [ |
| 171 | + "white_alone", |
| 172 | + "black_alone", |
| 173 | + "asian_alone", |
| 174 | + "other_race", |
| 175 | + "hispanic_or_latino", |
| 176 | + "white_alone_not_hispanic_or_latino", |
| 177 | + "italian_alone", |
| 178 | + "under_18", |
| 179 | + "population_18_to_64", |
| 180 | + "age_65_plus", |
| 181 | + ]: |
| 182 | + if key in rc: |
| 183 | + c[key] = 0.0 |
| 184 | + |
| 185 | + pop_25 = rc.get("population_25_years_and_older", 0) or 0 |
| 186 | + if pop_25 and population: |
| 187 | + c["population_25_years_and_older"] = pop_25 / population * 100.0 |
| 188 | + if "bachelors_degree_or_higher" in rc: |
| 189 | + c["bachelors_degree_or_higher"] = rc["bachelors_degree_or_higher"] / pop_25 * 100.0 |
| 190 | + if "graduate_degree_or_higher" in rc: |
| 191 | + c["graduate_degree_or_higher"] = rc["graduate_degree_or_higher"] / pop_25 * 100.0 |
| 192 | + else: |
| 193 | + if "population_25_years_and_older" in rc: |
| 194 | + c["population_25_years_and_older"] = 0.0 |
| 195 | + if "bachelors_degree_or_higher" in rc: |
| 196 | + c["bachelors_degree_or_higher"] = 0.0 |
| 197 | + if "graduate_degree_or_higher" in rc: |
| 198 | + c["graduate_degree_or_higher"] = 0.0 |
| 199 | + |
| 200 | + poverty_universe = rc.get("poverty_universe", 0) or 0 |
| 201 | + if poverty_universe and "population_below_poverty_level" in rc: |
| 202 | + c["population_below_poverty_level"] = ( |
| 203 | + rc["population_below_poverty_level"] / poverty_universe * 100.0 |
| 204 | + ) |
| 205 | + elif "population_below_poverty_level" in rc: |
| 206 | + c["population_below_poverty_level"] = 0.0 |
| 207 | + |
| 208 | + labor_force = rc.get("labor_force", 0) or 0 |
| 209 | + if labor_force and "unemployed_population" in rc: |
| 210 | + c["unemployed_population"] = rc["unemployed_population"] / labor_force * 100.0 |
| 211 | + elif "unemployed_population" in rc: |
| 212 | + c["unemployed_population"] = 0.0 |
| 213 | + |
| 214 | + occupied = rc.get("occupied_housing_units", 0) or 0 |
| 215 | + if occupied and "homeowner_occupied_housing_units" in rc: |
| 216 | + c["homeowner_occupied_housing_units"] = ( |
| 217 | + rc["homeowner_occupied_housing_units"] / occupied * 100.0 |
| 218 | + ) |
| 219 | + elif "homeowner_occupied_housing_units" in rc: |
| 220 | + c["homeowner_occupied_housing_units"] = 0.0 |
| 221 | + |
| 222 | + registered = rc.get("registered_voters", 0) or 0 |
| 223 | + if population and "registered_voters" in rc: |
| 224 | + c["registered_voters"] = rc["registered_voters"] / population * 100.0 |
| 225 | + if registered: |
| 226 | + for key in ("democratic_voters", "republican_voters", "other_voters"): |
| 227 | + if key in rc: |
| 228 | + c[key] = rc[key] / registered * 100.0 |
| 229 | + |
| 230 | + dp.c = c |
| 231 | + fcd = {} |
| 232 | + for key, value in c.items(): |
| 233 | + if key == "population_density": |
| 234 | + fcd[key] = f"{value:,.1f}/sqmi" |
| 235 | + elif key.endswith("_count"): |
| 236 | + fcd[key] = f"{value:,.1f}/100k" |
| 237 | + else: |
| 238 | + fcd[key] = f"{value:,.1f}%" |
| 239 | + dp.fcd = fcd |
| 240 | + |
| 241 | + def _build_united_states_profile(self): |
| 242 | + if self._us_dp_cache is not None: |
| 243 | + return self._us_dp_cache |
| 244 | + |
| 245 | + d = self.get_data_products() |
| 246 | + states = [dp for dp in d["demographicprofiles"] if dp.sumlevel == "040"] |
| 247 | + if not states: |
| 248 | + raise ValueError("No state-level profiles available to synthesize United States.") |
| 249 | + |
| 250 | + us_dp = copy.deepcopy(states[0]) |
| 251 | + us_dp.name = "United States" |
| 252 | + us_dp.state = "US" |
| 253 | + us_dp.sumlevel = "040" |
| 254 | + us_dp.geoid = "04000US00" |
| 255 | + us_dp.counties = [] |
| 256 | + us_dp.counties_display = [] |
| 257 | + |
| 258 | + population_weights = [float(dp.rc.get("population", 0) or 0) for dp in states] |
| 259 | + household_weights = [float(dp.rc.get("households", 0) or 0) for dp in states] |
| 260 | + housing_weights = [float(dp.rc.get("occupied_housing_units", 0) or 0) for dp in states] |
| 261 | + |
| 262 | + weighted_by_population = { |
| 263 | + "median_age", |
| 264 | + "per_capita_income", |
| 265 | + "latitude", |
| 266 | + "longitude", |
| 267 | + "social_ai_score", |
| 268 | + "social_acs_score", |
| 269 | + "social_overlap_coverage_pct", |
| 270 | + } |
| 271 | + weighted_by_households = {"median_household_income", "average_household_size"} |
| 272 | + weighted_by_housing = { |
| 273 | + "median_year_structure_built", |
| 274 | + "median_rooms", |
| 275 | + "median_value", |
| 276 | + "median_rent", |
| 277 | + } |
| 278 | + |
| 279 | + keys = sorted({key for dp in states for key in dp.rc.keys()}) |
| 280 | + aggregated = {} |
| 281 | + for key in keys: |
| 282 | + values = [float(dp.rc.get(key, 0) or 0) for dp in states] |
| 283 | + if key == "land_area": |
| 284 | + aggregated[key] = sum(values) |
| 285 | + elif key in weighted_by_population or key.endswith("_score") or key.endswith("_pct"): |
| 286 | + aggregated[key] = self._weighted_mean(values, population_weights) |
| 287 | + elif key in weighted_by_households: |
| 288 | + aggregated[key] = self._weighted_mean(values, household_weights) |
| 289 | + elif key in weighted_by_housing: |
| 290 | + aggregated[key] = self._weighted_mean(values, housing_weights) |
| 291 | + else: |
| 292 | + aggregated[key] = sum(values) |
| 293 | + |
| 294 | + if "population" in aggregated and "under_18" in aggregated and "age_65_plus" in aggregated: |
| 295 | + aggregated["population_18_to_64"] = ( |
| 296 | + aggregated["population"] - aggregated["under_18"] - aggregated["age_65_plus"] |
| 297 | + ) |
| 298 | + |
| 299 | + us_dp.rc = aggregated |
| 300 | + us_dp.fc = { |
| 301 | + key: self._format_profile_component(key, value) for key, value in us_dp.rc.items() |
| 302 | + } |
| 303 | + self._recompute_compounds(us_dp) |
| 304 | + |
| 305 | + self._us_dp_cache = us_dp |
| 306 | + return us_dp |
110 | 307 |
|
111 | 308 | def _build_data_identifier_index(self, demographicprofiles): |
112 | 309 | index = {} |
@@ -160,9 +357,12 @@ def get_data_products(self): |
160 | 357 |
|
161 | 358 | def _lookup_dp(self, display_label): |
162 | 359 | dp = self._dp_by_name.get(display_label) |
163 | | - if dp is None: |
164 | | - raise ValueError(f"No geography found for display label: {display_label}") |
165 | | - return dp |
| 360 | + if dp is not None: |
| 361 | + return dp |
| 362 | + normalized = str(display_label or "").strip().lower() |
| 363 | + if normalized in {"united states", "united states of america", "us", "u.s."}: |
| 364 | + return self._build_united_states_profile() |
| 365 | + raise ValueError(f"No geography found for display label: {display_label}") |
166 | 366 |
|
167 | 367 | def resolve_geography(self, query, state=None, sumlevel=None, population=None, n=5, **kwargs): |
168 | 368 | """Resolve an input geography string to likely canonical matches.""" |
@@ -380,7 +580,9 @@ def get_dp(self, display_label, **kwargs): |
380 | 580 |
|
381 | 581 | return [self._lookup_dp(display_label)] |
382 | 582 |
|
383 | | - def extreme_values(self, data_identifier, context="", geofilter="", n=10, lowest=False, **kwargs): |
| 583 | + def extreme_values( |
| 584 | + self, data_identifier, context="", geofilter="", n=10, lowest=False, **kwargs |
| 585 | + ): |
384 | 586 | """Get highest and lowest values.""" |
385 | 587 | d = self.get_data_products() |
386 | 588 |
|
|
0 commit comments