Skip to content

Commit 9e00815

Browse files
committed
close to child objects
Signed-off-by: John Seekins <[email protected]>
1 parent 4d68bf8 commit 9e00815

File tree

7 files changed

+418
-408
lines changed

7 files changed

+418
-408
lines changed

enricher.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55
wikipedia,
66
openstreetmap,
77
)
8-
from schemas import facilities_schema
8+
from schemas import (
9+
default_coords,
10+
facilities_schema,
11+
)
912
import time
1013
from utils import logger
1114

@@ -37,15 +40,20 @@ def enrich_facility(facility_data: tuple) -> tuple:
3740
logger.info("Enriching facility %s...", facility_name)
3841
enriched_facility = copy.deepcopy(facility)
3942

40-
wiki = wikipedia.search(facility_name)
41-
wd = wikidata.search(facility_name)
42-
osm = openstreetmap.search(facility_name, facility.get("address", {}))
43-
enriched_facility["wikipedia_page_url"] = wiki.get("url", "")
44-
enriched_facility["wikipedia_search_query"] = wiki.get("search_query_steps", "")
45-
enriched_facility["wikidata_page_url"] = wd.get("url", "")
46-
enriched_facility["wikidata_search_query"] = wd.get("search_query_steps", "")
47-
enriched_facility["osm_result_url"] = osm.get("url", "")
48-
enriched_facility["osm_search_query"] = osm.get("search_query_steps", "")
43+
wiki = wikipedia.Wikipedia({"facility_name": facility_name})
44+
wiki_res = wiki.search()
45+
wd = wikidata.Wikidata({"facility_name": facility_name})
46+
wd_res = wd.search()
47+
osm = openstreetmap.OpenStreetMap({"facility_name": facility_name, "address": facility.get("address", {})})
48+
osm_res = osm.search()
49+
enriched_facility["wikipedia"]["page_url"] = wiki_res.get("url", "")
50+
enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "")
51+
enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "")
52+
enriched_facility["wikidata"]["search_query"] = wd_res.get("search_query_steps", "")
53+
enriched_facility["osm"]["latitude"] = osm_res.get("details", {}).get("latitude", default_coords["latitude"])
54+
enriched_facility["osm"]["longitude"] = osm_res.get("details", {}).get("longitude", default_coords["longitude"])
55+
enriched_facility["osm"]["url"] = osm_res.get("url", "")
56+
enriched_facility["osm"]["search_query"] = osm_res.get("search_query_steps", "")
4957

5058
logger.debug(enriched_facility)
5159
return facility_id, enriched_facility

enrichers/__init__.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import copy
2+
import requests
3+
from schemas import enrich_resp_schema
4+
import time
5+
from utils import (
6+
session,
7+
)
8+
9+
10+
class Enrichment(object):
    """Base class for facility enrichment lookups.

    Holds the search arguments, a fresh copy of the response schema and the
    delay to apply after each HTTP request. Child classes implement
    ``search()`` and use ``_req()`` for their HTTP calls.
    """

    # Keys that must be present in the search arguments.
    required_keys = [
        "facility_name",
    ]

    # Only very generic suffixes; specific ones like "County Jail" are kept.
    _generic_suffixes = (
        "Service Processing Center",
        "ICE Processing Center",
        "Immigration Processing Center",
        "Contract Detention Facility",
        "Adult Detention Facility",
    )

    # Common facility type suffixes removed for a broader search.
    _all_suffixes = (
        "Detention Center",
        "Processing Center",
        "Correctional Center",
        "Correctional Facility",
        "Detention Facility",
        "Service Processing Center",
        "ICE Processing Center",
        "Immigration Processing Center",
        "Adult Detention Facility",
        "Contract Detention Facility",
        "Regional Detention Center",
        "County Jail",
        "County Detention Center",
        "Sheriff's Office",
        "Justice Center",
        "Safety Center",
        "Jail Services",
        "Correctional Complex",
        "Public Safety Complex",
    )

    def __init__(self, search_args=None, **kwargs):
        """Store the search arguments and prepare an empty response.

        Args:
            search_args: optional dict of search arguments. Accepted
                positionally so that ``Enrichment({"facility_name": ...})``
                works as well as ``Enrichment(facility_name=...)``.
            **kwargs: search arguments as keywords; these win over
                ``search_args`` on conflicting keys. ``wait_time`` (seconds,
                default 1) sets the delay applied after each request.

        Raises:
            KeyError: if any entry of ``required_keys`` is missing.
        """
        merged = dict(search_args or {})
        merged.update(kwargs)
        # Validate before allocating anything so bad input fails fast.
        for k in self.required_keys:
            if k not in merged:
                raise KeyError(f"Missing required key {k} in {merged}")
        self.resp_info = copy.deepcopy(enrich_resp_schema)
        self.search_args = copy.deepcopy(merged)
        self.wait_time = int(merged.get("wait_time", 1))

    def search(self) -> dict:
        """Child objects should implement this"""
        return {}

    def _req(self, url: str, params: dict = None, timeout: int = 10) -> "requests.Response":
        """requests response wrapper to ensure we honor waits

        Args:
            url: address to GET.
            params: query-string parameters forwarded to the request.
            timeout: request timeout in seconds.

        Returns:
            The response, after ``raise_for_status()`` has passed.

        Raises:
            Whatever ``session.get`` or ``raise_for_status`` raises.
        """
        try:
            response = session.get(url, params=params, allow_redirects=True, timeout=timeout)
            response.raise_for_status()
        finally:
            # Wait after failed requests too: callers move straight on to the
            # next query on error, and that one must be delayed as well.
            time.sleep(self.wait_time)
        return response

    @staticmethod
    def _main_name(name: str) -> str:
        """Return the longest ``|``-separated part of ``name``, stripped."""
        if "|" in name:
            # Take the longer part as it's likely the full name
            return max(name.split("|"), key=len).strip()
        return name

    @staticmethod
    def _strip_suffix(name: str, suffixes) -> str:
        """Remove at most one trailing suffix from ``name``.

        Longer suffixes are tried first so that "ICE Processing Center" is
        not shadowed by the shorter "Processing Center". A name that consists
        only of a suffix is returned unchanged rather than emptied.
        """
        for suffix in sorted(suffixes, key=len, reverse=True):
            if name.endswith(suffix):
                remainder = name[: -len(suffix)].strip()
                return remainder or name
        return name

    def _minimal_clean_facility_name(self, name: str) -> str:
        """Minimal cleaning that preserves important context like 'County Jail'"""
        return self._strip_suffix(self._main_name(name), self._generic_suffixes)

    def _clean_facility_name(self, name: str) -> str:
        """Clean facility name for better search results"""
        # Remove common suffixes and prefixes that might interfere with search
        # This function may not be helpful - may be counterproductive.
        return self._strip_suffix(self._main_name(name), self._all_suffixes)

enrichers/openstreetmap.py

Lines changed: 90 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,100 @@
1-
import copy
2-
from enrichers.utils import (
3-
clean_facility_name,
4-
NOMINATIM_DELAY,
5-
)
6-
from schemas import enrich_resp_schema
7-
import time
8-
from utils import (
9-
logger,
10-
session,
11-
)
1+
from schemas import default_coords
2+
from enrichers import Enrichment
3+
from utils import logger
124

135

14-
def search(facility_name: str, address: dict) -> dict:
15-
search_name = clean_facility_name(facility_name)
16-
search_url = "https://nominatim.openstreetmap.org/search"
17-
resp_info = copy.deepcopy(enrich_resp_schema)
18-
resp_info["enrichment_type"] = "openstreetmap"
19-
data = []
20-
if not address:
21-
logger.debug("No address for %s, simply searching for name", facility_name)
22-
params = {
23-
"q": search_name,
24-
"format": "json",
25-
"limit": 5,
26-
"dedupe": 1,
27-
}
28-
logger.debug("Searching OSM for %s", search_name)
29-
resp_info["search_query_steps"].append(search_name) # type: ignore [attr-defined]
30-
try:
31-
response = session.get(search_url, params=params, timeout=15) # type: ignore [arg-type]
32-
response.raise_for_status()
33-
data = response.json()
34-
time.sleep(NOMINATIM_DELAY)
35-
except Exception as e:
36-
logger.debug(" OSM search error for '%s': %s", facility_name, e)
37-
resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
38-
return resp_info
39-
else:
40-
full_address = (
41-
f"{address['street']} {address['locality']}, {address['administrative_area']} {address['postal_code']}"
42-
)
43-
locality = f"{address['locality']}, {address['administrative_area']} {address['postal_code']}"
6+
class OpenStreetMap(Enrichment):
    """Enrichment that looks a facility up via the Nominatim search endpoint."""

    def search(self) -> dict:
        """Search OpenStreetMap for the facility and fill in ``self.resp_info``.

        Without an address only the cleaned facility name is queried. With an
        address, up to three queries are tried from most to least specific
        (name + full address, full address, locality) and the first one that
        yields results is used.

        Among the results, the first whose type or display name mentions a
        prison/detention/correctional/jail term is linked by its OSM id;
        otherwise the first result is linked by coordinates.

        Returns:
            ``self.resp_info``; ``url``, ``title`` and the ``details``
            coordinates are only set when a result was found.

        Raises:
            KeyError: if a non-empty address lacks ``street``, ``locality``,
                ``administrative_area`` or ``postal_code``.
        """
        facility_name = self.search_args["facility_name"]
        address = self.search_args.get("address", {})
        search_name = self._clean_facility_name(facility_name)
        search_url = "https://nominatim.openstreetmap.org/search"
        self.resp_info["enrichment_type"] = "openstreetmap"

        if not address:
            logger.debug("No address for %s, simply searching for name", facility_name)
            queries = [search_name]
        else:
            full_address = (
                f"{address['street']} {address['locality']}, {address['administrative_area']} {address['postal_code']}"
            )
            locality = f"{address['locality']}, {address['administrative_area']} {address['postal_code']}"
            # Most specific query first.
            queries = [f"{search_name} {full_address}", full_address, locality]

        data = []
        for query in queries:
            params = {
                "q": query,
                "format": "json",
                "limit": 5,
                "dedupe": 1,
            }
            logger.debug("Searching OSM for %s", query)
            self.resp_info["search_query_steps"].append(query)  # type: ignore [attr-defined]
            try:
                response = self._req(search_url, params=params, timeout=15)
                data = response.json()
            except Exception as e:
                # Best effort: record the failure and try the next query.
                logger.debug(" OSM search error for '%s': %s", facility_name, e)
                self.resp_info["search_query_steps"].append(f"(Failed -> {e})")  # type: ignore [attr-defined]
                continue
            if data:
                # Stop at the first query that found something.
                break

        if not data:
            # Nothing found (or every query failed): return the unfilled response.
            return self.resp_info

        # when the URL result is a "way" this is usually correct.
        # checks top five results.
        match_terms = ["prison", "detention", "correctional", "jail"]
        for result in data:
            osm_type = result.get("type", "").lower()
            lat = result.get("lat", default_coords["latitude"])
            lon = result.get("lon", default_coords["longitude"])
            display_name = result.get("display_name", "").lower()
            if any(term in osm_type for term in match_terms) or any(term in display_name for term in match_terms):
                # todo courthouse could be added, or other tags such as "prison:for=migrant" as a clear positive search result.
                osm_id = result.get("osm_id", "")
                osm_type_prefix = result.get("osm_type", "")
                title = result.get("display_name", "")
                if osm_id and osm_type_prefix:
                    self.resp_info["url"] = f"https://www.openstreetmap.org/{osm_type_prefix}/{osm_id}"
                    self.resp_info["details"]["latitude"] = lat  # type: ignore [index]
                    self.resp_info["details"]["longitude"] = lon  # type: ignore [index]
                    self.resp_info["title"] = title
                    return self.resp_info

        # fallback to first result
        first_result = data[0]
        logger.debug("Address searches didn't directly find anything, just using the first result: %s", first_result)
        title = first_result.get("display_name", "")
        lat = first_result.get("lat", default_coords["latitude"])
        lon = first_result.get("lon", default_coords["longitude"])
        self.resp_info["search_query_steps"].append(f"{lat}&{lon}")  # type: ignore [attr-defined]
        if lat and lon:
            self.resp_info["url"] = f"https://www.openstreetmap.org/?mlat={lat}&mlon={lon}&zoom=15"
            self.resp_info["details"]["latitude"] = lat  # type: ignore [index]
            self.resp_info["details"]["longitude"] = lon  # type: ignore [index]
            self.resp_info["title"] = title
        return self.resp_info

0 commit comments

Comments
 (0)