Skip to content

Commit 37310e6

Browse files
committed
document enrichment a bit and improve layout somewhat
Signed-off-by: John Seekins <[email protected]>
1 parent 5a15b31 commit 37310e6

File tree

6 files changed

+70
-38
lines changed

6 files changed

+70
-38
lines changed

enrichers/README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Facility enrichment scrapers
2+
3+
These functions let us collect data about facilities from additional sources.
4+
5+
## Enrichment class
6+
7+
The base class we can build enrichment tools from. Largely ensures some consistency in functionality between enrichment tools.
8+
9+
### Available functions
10+
11+
Sub-classing `Enrichment` provides the following functions/objects:
12+
13+
* `self.resp_info`
14+
* Pre-created response object following our expected schema
15+
* `self._wait_time`
16+
* simple rate-limiting through `time.sleep()` calls; `_wait_time` tells us how long we should sleep between calls to an individual API/site.
17+
* Defaults to `1` (seconds)
18+
* `self._req(...)`
19+
* Wrapper function around a call to `requests.get` (using a properly configured `session` object)
20+
* handles redirects
21+
* supports the most common `requests` keyword arguments (`params`, `timeout`, `stream`, custom `headers`)
22+
* raises for non-2xx/3xx status
23+
* returns the entire `requests.Response` object for manipulation
24+
* `_minimal_clean_facility_name(str)`
25+
* standardizes facility name for searching
26+
* `_clean_facility_name(str)`
27+
* standardizes facility name for searching
28+
* more aggressive formatting than `_minimal_...` above
29+
30+
> All child classes should implement the `search()` method, which should return a dictionary using the `enrich_resp_schema` schema.

enrichers/__init__.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
"""
2+
Import order here is a touch weird, but we need it so
3+
types exist before attempting to import functions that
4+
may call them
5+
"""
6+
17
import copy
28
import requests
39
from schemas import enrich_resp_schema
@@ -7,39 +13,28 @@
713
session,
814
)
915

10-
OSM_DELAY = 1.0 # 1 second between requests as per OSM policy
11-
WIKIDATA_DELAY = 0.5 # Be respectful to Wikidata
12-
WIKIPEDIA_DELAY = 0.5 # Be respectful to Wikipedia
13-
14-
# default to Washington, D.C.?
15-
default_coords: dict = {
16-
"latitude": 38.89511000,
17-
"longitude": -77.03637000,
18-
}
19-
2016

2117
class Enrichment(object):
22-
required_keys = [
18+
_required_keys = [
2319
"facility_name",
2420
]
21+
# in seconds
22+
_wait_time: float = 1
2523

2624
def __init__(self, **kwargs):
2725
self.resp_info = copy.deepcopy(enrich_resp_schema)
28-
for k in self.required_keys:
26+
for k in self._required_keys:
2927
if k not in kwargs.keys():
3028
raise KeyError("Missing required key %s in %s", k, kwargs)
3129
self.search_args = copy.deepcopy(kwargs)
32-
self.wait_time = int(kwargs.get("wait_time", 1))
3330

3431
def search(self) -> dict:
3532
"""Child objects should implement this"""
3633
return {}
3734

38-
def _req(
39-
self, url: str, params: dict = {}, timeout: int = 10, stream: bool = False, headers: dict = default_headers
40-
) -> requests.Response:
35+
def _req(self, url: str, **kwargs) -> requests.Response:
4136
"""requests response wrapper to ensure we honor waits"""
42-
37+
headers = kwargs.get("headers", {})
4338
# ensure we get all headers configured correctly
4439
# but manually applied headers win the argument
4540
for k, v in default_headers.items():
@@ -48,10 +43,15 @@ def _req(
4843
headers[k] = v
4944

5045
response = session.get(
51-
url, allow_redirects=True, timeout=timeout, params=params, stream=stream, headers=headers
46+
url,
47+
allow_redirects=True,
48+
timeout=kwargs.get("timeout", 10),
49+
params=kwargs.get("params", {}),
50+
stream=kwargs.get("stream", False),
51+
headers=headers,
5252
)
5353
response.raise_for_status()
54-
time.sleep(self.wait_time)
54+
time.sleep(self._wait_time)
5555
return response
5656

5757
def _minimal_clean_facility_name(self, name: str) -> str:

enrichers/general.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,9 @@
11
from concurrent.futures import ProcessPoolExecutor
22
import copy
33
from enrichers import (
4-
default_coords,
5-
OSM_DELAY,
64
openstreetmap,
75
wikidata,
8-
WIKIDATA_DELAY,
96
wikipedia,
10-
WIKIPEDIA_DELAY,
117
)
128
from schemas import (
139
facilities_schema,
@@ -43,17 +39,16 @@ def _enrich_facility(facility_data: tuple) -> tuple:
4339
logger.info("Enriching facility %s...", facility_name)
4440
enriched_facility = copy.deepcopy(facility)
4541

46-
wiki_res = wikipedia.Wikipedia(facility_name=facility_name, wait_time=WIKIPEDIA_DELAY).search()
47-
wd_res = wikidata.Wikidata(facility_name=facility_name, wait_time=WIKIDATA_DELAY).search()
48-
osm_res = openstreetmap.OpenStreetMap(
49-
facility_name=facility_name, address=facility.get("address", {}), wait_time=OSM_DELAY
50-
).search()
42+
wiki_res = wikipedia.Wikipedia(facility_name=facility_name).search()
43+
wd_res = wikidata.Wikidata(facility_name=facility_name).search()
44+
osm = openstreetmap.OpenStreetMap(facility_name=facility_name, address=facility.get("address", {}))
45+
osm_res = osm.search()
5146
enriched_facility["wikipedia"]["page_url"] = wiki_res.get("url", "")
5247
enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "")
5348
enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "")
5449
enriched_facility["wikidata"]["search_query"] = wd_res.get("search_query_steps", "")
55-
enriched_facility["osm"]["latitude"] = osm_res.get("details", {}).get("latitude", default_coords["latitude"])
56-
enriched_facility["osm"]["longitude"] = osm_res.get("details", {}).get("longitude", default_coords["longitude"])
50+
enriched_facility["osm"]["latitude"] = osm_res.get("details", {}).get("latitude", osm.default_coords["latitude"])
51+
enriched_facility["osm"]["longitude"] = osm_res.get("details", {}).get("longitude", osm.default_coords["longitude"])
5752
enriched_facility["osm"]["url"] = osm_res.get("url", "")
5853
enriched_facility["osm"]["search_query"] = osm_res.get("search_query_steps", "")
5954

enrichers/openstreetmap.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1-
from enrichers import (
2-
default_coords,
3-
Enrichment,
4-
)
1+
from enrichers import Enrichment
52
from utils import logger
63

74

85
class OpenStreetMap(Enrichment):
6+
# default to Washington, D.C.?
7+
default_coords: dict = {
8+
"latitude": 38.89511000,
9+
"longitude": -77.03637000,
10+
}
11+
912
def search(self) -> dict:
1013
facility_name = self.search_args["facility_name"]
1114
address = self.search_args.get("address", {})
@@ -74,8 +77,8 @@ def search(self) -> dict:
7477
match_terms = ["prison", "detention", "correctional", "jail"]
7578
for result in data:
7679
osm_type = result.get("type", "").lower()
77-
lat = result.get("lat", default_coords["latitude"])
78-
lon = result.get("lon", default_coords["longitude"])
80+
lat = result.get("lat", self.default_coords["latitude"])
81+
lon = result.get("lon", self.default_coords["longitude"])
7982
display_name = result.get("display_name", "").lower()
8083
if any(term in osm_type for term in match_terms) or any(term in display_name for term in match_terms):
8184
# todo courthouse could be added, or other tags such as "prison:for=migrant" as a clear positive search result.
@@ -92,8 +95,8 @@ def search(self) -> dict:
9295
first_result = data[0]
9396
logger.debug("Address searches didn't directly find anything, just using the first result: %s", first_result)
9497
title = first_result.get("display_name", "")
95-
lat = first_result.get("lat", default_coords["latitude"])
96-
lon = first_result.get("lon", default_coords["longitude"])
98+
lat = first_result.get("lat", self.default_coords["latitude"])
99+
lon = first_result.get("lon", self.default_coords["longitude"])
97100
self.resp_info["search_query_steps"].append(f"{lat}&{lon}") # type: ignore [attr-defined]
98101
if lat and lon:
99102
self.resp_info["url"] = f"https://www.openstreetmap.org/?mlat={lat}&mlon={lon}&zoom=15"

enrichers/wikidata.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33

44

55
class Wikidata(Enrichment):
6+
# wikidata can handle slightly more aggressive requests
7+
_wait_time = 0.5
8+
69
def search(self) -> dict:
710
facility_name = self.search_args["facility_name"]
811
# Fetches 3 results based on _clean_facility_name (not exact name). todo: needs adjustment.

enrichers/wikipedia.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55

66
class Wikipedia(Enrichment):
7+
_wait_time = 0.5
78
static_search: str = "https://en.wikipedia.org/wiki/"
89
api_search: str = "https://en.wikipedia.org/w/api.php"
910
facility_terms: list = [

0 commit comments

Comments
 (0)