Skip to content

Commit fb5965a

Browse files
committed
handle streaming results in some requests
Signed-off-by: John Seekins <[email protected]>
1 parent 4cc4ed4 commit fb5965a

File tree

4 files changed

+22
-12
lines changed

4 files changed

+22
-12
lines changed

enricher.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from concurrent.futures import ProcessPoolExecutor
22
import copy
33
from enrichers import (
4+
openstreetmap,
45
wikidata,
56
wikipedia,
6-
openstreetmap,
77
)
88
from schemas import (
99
default_coords,
@@ -43,14 +43,11 @@ def enrich_facility(facility_data: tuple) -> tuple:
4343
logger.info("Enriching facility %s...", facility_name)
4444
enriched_facility = copy.deepcopy(facility)
4545

46-
wiki = wikipedia.Wikipedia(facility_name=facility_name, wait_time=WIKIPEDIA_DELAY)
47-
wiki_res = wiki.search()
48-
wd = wikidata.Wikidata(facility_name=facility_name, wait_time=WIKIDATA_DELAY)
49-
wd_res = wd.search()
50-
osm = openstreetmap.OpenStreetMap(
46+
wiki_res = wikipedia.Wikipedia(facility_name=facility_name, wait_time=WIKIPEDIA_DELAY).search()
47+
wd_res = wikidata.Wikidata(facility_name=facility_name, wait_time=WIKIDATA_DELAY).search()
48+
osm_res = openstreetmap.OpenStreetMap(
5149
facility_name=facility_name, address=facility.get("address", {}), wait_time=OSM_DELAY
52-
)
53-
osm_res = osm.search()
50+
).search()
5451
enriched_facility["wikipedia"]["page_url"] = wiki_res.get("url", "")
5552
enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "")
5653
enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "")

enrichers/__init__.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from schemas import enrich_resp_schema
44
import time
55
from utils import (
6+
default_headers,
67
session,
78
)
89

@@ -24,10 +25,21 @@ def search(self) -> dict:
2425
"""Child objects should implement this"""
2526
return {}
2627

27-
def _req(self, url: str, params: dict = {}, timeout: int = 10) -> requests.Response:
28+
def _req(
29+
self, url: str, params: dict = {}, timeout: int = 10, stream: bool = False, headers: dict = default_headers
30+
) -> requests.Response:
2831
"""requests response wrapper to ensure we honor waits"""
2932

30-
response = session.get(url, allow_redirects=True, timeout=timeout, params=params)
33+
# ensure we get all headers configured correctly
34+
# but manually applied headers win the argument
35+
for k, v in default_headers.items():
36+
if k in headers.keys():
37+
continue
38+
headers[k] = v
39+
40+
response = session.get(
41+
url, allow_redirects=True, timeout=timeout, params=params, stream=stream, headers=headers
42+
)
3143
response.raise_for_status()
3244
time.sleep(self.wait_time)
3345
return response

scraper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def _download_sheet(self) -> None:
5858
logger.debug("Found sheet at: %s", actual_link)
5959
self.sheet_url = actual_link
6060
logger.info("Downloading detention stats sheet from %s", self.sheet_url)
61-
resp = session.get(self.sheet_url, timeout=120)
61+
resp = session.get(self.sheet_url, timeout=120, stream=True)
6262
with open(self.filename, "wb") as f:
6363
for chunk in resp.iter_content(chunk_size=1024):
6464
if chunk:

utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,12 @@
1313
total=4,
1414
backoff_factor=1,
1515
)
16+
default_headers = {"User-Agent": "ICE-Facilities-Research/1.0 (Educational Research Purpose)"}
1617
_adapter = HTTPAdapter(max_retries=_retry_strategy)
1718
session = requests.Session()
1819
session.mount("https://", _adapter)
1920
session.mount("http://", _adapter)
20-
session.headers.update({"User-Agent": "ICE-Facilities-Research/1.0 (Educational Research Purpose)"})
21+
session.headers.update(default_headers)
2122

2223
default_timestamp = "1970-01-01T00:00:00-+0000"
2324
timestamp_format = "%Y-%m-%dT%H:%M:%S-%z"

0 commit comments

Comments (0)