
Commit 6b7da7b

start adding participating agency data

Signed-off-by: John Seekins <[email protected]>

1 parent 0795548 commit 6b7da7b

7 files changed: +124 / -10 lines


ice_scrapers/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -131,7 +131,9 @@
 }
 field_office_to_aor = {v: k for k, v in area_of_responsibility.items()}

+from .agencies import scrape_agencies  # noqa: F401,E402
 from .utils import (  # noqa: E402
+    download_file,  # noqa: F401
     get_ice_scrape_pages,  # noqa: F401
     repair_locality,  # noqa: F401
     repair_street,  # noqa: F401

ice_scrapers/agencies.py

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
+# ICEFieldOfficeScraper class and scraping-related code
+from bs4 import BeautifulSoup
+import copy
+from .utils import download_file
+import os
+import polars
+import re
+from schemas import (
+    agencies_287g,
+    active_agency,
+    pending_agency,
+)
+import time
+from utils import (
+    logger,
+    session,
+)
+
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+base_xlsx_url = "https://www.ice.gov/identify-and-arrest/287g"
+
+
+def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
+    """Collect data on participating agencies"""
+    start_time = time.time()
+    resp = session.get(base_xlsx_url, timeout=120)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.content, "html.parser")
+    links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
+    if not links:
+        raise Exception(f"Could not find any XLSX files on {base_xlsx_url}")
+    logger.debug(links)
+    date_re = re.compile(r"\d{8}pm")
+    agencies = copy.deepcopy(agencies_287g)
+    for link in links:
+        data: dict = {}
+        match link:
+            case x if "participating" in x:
+                schema = copy.deepcopy(active_agency)
+            case x if "pending" in x:
+                schema = copy.deepcopy(pending_agency)
+            case _:
+                raise ValueError(f"Found an unsupported agency datasheet: {link}")
+        """
+        Yes, polars supports loading from a URL. But this pattern
+        lets us cache the download
+        """
+        # remove the date so we can easily overwrite the local (cached) file
+        filename = date_re.sub("", link.split('/')[-1])
+        path = f"{SCRIPT_DIR}{os.sep}{filename}"
+        if force_download or not os.path.exists(path):
+            logger.info("Downloading agency info sheet from %s", link)
+            download_file(link, path)
+        df = polars.read_excel(
+            drop_empty_rows=True,
+            raise_if_empty=True,
+            source=open(path, "rb")
+        )
+        for row in df.iter_rows(named=True):
+            data = copy.deepcopy(schema)
+            data["state"] = row["STATE"]
+            data["agency"] = row["LAW ENFORCEMENT AGENCY"]
+            data["county"] = row["COUNTY"]
+            data["type"] = row["TYPE"]
+            data["support_type"] = row["SUPPORT TYPE"]
+            if "participating" in filename:
+                data["moa"] = row["MOA"]
+                data["signed"] = row["SIGNED"]
+                data["addendum"] = row["ADDENDUM"]
+                agencies["active"].append(data)
+            else:
+                agencies["pending"].append(data)
+        if not keep_sheet:
+            os.unlink(path)
+    logger.info(" Collected %s active and %s pending agencies", len(agencies["active"]), len(agencies["pending"]))
+    agencies["scrape_runtime"] = time.time() - start_time
+    return agencies
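
For orientation, a minimal usage sketch of the new scraper (assuming it is run from the repo root so the top-level utils and schemas imports resolve; passing force_download=False reuses a previously cached sheet):

from ice_scrapers import scrape_agencies

# Reuse cached XLSX files if they are already on disk (hypothetical local run).
agencies = scrape_agencies(keep_sheet=True, force_download=False)
print(len(agencies["active"]), "active agencies")
print(len(agencies["pending"]), "pending agencies")
print("runtime:", agencies["scrape_runtime"], "seconds")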

ice_scrapers/general.py

Lines changed: 4 additions & 2 deletions

@@ -4,13 +4,15 @@
     insert_additional_facilities,
     load_sheet,
     merge_field_offices,
+    scrape_agencies,
     scrape_facilities,
     scrape_field_offices,
 )
 from schemas import facilities_schema


-def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False) -> dict:
+def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False) -> tuple[dict, dict]:
+    agencies = scrape_agencies(keep_sheet, force_download)
     facilities_data = copy.deepcopy(facilities_schema)
     facilities = load_sheet(keep_sheet, force_download)
     facilities_data["facilities"] = copy.deepcopy(facilities)
@@ -21,4 +23,4 @@ def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = Tr
     facilities_data = merge_field_offices(facilities_data, field_offices)
     facilities_data = insert_additional_facilities(facilities_data)

-    return facilities_data
+    return facilities_data, agencies
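
Because the wrapper now returns a tuple, callers have to unpack both values (main.py below is updated accordingly); a quick sketch, assuming the facilities_schema and agencies_287g keys shown elsewhere in this commit:

facilities_data, agencies = facilities_scrape_wrapper(
    keep_sheet=True,
    force_download=True,
    skip_vera=False,
)
# facilities_data keeps its previous shape; the agency data travels alongside it.
print(len(facilities_data["facilities"]), "facilities,", len(agencies["active"]), "active agencies")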

ice_scrapers/spreadsheet_load.py

Lines changed: 2 additions & 7 deletions

@@ -2,6 +2,7 @@
 import copy
 import datetime
 from ice_scrapers import (
+    download_file,
     ice_facility_types,
     ice_inspection_types,
     repair_locality,
@@ -84,13 +85,7 @@ def _download_sheet(keep_sheet: bool = True, force_download: bool = True) -> Tup
     logger.debug("Found sheet at: %s", actual_link)
     if force_download or not os.path.exists(filename):
         logger.info("Downloading detention stats sheet from %s", actual_link)
-        resp = session.get(actual_link, timeout=120, stream=True)
-        size = len(resp.content)
-        with open(filename, "wb") as f:
-            for chunk in resp.iter_content(chunk_size=1024):
-                if chunk:
-                    f.write(chunk)
-        logger.debug("Wrote %s byte sheet to %s", size, filename)
+        download_file(actual_link, filename)
     df = polars.read_excel(
         drop_empty_rows=True,
         has_header=False,

ice_scrapers/utils.py

Lines changed: 13 additions & 0 deletions

@@ -7,6 +7,19 @@
 )


+def download_file(link: str, path: str) -> None:
+    """
+    Standard pattern for downloading a binary file from a URL
+    """
+    resp = session.get(link, timeout=120, stream=True)
+    size = len(resp.content)
+    with open(path, "wb") as f:
+        for chunk in resp.iter_content(chunk_size=1024):
+            if chunk:
+                f.write(chunk)
+    logger.debug("Wrote %s byte sheet to %s", size, path)
+
+
 def special_facilities(facility: dict) -> dict:
     """
     Some very specific facilities have unique fixes
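
A short usage sketch of the shared helper, which centralizes the chunked-download pattern previously inlined in spreadsheet_load.py so both the detention-stats and 287(g) scrapers cache files the same way (the URL and destination path here are purely illustrative):

from ice_scrapers import download_file  # re-exported via ice_scrapers/__init__.py

# Hypothetical file, purely for illustration.
download_file("https://www.ice.gov/doclib/example.xlsx", "/tmp/example.xlsx")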

main.py

Lines changed: 1 addition & 1 deletion

@@ -128,7 +128,7 @@ def main() -> None:
         exit(1)

     if args.scrape:
-        facilities_data = facilities_scrape_wrapper(
+        facilities_data, agencies = facilities_scrape_wrapper(
             keep_sheet=not args.delete_sheets,
             force_download=not args.skip_downloads,
             skip_vera=args.skip_vera,

schemas.py

Lines changed: 25 additions & 0 deletions

@@ -105,6 +105,31 @@
     },
 }

+agencies_287g: dict = {
+    "active": [{}],
+    "pending": [{}],
+    "scrape_runtime": 0,
+    "scraped_date": datetime.datetime.now(datetime.UTC),
+}
+
+active_agency: dict = {
+    "state": "",
+    "agency": "",
+    "county": "",
+    "type": "",
+    "signed": None,
+    "moa": "",
+    "addendum": "",
+    "support_type": "",
+}
+
+pending_agency: dict = {
+    "state": "",
+    "agency": "",
+    "county": "",
+    "type": "",
+    "support_type": "",
+}

 # enrichment response object
 enrich_resp_schema: dict = {
