
Commit d89171f

Merge pull request #25 from johnseekins/update-dentention-sheet
update sheet from ice.gov (as it gets updated weekly)
2 parents: 891c5e1 + dbf3c3e

File tree

1 file changed (+29 -10 lines)


scraper.py

Lines changed: 29 additions & 10 deletions
@@ -24,22 +24,41 @@


 class ICEGovFacilityScraper(object):
-    base_url = "https://www.ice.gov/detention-facilities"
-    sheet_url = "https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx"
+    base_scrape_url = "https://www.ice.gov/detention-facilities"
+    base_xlsx_url = "https://www.ice.gov/detain/detention-management"

     # All methods for scraping ice.gov websites
     def __init__(self):
         self.facilities_data = copy.deepcopy(facilities_schema)
         self.filename = f"{SCRIPT_DIR}{os.sep}detentionstats.xlsx"

     def _download_sheet(self) -> None:
-        if not os.path.isfile(self.filename) or os.path.getsize(self.filename) < 1:
-            logger.info("Downloading sheet from %s", self.sheet_url)
-            resp = session.get(self.sheet_url, timeout=120)
-            with open(self.filename, "wb") as f:
-                for chunk in resp.iter_content(chunk_size=1024):
-                    if chunk:
-                        f.write(chunk)
+        resp = session.get(self.base_xlsx_url, timeout=120)
+        soup = BeautifulSoup(resp.content, "html.parser")
+        links = soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))
+        if not links:
+            raise Exception(f"Could not find any XLSX files on {self.base_xlsx_url}")
+        fy_re = re.compile(r".+FY(\d{2}).+")
+        actual_link = links[0]["href"]
+        cur_year = int(datetime.datetime.now().strftime("%y"))
+        # try to find the most recent
+        for link in links:
+            match = fy_re.match(link["href"])
+            if not match:
+                continue
+            year = int(match.group(1))
+            if year >= cur_year:
+                actual_link = link["href"]
+                cur_year = year
+
+        logger.debug("Found sheet at: %s", actual_link)
+        self.sheet_url = actual_link
+        logger.info("Downloading detention stats sheet from %s", self.sheet_url)
+        resp = session.get(self.sheet_url, timeout=120)
+        with open(self.filename, "wb") as f:
+            for chunk in resp.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)

     def _clean_street(self, street: str, locality: str = "") -> Tuple[str, bool]:
         """Generally, we'll let the spreadsheet win arguments just to be consistent"""
@@ -263,7 +282,7 @@ def scrape_facilities(self):
         self.facilities_data["facilities"] = self._load_sheet()

         # URLs for all pages
-        urls = [f"{self.base_url}?exposed_form_display=1&page={i}" for i in range(6)]
+        urls = [f"{self.base_scrape_url}?exposed_form_display=1&page={i}" for i in range(6)]

         for page_num, url in enumerate(urls):
             logger.info("Scraping page %s/6...", page_num + 1)
