Skip to content

Commit 42ec02a

Browse files
committed
handle missing links and actually find most recent sheet (probably)
Signed-off-by: John Seekins <[email protected]>
1 parent 3f31244 commit 42ec02a

File tree: 1 file changed (+17 additions, −2 deletions)

scraper.py

Lines changed: 17 additions & 2 deletions
Columns: Original file line number | Diff line number | Diff line change
@@ -36,8 +36,23 @@ def _download_sheet(self) -> None:
3636
resp = session.get(self.base_xlsx_url, timeout=120)
3737
soup = BeautifulSoup(resp.content, "html.parser")
3838
links = soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))
39-
# quick solution is first result
40-
self.sheet_url = links[0]["href"]
39+
if not links:
40+
raise Exception(f"Could not find any XLSX files on {self.base_xlsx_url}")
41+
fy_re = re.compile(r".+FY(\d{2}).+")
42+
actual_link = links[0]["href"]
43+
cur_year = int(datetime.datetime.now().strftime("%y"))
44+
# try to find the most recent
45+
for link in links:
46+
match = fy_re.match(link["href"])
47+
if not match:
48+
continue
49+
year = int(match.group(1))
50+
if year >= cur_year:
51+
actual_link = link["href"]
52+
cur_year = year
53+
54+
logger.debug("Found sheet at: %s", actual_link)
55+
self.sheet_url = actual_link
4156
now = time.time()
4257
# one day in seconds is 86400
4358
if (

0 commit comments

Comments
 (0)