Skip to content

Commit 965d40f

Browse files
Merge pull request #2288 from IFRCGo/fix/ingest-icrc
Fix: Change scraping logic for ICRC website
2 parents 5752f46 + 7ecd0bc commit 965d40f

File tree

1 file changed

+23
-20
lines changed

1 file changed

+23
-20
lines changed

api/management/commands/ingest_icrc.py

Lines changed: 23 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -13,14 +13,14 @@ class Command(BaseCommand):
1313

1414
@monitor(monitor_slug=SentryMonitor.INGEST_ICRC)
1515
def handle(self, *args, **kwargs):
16-
logger.info("Strating ICRC data ingest")
16+
logger.info("Starting ICRC data ingest")
1717
HEADERS = {
1818
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", # noqa
1919
}
20-
response = requests.get(
21-
url="https://www.icrc.org/en/where-we-work",
22-
headers=HEADERS,
23-
)
20+
icrc_url = "https://www.icrc.org"
21+
icrc_where_we_work_url = "https://www.icrc.org/en/where-we-work"
22+
response = requests.get(url=icrc_where_we_work_url, headers=HEADERS)
23+
2424
if response.status_code != 200:
2525
text_to_log = "Error querying ICRC feed at https://www.icrc.org/en/where-we-work"
2626
logger.error(text_to_log)
@@ -36,32 +36,36 @@ def handle(self, *args, **kwargs):
3636
response.raise_for_status()
3737
soup = BeautifulSoup(response.content, "html.parser")
3838

39-
# Get the countries information from the "Where we work" page
40-
regions_list = soup.find("div", {"id": "blockRegionalList"}).find_all("ul", {"class": "list"})
39+
# Get countries information from "Where we work" page
40+
regions_list = soup.find("div", {"class": "js-select-country-list"}).find("ul").find_all("ul")
41+
4142
country_list = []
4243
for region in regions_list:
43-
for country in region.find_all("li", {"class": "item"}):
44-
# Get key information
44+
for country in region.find_all("li"):
4545
name = country.text.strip()
46-
url = country.find("a")["href"] if country.find("a") else None
47-
presence = True if url else False
48-
key_operation = True if "keyOperations" in country["class"] else False
49-
# Get the description from the country page
46+
href = country.find("a")["href"] if country.find("a") else None
47+
country_url = icrc_url + href if href else None
48+
presence = bool(country_url)
5049
description = None
51-
if url:
50+
key_operation = False
51+
52+
if country_url:
5253
try:
53-
country_page = requests.get(url=url, headers={"User-Agent": ""})
54+
country_page = requests.get(url=country_url, headers=HEADERS)
5455
country_page.raise_for_status()
5556
country_soup = BeautifulSoup(country_page.content, "html.parser")
56-
description = country_soup.find("div", {"class": "block-introduction"}).find_all()[2].text.strip()
57+
description_tag = country_soup.find("div", class_="description").find("div", class_="ck-text")
58+
key_operation = bool(description_tag)
59+
description = description_tag.text.strip() if description_tag else None
5760
except Exception:
5861
pass
59-
# Append all the information to the list
62+
63+
# Append to list
6064
country_list.append(
6165
{
6266
"Country": name,
6367
"ICRC presence": presence,
64-
"URL": url,
68+
"URL": country_url,
6569
"Key operation": key_operation,
6670
"Description": description,
6771
}
@@ -72,15 +76,14 @@ def handle(self, *args, **kwargs):
7276
country = Country.objects.filter(name__exact=data["Country"]).first()
7377
if country:
7478
country_icrc_presence, _ = CountryICRCPresence.objects.get_or_create(country=country)
75-
7679
country_icrc_presence.icrc_presence = data["ICRC presence"]
7780
country_icrc_presence.url = data["URL"]
7881
country_icrc_presence.key_operation = data["Key operation"]
7982
country_icrc_presence.description = data["Description"]
8083
country_icrc_presence.save()
8184
added += 1
8285

83-
text_to_log = "%s ICRC added" % added
86+
text_to_log = f"{added} ICRC added"
8487
logger.info(text_to_log)
8588
body = {"name": "ingest_icrc", "message": text_to_log, "num_result": added, "status": CronJobStatus.SUCCESSFUL}
8689
CronJob.sync_cron(body)

0 commit comments

Comments
 (0)