
Commit e1f26ad

feat: update crawling exercises to be about JS
1 parent 822f353 commit e1f26ad

File tree

2 files changed (+71 -59 lines)

sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md

Lines changed: 68 additions & 56 deletions
@@ -206,7 +206,7 @@ In the next lesson, we'll scrape the product detail pages so that each product v
 
 ### Scrape calling codes of African countries
 
-This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
+Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
 
 ```text
 https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
@@ -225,43 +225,53 @@ https://en.wikipedia.org/wiki/Cameroon +237
 ...
 ```
 
-Hint: Locating cells in tables is sometimes easier if you know how to [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree.
+Hint: Locating cells in tables is sometimes easier if you know how to [filter](https://cheerio.js.org/docs/api/classes/Cheerio#filter) or [navigate up](https://cheerio.js.org/docs/api/classes/Cheerio#parent) in the HTML element tree.
 
 <details>
   <summary>Solution</summary>
 
-```py
-import httpx
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
-
-def download(url):
-    response = httpx.get(url)
-    response.raise_for_status()
-    return BeautifulSoup(response.text, "html.parser")
-
-def parse_calling_code(soup):
-    for label in soup.select("th.infobox-label"):
-        if label.text.strip() == "Calling code":
-            data = label.parent.select_one("td.infobox-data")
-            return data.text.strip()
-    return None
-
-listing_url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa"
-listing_soup = download(listing_url)
-for name_cell in listing_soup.select(".wikitable tr td:nth-child(3)"):
-    link = name_cell.select_one("a")
-    country_url = urljoin(listing_url, link["href"])
-    country_soup = download(country_url)
-    calling_code = parse_calling_code(country_soup)
-    print(country_url, calling_code)
+```js
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+  const response = await fetch(url);
+  if (response.ok) {
+    const html = await response.text();
+    return cheerio.load(html);
+  } else {
+    throw new Error(`HTTP ${response.status}`);
+  }
+}
+
+const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa";
+const $ = await download(listingURL);
+
+const $promises = $(".wikitable tr td:nth-child(3)").map(async (i, element) => {
+  const $nameCell = $(element);
+  const $link = $nameCell.find("a").first();
+  const countryURL = new URL($link.attr("href"), listingURL).href;
+
+  const $c = await download(countryURL);
+  const $label = $c("th.infobox-label")
+    .filter((i, element) => $c(element).text().trim() == "Calling code")
+    .first();
+  const callingCode = $label
+    .parent()
+    .find("td.infobox-data")
+    .first()
+    .text()
+    .trim();
+
+  console.log(`${countryURL} ${callingCode || null}`);
+});
+await Promise.all($promises.get());
 ```
 
 </details>
 
 ### Scrape authors of F1 news articles
 
-This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL:
+Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL:
 
 ```text
 https://www.theguardian.com/sport/formulaone
@@ -286,34 +296,36 @@ Hints:
 <details>
   <summary>Solution</summary>
 
-```py
-import httpx
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin
-
-def download(url):
-    response = httpx.get(url)
-    response.raise_for_status()
-    return BeautifulSoup(response.text, "html.parser")
-
-def parse_author(article_soup):
-    link = article_soup.select_one('aside a[rel="author"]')
-    if link:
-        return link.text.strip()
-    address = article_soup.select_one('aside address')
-    if address:
-        return address.text.strip()
-    return None
-
-listing_url = "https://www.theguardian.com/sport/formulaone"
-listing_soup = download(listing_url)
-for item in listing_soup.select("#maincontent ul li"):
-    link = item.select_one("a")
-    article_url = urljoin(listing_url, link["href"])
-    article_soup = download(article_url)
-    title = article_soup.select_one("h1").text.strip()
-    author = parse_author(article_soup)
-    print(f"{author}: {title}")
+```js
+import * as cheerio from 'cheerio';
+
+async function download(url) {
+  const response = await fetch(url);
+  if (response.ok) {
+    const html = await response.text();
+    return cheerio.load(html);
+  } else {
+    throw new Error(`HTTP ${response.status}`);
+  }
+}
+
+const listingURL = "https://www.theguardian.com/sport/formulaone";
+const $ = await download(listingURL);
+
+const $promises = $("#maincontent ul li").map(async (i, element) => {
+  const $item = $(element);
+  const $link = $item.find("a").first();
+  const authorURL = new URL($link.attr("href"), listingURL).href;
+
+  const $a = await download(authorURL);
+  const title = $a("h1").text().trim();
+
+  const author = $a('a[rel="author"]').text().trim();
+  const address = $a('aside address').text().trim();
+
+  console.log(`${author || address || null}: ${title}`);
+});
+await Promise.all($promises.get());
 ```
 
 </details>
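
Both new solutions lean on the same cheerio idiom: `.map()` called with an async callback produces a collection of promises, `.get()` converts that collection into a plain array, and `Promise.all` awaits them, so the detail-page downloads run concurrently rather than one by one. A minimal sketch of the pattern, using made-up markup and `example.com` URLs purely for illustration:

```js
import * as cheerio from 'cheerio';

// Hypothetical listing markup, standing in for a real downloaded page.
const $ = cheerio.load(`
  <ul>
    <li><a href="/a">A</a></li>
    <li><a href="/b">B</a></li>
  </ul>
`);

// With an async callback, .map() yields a cheerio collection whose items are promises.
const $promises = $("li a").map(async (i, element) => {
  const href = $(element).attr("href");
  // In the real solutions, this is where download(countryURL) etc. would happen.
  return new URL(href, "https://example.com").href;
});

// .get() unwraps the collection into a plain array, which Promise.all can await.
const urls = await Promise.all($promises.get());
console.log(urls); // [ 'https://example.com/a', 'https://example.com/b' ]
```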

sources/academy/webscraping/scraping_basics_python/10_crawling.md

Lines changed: 3 additions & 3 deletions
@@ -182,7 +182,7 @@ In the next lesson, we'll scrape the product detail pages so that each product v
 
 ### Scrape calling codes of African countries
 
-This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
+Scrape links to Wikipedia pages for all African states and territories. Follow each link and extract the _calling code_ from the info table. Print the URL and the calling code for each country. Start with this URL:
 
 ```text
 https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa
@@ -237,7 +237,7 @@ Hint: Locating cells in tables is sometimes easier if you know how to [navigate
 
 ### Scrape authors of F1 news articles
 
-This is a follow-up to an exercise from the previous lesson, so feel free to reuse your code. Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL:
+Scrape links to the Guardian's latest F1 news articles. For each article, follow the link and extract both the author's name and the article's title. Print the author's name and the title for all the articles. Start with this URL:
 
 ```text
 https://www.theguardian.com/sport/formulaone
@@ -273,7 +273,7 @@ Hints:
     return BeautifulSoup(response.text, "html.parser")
 
 def parse_author(article_soup):
-    link = article_soup.select_one('aside a[rel="author"]')
+    link = article_soup.select_one('a[rel="author"]')
     if link:
         return link.text.strip()
     address = article_soup.select_one('aside address')
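
The one-line change above only drops the `aside` scope from the CSS selector: `'aside a[rel="author"]'` matches author links nested inside an `<aside>`, while `'a[rel="author"]'` matches them anywhere in the page. The descendant-combinator semantics are the same whether the selector runs in BeautifulSoup or cheerio; a small cheerio sketch with made-up markup, just to show the difference:

```js
import * as cheerio from 'cheerio';

// Made-up article markup where the author link is not inside an <aside>.
const $ = cheerio.load(`
  <article>
    <a rel="author" href="/profile/jane-doe">Jane Doe</a>
    <aside><address>Guardian staff</address></aside>
  </article>
`);

// Scoped selector: only author links that are descendants of an <aside>.
console.log($('aside a[rel="author"]').length); // 0

// Unscoped selector: author links anywhere in the document.
console.log($('a[rel="author"]').text()); // Jane Doe
```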

0 commit comments