Skip to content

Commit 86bb36f

Browse files
committed
refactor: use for/of and .toArray() when looping over Cheerio selections
Also fixes a few bugs I noticed when trying out the code.
1 parent f5e89f6 commit 86bb36f

File tree

7 files changed

+86
-98
lines changed

7 files changed

+86
-98
lines changed

sources/academy/webscraping/scraping_basics_javascript2/06_locating_elements.md

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,20 +23,19 @@ const response = await fetch(url);
2323
if (response.ok) {
2424
const html = await response.text();
2525
const $ = cheerio.load(html);
26-
// highlight-next-line
27-
$(".product-item").each((i, element) => {
28-
// highlight-next-line
26+
// highlight-start
27+
for (const element of $(".product-item").toArray()) {
2928
console.log($(element).text());
30-
// highlight-next-line
31-
});
29+
}
30+
// highlight-end
3231
} else {
3332
throw new Error(`HTTP ${response.status}`);
3433
}
3534
```
3635

37-
We're using [`each()`](https://cheerio.js.org/docs/api/classes/Cheerio#each) to loop over the items in the Cheerio container. It calls the given function for each of the elements, with two arguments. The first is an index (0, 1, 2…), and the second is the element being processed.
36+
Calling [`toArray()`](https://cheerio.js.org/docs/api/classes/Cheerio#toarray) converts the Cheerio selection to a standard JavaScript array. We can then loop over that array and process each selected element.
3837

39-
Cheerio requires us to wrap the element with `$()` again before we can work with it further, and then we call `.text()`. If we run the code, it… well, it definitely prints _something_
38+
Cheerio requires us to wrap each element with `$()` again before we can work with it further, and then we call `.text()`. If we run the code, it… well, it definitely prints _something_:
4039

4140
```text
4241
$ node index.js
@@ -79,7 +78,7 @@ if (response.ok) {
7978
const html = await response.text();
8079
const $ = cheerio.load(html);
8180

82-
$(".product-item").each((i, element) => {
81+
for (const element of $(".product-item").toArray()) {
8382
const $productItem = $(element);
8483

8584
const $title = $productItem.find(".product-item__title");
@@ -89,7 +88,7 @@ if (response.ok) {
8988
const price = $price.text();
9089

9190
console.log(`${title} | ${price}`);
92-
});
91+
}
9392
} else {
9493
throw new Error(`HTTP ${response.status}`);
9594
}
@@ -175,7 +174,7 @@ if (response.ok) {
175174
const html = await response.text();
176175
const $ = cheerio.load(html);
177176

178-
$(".product-item").each((i, element) => {
177+
for (const element of $(".product-item").toArray()) {
179178
const $productItem = $(element);
180179

181180
const $title = $productItem.find(".product-item__title");
@@ -186,7 +185,7 @@ if (response.ok) {
186185
const price = $price.text();
187186

188187
console.log(`${title} | ${price}`);
189-
});
188+
}
190189
} else {
191190
throw new Error(`HTTP ${response.status}`);
192191
}
@@ -248,11 +247,11 @@ Djibouti
248247
const html = await response.text();
249248
const $ = cheerio.load(html);
250249

251-
$(".wikitable").each((i, tableElement) => {
250+
for (const tableElement of $(".wikitable").toArray()) {
252251
const $table = $(tableElement);
253252
const $rows = $table.find("tr");
254253

255-
$rows.each((j, rowElement) => {
254+
for (const rowElement of $rows.toArray()) {
256255
const $row = $(rowElement);
257256
const $cells = $row.find("td");
258257

@@ -261,12 +260,11 @@ Djibouti
261260
const $link = $thirdColumn.find("a").first();
262261
console.log($link.text());
263262
}
264-
});
265-
});
263+
}
264+
}
266265
} else {
267266
throw new Error(`HTTP ${response.status}`);
268267
}
269-
270268
```
271269

272270
Because some rows contain [table headers](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/th), we skip processing a row if `$row.find("td")` doesn't find any [table data](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td) cells.
@@ -293,11 +291,11 @@ Simplify the code from previous exercise. Use a single for loop and a single CSS
293291
const html = await response.text();
294292
const $ = cheerio.load(html);
295293

296-
$(".wikitable tr td:nth-child(3)").each((i, element) => {
294+
for (const element of $(".wikitable tr td:nth-child(3)").toArray()) {
297295
const $nameCell = $(element);
298296
const $link = $nameCell.find("a").first();
299297
console.log($link.text());
300-
});
298+
}
301299
} else {
302300
throw new Error(`HTTP ${response.status}`);
303301
}
@@ -335,9 +333,9 @@ Max Verstappen wins Canadian Grand Prix: F1 – as it happened
335333
const html = await response.text();
336334
const $ = cheerio.load(html);
337335

338-
$("#maincontent ul li h3").each((i, element) => {
336+
for (const element of $("#maincontent ul li h3").toArray()) {
339337
console.log($(element).text());
340-
});
338+
}
341339
} else {
342340
throw new Error(`HTTP ${response.status}`);
343341
}

sources/academy/webscraping/scraping_basics_javascript2/07_extracting_data.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ if (response.ok) {
7070
const html = await response.text();
7171
const $ = cheerio.load(html);
7272

73-
$(".product-item").each((i, element) => {
73+
for (const element of $(".product-item").toArray()) {
7474
const $productItem = $(element);
7575

7676
const $title = $productItem.find(".product-item__title");
@@ -87,7 +87,7 @@ if (response.ok) {
8787
}
8888

8989
console.log(`${title} | ${priceRange.minPrice} | ${priceRange.price}`);
90-
});
90+
}
9191
} else {
9292
throw new Error(`HTTP ${response.status}`);
9393
}
@@ -177,7 +177,7 @@ if (response.ok) {
177177
const html = await response.text();
178178
const $ = cheerio.load(html);
179179

180-
$(".product-item").each((i, element) => {
180+
for (const element of $(".product-item").toArray()) {
181181
const $productItem = $(element);
182182

183183
const $title = $productItem.find(".product-item__title");
@@ -200,7 +200,7 @@ if (response.ok) {
200200
}
201201

202202
console.log(`${title} | ${priceRange.minPrice} | ${priceRange.price}`);
203-
});
203+
}
204204
} else {
205205
throw new Error(`HTTP ${response.status}`);
206206
}
@@ -258,7 +258,7 @@ Denon AH-C720 In-Ear Headphones | 236
258258
const html = await response.text();
259259
const $ = cheerio.load(html);
260260

261-
$(".product-item").each((i, element) => {
261+
for (const element of $(".product-item").toArray()) {
262262
const $productItem = $(element);
263263

264264
const title = $productItem.find(".product-item__title");
@@ -268,7 +268,7 @@ Denon AH-C720 In-Ear Headphones | 236
268268
const unitsCount = parseUnitsText(unitsText);
269269

270270
console.log(`${title} | ${unitsCount}`);
271-
});
271+
}
272272
} else {
273273
throw new Error(`HTTP ${response.status}`);
274274
}
@@ -307,7 +307,7 @@ Simplify the code from previous exercise. Use [regular expressions](https://deve
307307
const html = await response.text();
308308
const $ = cheerio.load(html);
309309

310-
$(".product-item").each((i, element) => {
310+
for (const element of $(".product-item").toArray()) {
311311
const $productItem = $(element);
312312

313313
const $title = $productItem.find(".product-item__title");
@@ -317,7 +317,7 @@ Simplify the code from previous exercise. Use [regular expressions](https://deve
317317
const unitsCount = parseUnitsText(unitsText);
318318

319319
console.log(`${title} | ${unitsCount}`);
320-
});
320+
}
321321
} else {
322322
throw new Error(`HTTP ${response.status}`);
323323
}
@@ -369,7 +369,7 @@ Hints:
369369
const html = await response.text();
370370
const $ = cheerio.load(html);
371371

372-
$("#maincontent ul li").each((i, element) => {
372+
for (const element of $("#maincontent ul li").toArray()) {
373373
const $article = $(element);
374374

375375
const title = $article
@@ -383,7 +383,7 @@ Hints:
383383
const date = new Date(dateText);
384384

385385
console.log(`${title} | ${date.toDateString()}`);
386-
});
386+
}
387387
} else {
388388
throw new Error(`HTTP ${response.status}`);
389389
}

sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ if (response.ok) {
3838
const $ = cheerio.load(html);
3939

4040
// highlight-next-line
41-
const $items = $(".product-item").map((i, element) => {
41+
const data = $(".product-item").toArray().map(element => {
4242
const $productItem = $(element);
4343

4444
const $title = $productItem.find(".product-item__title");
@@ -64,15 +64,13 @@ if (response.ok) {
6464
return { title, ...priceRange };
6565
});
6666
// highlight-next-line
67-
const data = $items.get();
68-
// highlight-next-line
6967
console.log(data);
7068
} else {
7169
throw new Error(`HTTP ${response.status}`);
7270
}
7371
```
7472

75-
Instead of printing each line, we now return the data for each product as a JavaScript object. We've replaced `.each()` with [`.map()`](https://cheerio.js.org/docs/api/classes/Cheerio#map-3), which also iterates over the selection but, in addition, collects all the results and returns them as a Cheerio collection. We then convert it into a standard JavaScript array by calling [`.get()`](https://cheerio.js.org/docs/api/classes/Cheerio#call-signature-32). Near the end of the program, we print the entire array.
73+
Instead of printing each line, we now return the data for each product as a JavaScript object. We've replaced the `for` loop with [`.map()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map), which also iterates over the selection but, in addition, collects all the results and returns them as another array. Near the end of the program, we print this entire array.
7674

7775
:::tip Advanced syntax
7876

sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ if (response.ok) {
4343
const html = await response.text();
4444
const $ = cheerio.load(html);
4545

46-
const $items = $(".product-item").map((i, element) => {
46+
const data = $(".product-item").toArray().map(element => {
4747
const $productItem = $(element);
4848

4949
const $title = $productItem.find(".product-item__title");
@@ -67,7 +67,6 @@ if (response.ok) {
6767

6868
return { title, ...priceRange };
6969
});
70-
const data = $items.get();
7170

7271
const jsonData = JSON.stringify(data);
7372
await writeFile('products.json', jsonData);
@@ -190,12 +189,11 @@ async function exportCSV(data) {
190189
const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales"
191190
const $ = await download(listingURL);
192191

193-
const $items = $(".product-item").map((i, element) => {
192+
const data = $(".product-item").toArray().map(element => {
194193
const $productItem = $(element);
195194
const item = parseProduct($productItem);
196195
return item;
197196
});
198-
const data = $items.get();
199197

200198
await writeFile('products.json', exportJSON(data));
201199
await writeFile('products.csv', await exportCSV(data));
@@ -286,13 +284,12 @@ Now we'll pass the base URL to the function in the main body of our program:
286284
const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales"
287285
const $ = await download(listingURL);
288286

289-
const $items = $(".product-item").map((i, element) => {
287+
const data = $(".product-item").toArray().map(element => {
290288
const $productItem = $(element);
291289
// highlight-next-line
292290
const item = parseProduct($productItem, listingURL);
293291
return item;
294292
});
295-
const data = $items.get();
296293
```
297294

298295
When we run the scraper now, we should see full URLs in our exports:
@@ -353,12 +350,12 @@ https://en.wikipedia.org/wiki/Botswana
353350
const html = await response.text();
354351
const $ = cheerio.load(html);
355352

356-
$(".wikitable tr td:nth-child(3)").each((i, element) => {
353+
for (const element of $(".wikitable tr td:nth-child(3)").toArray()) {
357354
const nameCell = $(element);
358355
const link = nameCell.find("a").first();
359356
const url = new URL(link.attr("href"), listingURL).href;
360357
console.log(url);
361-
});
358+
}
362359
} else {
363360
throw new Error(`HTTP ${response.status}`);
364361
}
@@ -397,11 +394,11 @@ https://www.theguardian.com/sport/article/2024/sep/02/max-verstappen-damns-his-u
397394
const html = await response.text();
398395
const $ = cheerio.load(html);
399396

400-
$("#maincontent ul li").each((i, element) => {
397+
for (const element of $("#maincontent ul li").toArray()) {
401398
const link = $(element).find("a").first();
402399
const url = new URL(link.attr("href"), listingURL).href;
403400
console.log(url);
404-
});
401+
}
405402
} else {
406403
throw new Error(`HTTP ${response.status}`);
407404
}

sources/academy/webscraping/scraping_basics_javascript2/10_crawling.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,12 @@ async function exportCSV(data) {
6767
const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales"
6868
const $ = await download(listingURL);
6969

70-
const $items = $(".product-item").map((i, element) => {
70+
const data = $(".product-item").toArray().map(element => {
7171
const $productItem = $(element);
7272
// highlight-next-line
7373
const item = parseProduct($productItem, listingURL);
7474
return item;
7575
});
76-
const data = $items.get();
7776

7877
await writeFile('products.json', exportJSON(data));
7978
await writeFile('products.csv', await exportCSV(data));
@@ -131,20 +130,20 @@ But where do we put this line in our program?
131130

132131
In the `.map()` loop, we're already going through all the products. Let's expand it to include downloading the product detail page, parsing it, extracting the vendor's name, and adding it to the item object.
133132

134-
First, we need to make the loop asynchronous so that we can use `await download()` for each product. We'll add the `async` keyword to the inner function and rename the collection to `$promises`, since it will now store promises that resolve to items rather than the items themselves. We'll still convert the collection to a standard JavaScript array, but this time we'll pass it to `await Promise.all()` to resolve all the promises and retrieve the actual items.
133+
First, we need to make the loop asynchronous so that we can use `await download()` for each product. We'll add the `async` keyword to the inner function and rename the collection to `promises`, since it will now store promises that resolve to items rather than the items themselves. We'll pass it to `await Promise.all()` to resolve all the promises and retrieve the actual items.
135134

136135
```js
137136
const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales"
138137
const $ = await download(listingURL);
139138

140139
// highlight-next-line
141-
const $promises = $(".product-item").map(async (i, element) => {
140+
const promises = $(".product-item").toArray().map(async element => {
142141
const $productItem = $(element);
143142
const item = parseProduct($productItem, listingURL);
144143
return item;
145144
});
146145
// highlight-next-line
147-
const data = await Promise.all($promises.get());
146+
const data = await Promise.all(promises);
148147
```
149148

150149
The program behaves the same as before, but now the code is prepared to make HTTP requests from within the inner function. Let's do it:
@@ -153,7 +152,7 @@ The program behaves the same as before, but now the code is prepared to make HTT
153152
const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales"
154153
const $ = await download(listingURL);
155154

156-
const $promises = $(".product-item").map(async (i, element) => {
155+
const promises = $(".product-item").toArray().map(async element => {
157156
const $productItem = $(element);
158157
const item = parseProduct($productItem, listingURL);
159158

@@ -248,7 +247,8 @@ Hint: Locating cells in tables is sometimes easier if you know how to [filter](h
248247
const listingURL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_in_Africa";
249248
const $ = await download(listingURL);
250249

251-
const $promises = $(".wikitable tr td:nth-child(3)").map(async (i, element) => {
250+
const $cells = $(".wikitable tr td:nth-child(3)");
251+
const promises = $cells.toArray().map(async element => {
252252
const $nameCell = $(element);
253253
const $link = $nameCell.find("a").first();
254254
const countryURL = new URL($link.attr("href"), listingURL).href;
@@ -266,7 +266,7 @@ Hint: Locating cells in tables is sometimes easier if you know how to [filter](h
266266

267267
console.log(`${countryURL} ${callingCode || null}`);
268268
});
269-
await Promise.all($promises.get());
269+
await Promise.all(promises);
270270
```
271271

272272
</details>
@@ -314,7 +314,7 @@ Hints:
314314
const listingURL = "https://www.theguardian.com/sport/formulaone";
315315
const $ = await download(listingURL);
316316

317-
const $promises = $("#maincontent ul li").map(async (i, element) => {
317+
const promises = $("#maincontent ul li").toArray().map(async element => {
318318
const $item = $(element);
319319
const $link = $item.find("a").first();
320320
const authorURL = new URL($link.attr("href"), listingURL).href;
@@ -327,7 +327,7 @@ Hints:
327327

328328
console.log(`${author || address || null}: ${title}`);
329329
});
330-
await Promise.all($promises.get());
330+
await Promise.all(promises);
331331
```
332332

333333
</details>

0 commit comments

Comments
 (0)