Commit 9f8727d

feat: update the rest of scraping variants to be about JS
1 parent 8736bbc

2 files changed: +120 -107 lines changed


sources/academy/webscraping/scraping_basics_javascript2/11_scraping_variants.md

Lines changed: 118 additions & 107 deletions
@@ -197,110 +197,121 @@ Perhaps surprisingly, some products with variants will have the price field set.
 
 ## Parsing price
 
-The items now contain the variant as text, which is good for a start, but we want the price to be in the `price` key. Let's introduce a new function to handle that:
+The items now contain the variant as text, which is good for a start, but we want the price to be in the `price` property. Let's introduce a new function to handle that:
 
-```py
-def parse_variant(variant):
-    text = variant.text.strip()
-    name, price_text = text.split(" - ")
-    price = Decimal(
-        price_text
-        .replace("$", "")
-        .replace(",", "")
-    )
-    return {"variant_name": name, "price": price}
+```js
+function parseVariant($option) {
+  const [variantName, priceText] = $option
+    .text()
+    .trim()
+    .split(" - ");
+  const price = parseInt(
+    priceText
+      .replace("$", "")
+      .replace(".", "")
+      .replace(",", "")
+  );
+  return { variantName, price };
+}
 ```
 
-First, we split the text into two parts, then we parse the price as a decimal number. This part is similar to what we already do for parsing product listing prices. The function returns a dictionary we can merge with `item`.
+First, we split the text into two parts, then we parse the price as a number. This part is similar to what we already do for parsing product listing prices. The function returns an object we can merge with `item`.
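
The merge itself is plain object spread. Here is a minimal sketch of it, with sample values borrowed from the output shown further down; the snippet itself is illustrative and not part of the lesson's code:

```js
// Hypothetical sample values, shaped like the scraper's real output.
const item = {
  title: "Sony XB-950B1 Extra Bass Wireless Headphones with App Control",
  minPrice: 12800,
  price: null,
  vendor: "Sony",
};
const variant = { variantName: "Red", price: 17800 };

// Spreading `variant` after `item` lets the variant's price
// overwrite the product-level `price`.
const merged = { ...item, ...variant };
console.log(merged.price); // 17800
```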
 
 ## Saving price
 
 Now, if we use our new function, we should finally get a program that can scrape exact prices for all products, even if they have variants. The whole code should look like this now:
 
-```py
-import httpx
-from bs4 import BeautifulSoup
-from decimal import Decimal
-import json
-import csv
-from urllib.parse import urljoin
-
-def download(url):
-    response = httpx.get(url)
-    response.raise_for_status()
-
-    html_code = response.text
-    return BeautifulSoup(html_code, "html.parser")
-
-def parse_product(product, base_url):
-    title_element = product.select_one(".product-item__title")
-    title = title_element.text.strip()
-    url = urljoin(base_url, title_element["href"])
-
-    price_text = (
-        product
-        .select_one(".price")
-        .contents[-1]
-        .strip()
-        .replace("$", "")
-        .replace(",", "")
-    )
-    if price_text.startswith("From "):
-        min_price = Decimal(price_text.removeprefix("From "))
-        price = None
-    else:
-        min_price = Decimal(price_text)
-        price = min_price
-
-    return {"title": title, "min_price": min_price, "price": price, "url": url}
-
-def parse_variant(variant):
-    text = variant.text.strip()
-    name, price_text = text.split(" - ")
-    price = Decimal(
-        price_text
-        .replace("$", "")
-        .replace(",", "")
-    )
-    return {"variant_name": name, "price": price}
-
-def export_csv(file, data):
-    fieldnames = list(data[0].keys())
-    writer = csv.DictWriter(file, fieldnames=fieldnames)
-    writer.writeheader()
-    for row in data:
-        writer.writerow(row)
-
-def export_json(file, data):
-    def serialize(obj):
-        if isinstance(obj, Decimal):
-            return str(obj)
-        raise TypeError("Object not JSON serializable")
-
-    json.dump(data, file, default=serialize, indent=2)
-
-listing_url = "https://warehouse-theme-metal.myshopify.com/collections/sales"
-listing_soup = download(listing_url)
-
-data = []
-for product in listing_soup.select(".product-item"):
-    item = parse_product(product, listing_url)
-    product_soup = download(item["url"])
-    item["vendor"] = product_soup.select_one(".product-meta__vendor").text.strip()
-
-    if variants := product_soup.select(".product-form__option.no-js option"):
-        for variant in variants:
-            # highlight-next-line
-            data.append(item | parse_variant(variant))
-    else:
-        item["variant_name"] = None
-        data.append(item)
-
-with open("products.csv", "w") as file:
-    export_csv(file, data)
-
-with open("products.json", "w") as file:
-    export_json(file, data)
+```js
+import * as cheerio from 'cheerio';
+import { writeFile } from 'fs/promises';
+import { AsyncParser } from '@json2csv/node';
+
+async function download(url) {
+  const response = await fetch(url);
+  if (response.ok) {
+    const html = await response.text();
+    return cheerio.load(html);
+  } else {
+    throw new Error(`HTTP ${response.status}`);
+  }
+}
+
+function parseProduct(productItem, baseURL) {
+  const title = productItem.find(".product-item__title");
+  const titleText = title.text().trim();
+  const url = new URL(title.attr("href"), baseURL).href;
+
+  const price = productItem.find(".price").contents().last();
+  const priceRange = { minPrice: null, price: null };
+  const priceText = price
+    .text()
+    .trim()
+    .replace("$", "")
+    .replace(".", "")
+    .replace(",", "");
+
+  if (priceText.startsWith("From ")) {
+    priceRange.minPrice = parseInt(priceText.replace("From ", ""));
+  } else {
+    priceRange.minPrice = parseInt(priceText);
+    priceRange.price = priceRange.minPrice;
+  }
+
+  return { url, title: titleText, ...priceRange };
+}
+
+async function exportJSON(data) {
+  return JSON.stringify(data, null, 2);
+}
+
+async function exportCSV(data) {
+  const parser = new AsyncParser();
+  return await parser.parse(data).promise();
+}
+
+// highlight-start
+function parseVariant($option) {
+  const [variantName, priceText] = $option
+    .text()
+    .trim()
+    .split(" - ");
+  const price = parseInt(
+    priceText
+      .replace("$", "")
+      .replace(".", "")
+      .replace(",", "")
+  );
+  return { variantName, price };
+}
+// highlight-end
+
+const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales"
+const $ = await download(listingURL);
+
+const $promises = $(".product-item").map(async (i, element) => {
+  const $productItem = $(element);
+  const item = parseProduct($productItem, listingURL);
+
+  const $p = await download(item.url);
+  item.vendor = $p(".product-meta__vendor").text().trim();
+
+  const $items = $p(".product-form__option.no-js option").map((j, element) => {
+    // highlight-next-line
+    const variant = parseVariant($(element));
+    // highlight-next-line
+    return { ...item, ...variant };
+  });
+
+  if ($items.length > 0) {
+    return $items.get();
+  }
+  return [{ variantName: null, ...item }];
+});
+const itemLists = await Promise.all($promises.get());
+const data = itemLists.flat();
+
+await writeFile('products.json', await exportJSON(data));
+await writeFile('products.csv', await exportCSV(data));
 ```
 
 Let's run the scraper and see if all the items in the data contain prices:
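
To check that programmatically rather than by eye, one could run a small script against the exported file. This is a hypothetical helper, not part of the commit; it only assumes `products.json` was written by the program above:

```js
import { readFile } from 'fs/promises';

// Count exported items that still have no price.
const data = JSON.parse(await readFile('products.json', 'utf8'));
const missingPrice = data.filter((item) => item.price === null);
console.log(`${missingPrice.length} of ${data.length} items have no price`);
```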
@@ -310,26 +321,26 @@ Let's run the scraper and see if all the items in the data contain prices:
 [
   ...
   {
-    "variant_name": "Red",
-    "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control",
-    "min_price": "128.00",
-    "price": "178.00",
     "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xb950-extra-bass-wireless-headphones-with-app-control",
-    "vendor": "Sony"
+    "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control",
+    "minPrice": 12800,
+    "price": 17800,
+    "vendor": "Sony",
+    "variantName": "Red"
   },
   {
-    "variant_name": "Black",
-    "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control",
-    "min_price": "128.00",
-    "price": "178.00",
     "url": "https://warehouse-theme-metal.myshopify.com/products/sony-xb950-extra-bass-wireless-headphones-with-app-control",
-    "vendor": "Sony"
+    "title": "Sony XB-950B1 Extra Bass Wireless Headphones with App Control",
+    "minPrice": 12800,
+    "price": 17800,
+    "vendor": "Sony",
+    "variantName": "Black"
   },
   ...
 ]
 ```
 
-Success! We managed to build a Python application for watching prices!
+Success! We managed to build a Node.js application for watching prices!
 
 Is this the end? Maybe! In the next lesson, we'll use a scraping framework to build the same application, but with less code, faster requests, and better visibility into what's happening while we wait for the program to finish.
 
sources/academy/webscraping/scraping_basics_python/11_scraping_variants.md

Lines changed: 2 additions & 0 deletions
@@ -221,6 +221,7 @@ def parse_product(product, base_url):
 
     return {"title": title, "min_price": min_price, "price": price, "url": url}
 
+# highlight-start
 def parse_variant(variant):
     text = variant.text.strip()
     name, price_text = text.split(" - ")
@@ -230,6 +231,7 @@ def parse_variant(variant):
         .replace(",", "")
     )
     return {"variant_name": name, "price": price}
+# highlight-end
 
 def export_json(file, data):
     def serialize(obj):
