-
Notifications
You must be signed in to change notification settings - Fork 11
Description
I am trying to scrape printer and cartridge compatibility data from a specific website (https://www.cartridgesave.co.uk/printers.html?p=1), but not all of the printers/cartridges are scraped. Even after increasing the delay to 20000ms and pageLoadDelay to 15000ms, I got only 183 printers out of a total of 467; some printers (and even cartridges) are being skipped, and I can't figure out why.
I started a scrape using this sitemap in the Chrome browser using the extension, and it seems to scrape all the items as it is supposed to.
I believe the problem is in the SelectorLink, which is skipping some items. In the extension I ran an element preview and all of the items are properly selected, so I still have no clue.
Here is my sitemap:
{"_id":"printers-test-amount","startUrl":["https://www.cartridgesave.co.uk/printers.html?p=1"],"selectors":[{"id":"pagination","type":"SelectorLink","parentSelectors":["_root","pagination"],"selector":".search div:nth-of-type(2) a.next","multiple":true,"delay":0},{"id":"product-link","type":"SelectorLink","parentSelectors":["_root","pagination"],"selector":".product-item-inner a.product-item-link","multiple":true,"delay":0},{"id":"ManufacturerPartNo","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Manufacturer Part No.:') td","multiple":false,"regex":"","delay":0},{"id":"Brand","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Brand:') td","multiple":false,"regex":"","delay":0},{"id":"ProductType","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Product Type:') td","multiple":false,"regex":"","delay":0},{"id":"Connectivity","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Connectivity:') td","multiple":false,"regex":"","delay":0},{"id":"Height","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Height:') td","multiple":false,"regex":"","delay":0},{"id":"Width","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Width:') td","multiple":false,"regex":"","delay":0},{"id":"Depth","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Depth:') td","multiple":false,"regex":"","delay":0},{"id":"CartridgesLink","type":"SelectorLink","parentSelectors":["product-link"],"selector":"a.catridge_printer_link","multiple":false,"delay":0},{"id":"Catridges","type":"SelectorLink","parentSelectors":["CartridgesLink"],"selector":".product-item-inner 
a.product-item-link","multiple":true,"delay":0},{"id":"CatridgesModel","type":"SelectorText","parentSelectors":["Catridges"],"selector":"#information tr:contains('Manufacturer Part No.:') td","multiple":false,"regex":"","delay":0},{"id":"Title","type":"SelectorText","parentSelectors":["product-link"],"selector":"span[itemprop='name']","multiple":false,"regex":"","delay":0},{"id":"ShippingWeight","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Shipping weight:') td","multiple":false,"regex":"","delay":0},{"id":"Functionality","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information td[data-th='Functionality']","multiple":false,"regex":"","delay":0},{"id":"ColourMono","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Colour or mono:') td","multiple":false,"regex":"","delay":0},{"id":"PaperSize","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Paper size:') td","multiple":false,"regex":"","delay":0},{"id":"StandardTrayMediaTypes","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Standard tray media types:') td","multiple":false,"regex":"","delay":0},{"id":"ISOASeriesSizes","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('ISO A-series sizes (A0...A9):') td","multiple":false,"regex":"","delay":0},{"id":"ISOBSeriesSizes","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('ISO B-series sizes (B0...B9):') td","multiple":false,"regex":"","delay":0},{"id":"NonISOSizes","type":"SelectorText","parentSelectors":["product-link"],"selector":"#information tr:contains('Non-ISO print media sizes:') td","multiple":false,"regex":"","delay":0}]}
It goes through all the pages (I checked the results and I have printers from the last page, page 47), but for some reason there are printers missing, and I need this module to work flawlessly.
Any thoughts on this issue?
My code is simple:
const webscraper = require('web-scraper-headless');
const fs = require('fs');
const util = require('util');

// Promise-returning wrapper around fs.writeFile so it can be awaited.
const writeFilePromise = util.promisify(fs.writeFile);

// Options handed to the scraper; both values are in milliseconds.
const scraperOpts = {
  delay: 2000,
  pageLoadDelay: 2000,
};

// Placeholder: substitute the full sitemap object here.
const sitemap = {_id:'mysitemap_above'};

/**
 * Runs the scraper with `sitemap` and `scraperOpts`, then writes the
 * JSON-serialized result to "data.json" in the current working directory.
 *
 * @returns {Promise<void>} Resolves once the file has been written; rejects
 *   if the scraper call or the file write fails.
 */
async function start(){
  const scrapedData = await webscraper(sitemap, scraperOpts);
  await writeFilePromise("data.json", JSON.stringify(scrapedData));
}

// Do not leave the promise floating: report any failure and signal it to the
// caller through a non-zero exit code instead of an unhandled rejection.
start().catch((err) => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});
Thank you in advance!