Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class PuppeteerPlugin {
await blockNavigation(page, url);
}

await page.goto(url, this.gotoOptions);
const puppeteerResponse = await page.goto(url, this.gotoOptions);

if (this.scrollToBottom) {
await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN);
Expand All @@ -56,10 +56,12 @@ class PuppeteerPlugin {
const content = await page.content();
await page.close();

// convert utf-8 -> binary string because website-scraper needs binary
return Buffer.from(content).toString('binary');
const encoding = extractEncodingFromHeader(puppeteerResponse.headers());
const body = Buffer.from(content).toString(encoding);

return { body, encoding };
} else {
return response.body;
return { body: response.body };
}
});

Expand Down Expand Up @@ -91,4 +93,10 @@ async function blockNavigation (page, url) {
await page.setRequestInterception(true);
}

/**
 * Picks the Buffer encoding name to use for a response body, based on the
 * charset advertised in the response's Content-Type header.
 *
 * @param {Object} headers - response headers; the content type is read from
 *   the lower-case `content-type` key.
 * @returns {string} `'utf8'` when the content type mentions UTF-8,
 *   otherwise `'binary'`.
 */
function extractEncodingFromHeader (headers) {
	const contentTypeHeader = headers['content-type'];

	// Charset labels are case-insensitive ("charset=UTF-8" is as common as
	// "charset=utf-8") and "utf8" is an accepted label for UTF-8, so match
	// both spellings regardless of case. Missing or non-string values fall
	// back to 'binary'.
	const isUtf8 = typeof contentTypeHeader === 'string' && /utf-?8/i.test(contentTypeHeader);

	return isUtf8 ? 'utf8' : 'binary';
}

export default PuppeteerPlugin;
2 changes: 1 addition & 1 deletion test/mock/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* with cheerio and website-scraper itself.
* See https://github.com/cheeriojs/cheerio/pull/2280
*/
document.getElementById('special-characters-test').innerText = '7년 동안 한국에서 살았어요. Слава Україні!';
document.getElementById('special-characters-test').innerText = '7년 동안 한국에서 살았어요. Слава Україні! 磁致伸缩位移传感器 影响大跨度桥梁施工控制的因素';
};
</script>

Expand Down
2 changes: 1 addition & 1 deletion test/puppeteer-plugin.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ describe('Puppeteer plugin test', () => {
});

it('should render special characters correctly', async () => {
expect(content).to.contain('<div id="special-characters-test">7년 동안 한국에서 살았어요. Слава Україні!</div>');
expect(content).to.contain('<div id="special-characters-test">7년 동안 한국에서 살았어요. Слава Україні! 磁致伸缩位移传感器 影响大跨度桥梁施工控制的因素</div>');
});
});

Expand Down