Skip to content

Commit 22b2b14

Browse files
authored
Add blockNavigation option (#17)
1 parent 686aebb commit 22b2b14

File tree

7 files changed

+114
-28
lines changed

7 files changed

+114
-28
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ scrape({
3232
```
3333
The Puppeteer plugin constructor accepts the following params:
3434
* `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/GoogleChrome/puppeteer/blob/v1.20.0/docs/api.md#puppeteerlaunchoptions)
35-
* `scrollToBottom` - *(optional)* - in some cases, the page needs to be scrolled down to render its assets (lazyloading). Because some pages can be really endless, the scrolldown process can be interrupted before reaching the bottom when one or both of the bellow limitations are reached :
35+
* `scrollToBottom` - *(optional)* - in some cases, the page needs to be scrolled down to render its assets (lazy loading). Because some pages can be really endless, the scrolldown process can be interrupted before reaching the bottom when one or both of the below limitations are reached:
3636
* `timeout` - in milliseconds
3737
* `viewportN` - viewport height multiplier
38+
* `blockNavigation` - *(optional)* - defines whether navigation away from the page is permitted or not. If it is set to true, then the page is locked to the current url and redirects with `location.replace(anotherPage)` will not pass. Defaults to `false`
3839

3940
## How it works
4041
It starts Chromium in headless mode, which just opens the page and waits until the page is loaded.

index.js renamed to lib/index.js

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
const puppeteer = require('puppeteer');
2+
const logger = require('./logger.js');
23

34
class PuppeteerPlugin {
45
constructor({
56
launchOptions = {},
6-
scrollToBottom = null
7+
scrollToBottom = null,
8+
blockNavigation = false
79
} = {}) {
810
this.launchOptions = launchOptions;
911
this.scrollToBottom = scrollToBottom;
12+
this.blockNavigation = blockNavigation;
1013
this.browser = null;
1114
this.headers = {};
15+
16+
logger.info('init plugin', { launchOptions, scrollToBottom, blockNavigation });
1217
}
1318

1419
apply(registerAction) {
@@ -28,19 +33,26 @@ class PuppeteerPlugin {
2833
const isHtml = contentType && contentType.split(';')[0] === 'text/html';
2934
if (isHtml) {
3035
const url = response.request.href;
31-
3236
const page = await this.browser.newPage();
37+
3338
if (hasValues(this.headers)) {
39+
logger.info('set headers to puppeteer page', this.headers);
3440
await page.setExtraHTTPHeaders(this.headers);
3541
}
42+
43+
if (this.blockNavigation) {
44+
await blockNavigation(page, url);
45+
}
46+
3647
await page.goto(url);
3748

38-
if(this.scrollToBottom) {
49+
if (this.scrollToBottom) {
3950
await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN);
4051
}
4152

4253
const content = await page.content();
4354
await page.close();
55+
4456
// convert utf-8 -> binary string because website-scraper needs binary
4557
return Buffer.from(content).toString('binary');
4658
} else {
@@ -58,6 +70,8 @@ function hasValues(obj) {
5870

5971

6072
async function scrollToBottom(page, timeout, viewportN) {
73+
logger.info(`scroll puppeteer page to bottom ${viewportN} times with timeout = ${timeout}`);
74+
6175
await page.evaluate(async (timeout, viewportN) => {
6276
await new Promise((resolve, reject) => {
6377
let totalHeight = 0, distance = 200, duration = 0, maxHeight = window.innerHeight * viewportN;
@@ -74,4 +88,17 @@ async function scrollToBottom(page, timeout, viewportN) {
7488
}, timeout, viewportN);
7589
}
7690

91+
/**
 * Lock a puppeteer page to a single url.
 *
 * Registers a 'request' listener on the page and then enables request interception.
 * An intercepted request is aborted (error code 'aborted') when it is a navigation
 * request, belongs to the page's main frame and its url is not strictly equal to `url`;
 * every other request is continued unchanged.
 *
 * @param {Object} page - puppeteer page; `on`, `mainFrame` and `setRequestInterception` are used
 * @param {string} url - the only url the main frame may navigate to (compared with `!==`)
 * @returns {Promise<void>} resolves once request interception has been enabled
 */
async function blockNavigation(page, url) {
	logger.info(`block navigation for puppeteer page from url ${url}`);

	page.on('request', req => {
		const leavesPage = req.isNavigationRequest() && req.frame() === page.mainFrame() && req.url() !== url;
		const action = leavesPage ? 'abort' : 'continue';
		const handled = leavesPage ? req.abort('aborted') : req.continue();

		// Nothing awaits an event listener, so a rejected abort()/continue() promise
		// would otherwise surface as an unhandled promise rejection.
		Promise.resolve(handled).catch(err => {
			logger.warn(`failed to ${action} request ${req.url()}`, err);
		});
	});
	await page.setRequestInterception(true);
}
103+
77104
module.exports = PuppeteerPlugin;

lib/logger.js

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Namespaced `debug` loggers for the plugin, one per log level.
const debug = require('debug');

const appName = 'website-scraper-puppeteer';
const logLevels = ['error', 'warn', 'info', 'debug', 'log'];

// Build an object keyed by log level; each value is a debug instance
// created with the namespace `<appName>:<level>`.
const logger = logLevels.reduce((loggers, level) => {
	loggers[level] = debug(`${appName}:${level}`);
	return loggers;
}, {});

module.exports = logger;

package-lock.json

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"version": "0.1.4",
44
"description": "Plugin for website-scraper which returns html for dynamic websites using puppeteer",
55
"readmeFilename": "README.md",
6-
"main": "index.js",
6+
"main": "lib/index.js",
77
"keywords": [
88
"website-scraper",
99
"puppeteer",
@@ -13,6 +13,7 @@
1313
"html"
1414
],
1515
"dependencies": {
16+
"debug": "^4.1.1",
1617
"puppeteer": "^2.0.0"
1718
},
1819
"peerDependencies": {
@@ -40,6 +41,6 @@
4041
},
4142
"homepage": "https://github.com/website-scraper/website-scraper-puppeteer#readme",
4243
"files": [
43-
"index.js"
44+
"lib"
4445
]
4546
}

test/mock/navigation.html

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Test</title>
6+
</head>
7+
<body>
<!-- Fixture for the "Block navigation" test: on load, the script below asks the browser to replace the location with http://example.com and then writes 'Navigation blocked!' into #root; the test expects that text in the scraped page. -->
8+
9+
<div id="root"></div>
10+
11+
<script>
12+
window.onload = function() {
13+
window.location.replace('http://example.com');
14+
document.getElementById('root').innerText = 'Navigation blocked!';
15+
};
16+
</script>
17+
18+
</body>
19+
</html>

test/puppeteer-plugin.test.js

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,38 +4,65 @@ const finalhandler = require('finalhandler');
44
const serveStatic = require('serve-static');
55
const fs = require('fs-extra');
66
const scrape = require('website-scraper');
7-
const PuppeteerPlugin = require('../index');
7+
const PuppeteerPlugin = require('../lib');
88

99
const directory = __dirname + '/tmp';
10+
const SERVE_WEBSITE_PORT = 4567;
1011

1112
describe('Puppeteer plugin test', () => {
1213
let result, content;
1314

14-
before('serve website', () => serveWebsite(4567));
15-
before('scrape website', async () => {
16-
result = await scrape({
17-
urls: ['http://localhost:4567'],
18-
directory: directory,
19-
plugins: [ new PuppeteerPlugin() ]
15+
before('serve website', () => serveWebsite(SERVE_WEBSITE_PORT));
16+
17+
describe('Dynamic content', () => {
18+
before('scrape website', async () => {
19+
result = await scrape({
20+
urls: [`http://localhost:${SERVE_WEBSITE_PORT}`],
21+
directory: directory,
22+
plugins: [ new PuppeteerPlugin() ]
23+
});
2024
});
21-
});
22-
before('get content from file', () => {
23-
content = fs.readFileSync(`${directory}/${result[0].filename}`).toString();
24-
});
25+
before('get content from file', () => {
26+
content = fs.readFileSync(`${directory}/${result[0].filename}`).toString();
27+
});
28+
after('delete dir', () => fs.removeSync(directory));
2529

26-
after('delete dir', () => fs.removeSync(directory));
30+
it('should have 1 item in result array', () => {
31+
expect(result.length).eql(1);
32+
});
2733

28-
it('should have 1 item in result array', () => {
29-
expect(result.length).eql(1);
30-
});
34+
it('should render dymanic website', async () => {
35+
expect(content).to.contain('<div id="root">Hello world from JS!</div>');
36+
});
3137

32-
it('should render dymanic website', async () => {
33-
expect(content).to.contain('<div id="root">Hello world from JS!</div>');
38+
it('should render special characters correctly', async () => {
39+
expect(content).to.contain('<div id="special-characters-test">저는 7년 동안 한국에서 살았어요. Слава Україні!</div>');
40+
});
3441
});
3542

36-
it('should render special characters correctly', async () => {
37-
expect(content).to.contain('<div id="special-characters-test">저는 7년 동안 한국에서 살았어요. Слава Україні!</div>');
43+
describe('Block navigation', () => {
44+
before('scrape website', async () => {
45+
result = await scrape({
46+
urls: [`http://localhost:${SERVE_WEBSITE_PORT}/navigation.html`],
47+
directory: directory,
48+
plugins: [
49+
new PuppeteerPlugin({
50+
blockNavigation: true
51+
})
52+
]
53+
});
54+
});
55+
before('get content from file', () => {
56+
content = fs.readFileSync(`${directory}/${result[0].filename}`).toString();
57+
});
58+
after('delete dir', () => fs.removeSync(directory));
59+
60+
it('should render content (and not be redirected)', async () => {
61+
expect(content).to.contain('<div id="root">Navigation blocked!</div>');
62+
});
3863
});
64+
65+
3966
});
4067

4168
function serveWebsite(port = 3000) {

0 commit comments

Comments
 (0)