Skip to content

Commit b7c35b4

Browse files
authored
Refactor CMPCollector and CookiePopupCollector (#143)
* Inject autoconsent in iframes * Rely on unique execution context ids in CMPCollector and CookiePopupCollector * Fix CMPCollector tests * Add exception handlers around CDP calls * tweak cmp timeouts * helpful log message * Log which site timed out * Refactor: Merge CookiePopupCollector and CMPCollector * lint fixes * Encode unique context id in the binding name to prevent collisions * remove unused variable * Rename CMPCollector into CookiePopupsCollector * Remove redundant CDP calls * remove shadowing variables * clean up when execution contexts are destroyed * do not log stack trace * lint fix * Fix tests * Limit cookie consent scraping and add timing logs * Increase the total timeout (allow more slow pages) and add more timing logs * Do not let remote browser throw during closing * Limit browser close timeout * Clean up getData() * lint * Template function for autoconsent script * Protect against screenshot timeouts * Add more logs * Lint fix
1 parent fa23d22 commit b7c35b4

19 files changed

+553
-469
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ Available options:
3232
- `--chromium-version <version_number>` - use custom version of Chromium (e.g. "843427") instead of using the default
3333
- `--selenium-hub <url>` - If provided, browsers will be requested from selenium hub instead of spawning local processes (e.g. `--selenium-hub http://my-selenium-hub-host:4444`).
3434
- `--config <path>` - path to a config file that allows you to set all the above settings (and more). Note that CLI flags have a higher priority than settings passed via config. You can find a sample config file in `tests/cli/sampleConfig.json`.
35-
- `--autoconsent-action <action>` - automatic autoconsent action (requires the `cmps` collector). Possible values: optIn, optOut
35+
- `--autoconsent-action <action>` - automatic autoconsent action (requires the `cookiepopups` collector). Possible values: optIn, optOut
3636

3737
### Use it as a module
3838

@@ -97,7 +97,7 @@ const data = await crawler(new URL('https://example.com'), {
9797
## Output format
9898

9999
Each successfully crawled website will create a separate file named after the website (when using the CLI tool). Output data format is specified in `crawler.js` (see `CollectResult` type definition).
100-
Additionally, for each crawl `metadata.json` file will be created containing crawl configuration, system configuration and some high-level stats.
100+
Additionally, for each crawl `metadata.json` file will be created containing crawl configuration, system configuration and some high-level stats.
101101

102102
## Data post-processing
103103

browser/BaseBrowser.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ class BaseBrowser {
66
throw new Error('Not implemented');
77
}
88

9+
/**
10+
* @returns {Promise<void>}
11+
*/
912
close() {
1013
throw new Error('Not implemented');
1114
}

browser/RemoteChrome.js

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,19 @@ class RemoteChrome extends BaseBrowser {
9898
try {
9999
await this.connection.send('Browser.close');
100100
} catch (error) {
101-
console.error('Error when closing browser connection', error);
101+
console.error('Error when closing browser connection', error.message);
102102
}
103-
this.connection.dispose();
103+
try {
104+
this.connection.dispose();
105+
} catch (error) {
106+
console.error('Error when disposing browser connection', error.message);
107+
}
108+
}
109+
try {
110+
await this.driver?.quit();
111+
} catch (error) {
112+
console.error('Error when quitting driver', error.message);
104113
}
105-
await this.driver?.quit();
106114
}
107115

108116
/**

cli/crawl-cli.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ program
2727
.option('-r, --region-code <region>', 'optional 2 letter region code. Used for metadata only.')
2828
.option('-a, --disable-anti-bot', 'disable anti bot detection protections injected to every frame')
2929
.option('--config <path>', 'crawl configuration file')
30-
.option('--autoconsent-action <action>', 'dismiss cookie popups. Possible values: optOut, optIn. Works only when cmps collector is enabled.')
30+
.option('--autoconsent-action <action>', 'dismiss cookie popups. Possible values: optOut, optIn. Works only when cookiepopups collector is enabled.')
3131
.option('--chromium-version <version_number>', 'use custom version of chromium')
3232
.option('--selenium-hub <url>', 'selenium hub endpoint to request browsers from')
3333
.parse(process.argv);
@@ -137,7 +137,7 @@ async function run({
137137
* @type {Array<Array<number>>}
138138
*/
139139
let crawlTimes = [];
140-
140+
141141
// eslint-disable-next-line arrow-parens
142142
const updateProgress = (/** @type {string} */site = '', /** @type {import('../crawler').CollectResult} */data) => {
143143
reporters.forEach(reporter => {
@@ -274,7 +274,7 @@ if (!config.urls || !config.output) {
274274
dataCollectors: item.dataCollectors.map(id => createCollector(id))
275275
};
276276
}
277-
277+
278278
return item;
279279
});
280280

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
const BaseCollector = require('./BaseCollector');
2+
3+
const ISOLATED_WORLD_PREFIX = 'iw_for_';
4+
5+
/**
6+
* @param {String|Error} e
7+
*/
8+
function isIgnoredCDPError(e) {
9+
// ignore evaluation errors (sometimes frames reload too fast)
10+
const error = (typeof e === 'string') ? e : e.message;
11+
return (
12+
error.includes('TargetCloseError:') ||
13+
error.includes('No frame for given id found') ||
14+
error.includes('Target closed') ||
15+
error.includes('Session closed') ||
16+
error.includes('Cannot find context with specified id') ||
17+
error.includes('uniqueContextId not found')
18+
);
19+
}
20+
21+
/**
22+
* @abstract
23+
* Base class for collectors that need to create isolated worlds for each frame.
24+
*/
25+
class ContentScriptCollector extends BaseCollector {
26+
/**
27+
* @param {import('./BaseCollector').CollectorInitOptions} options
28+
*/
29+
init(options) {
30+
this.log = options.log;
31+
/**
32+
* maps isolated world uniqueId to page world uniqueId
33+
* @type {Map<import('devtools-protocol/types/protocol').Protocol.Runtime.ExecutionContextDescription['uniqueId'], import('devtools-protocol/types/protocol').Protocol.Runtime.ExecutionContextDescription['uniqueId']>}
34+
*/
35+
this.isolated2pageworld = new Map();
36+
/**
37+
* maps isolated world uniqueId to CDPSession
38+
* @type {Map<import('devtools-protocol/types/protocol').Protocol.Runtime.ExecutionContextDescription['uniqueId'], import('puppeteer-core').CDPSession>}
39+
*/
40+
this.cdpSessions = new Map();
41+
this.iwPrefix = `${ISOLATED_WORLD_PREFIX}${this.id()}_`;
42+
}
43+
44+
/**
45+
* @param {import('puppeteer-core').CDPSession} session
46+
* @param {import('devtools-protocol/types/protocol').Protocol.Target.TargetInfo} targetInfo
47+
*/
48+
addTarget(session, targetInfo) {
49+
if (targetInfo.type !== 'page' && targetInfo.type !== 'iframe') {
50+
return;
51+
}
52+
53+
session.on('Runtime.executionContextDestroyed', ({executionContextUniqueId}) => {
54+
this.log(`context destroyed ${executionContextUniqueId}`);
55+
this.isolated2pageworld.delete(executionContextUniqueId);
56+
this.cdpSessions.delete(executionContextUniqueId);
57+
});
58+
59+
// inject the content script into every frame in isolated world
60+
session.on('Runtime.executionContextCreated', async ({context}) => {
61+
// new isolated world for our content script
62+
if (context.auxData.type === 'isolated' && context.name.startsWith(this.iwPrefix)) {
63+
const pageWorldUniqueId = context.name.slice(this.iwPrefix.length);
64+
this.log(`isolated world created ${context.uniqueId} for ${pageWorldUniqueId}`);
65+
this.isolated2pageworld.set(context.uniqueId, pageWorldUniqueId);
66+
this.cdpSessions.set(context.uniqueId, session);
67+
await this.onIsolatedWorldCreated(session, context);
68+
return;
69+
}
70+
71+
// ignore other special contexts
72+
if (!context.origin || context.origin === '://' || context.auxData.type !== 'default') {
73+
return;
74+
}
75+
76+
this.log(`creating isolated world for ${context.uniqueId}`);
77+
// request an isolated world for this frame
78+
try {
79+
await session.send('Page.createIsolatedWorld', {
80+
frameId: context.auxData.frameId,
81+
worldName: `${this.iwPrefix}${context.uniqueId}`,
82+
});
83+
} catch (e) {
84+
if (!this.isIgnoredCdpError(e)) {
85+
this.log(`Error creating isolated world for ${context.uniqueId}: ${e}`);
86+
}
87+
}
88+
});
89+
}
90+
91+
/**
92+
* @abstract
93+
* @param {import('puppeteer-core').CDPSession} session
94+
* @param {import('devtools-protocol/types/protocol').Protocol.Runtime.ExecutionContextDescription} context
95+
* @returns {Promise<void>}
96+
*/
97+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
98+
onIsolatedWorldCreated(session, context) {
99+
throw new Error('Not implemented');
100+
}
101+
102+
/**
103+
* @param {Error|string} e
104+
*/
105+
isIgnoredCdpError(e) {
106+
return isIgnoredCDPError(e);
107+
}
108+
}
109+
110+
module.exports = ContentScriptCollector;

collectors/CookiePopupCollector.js

Lines changed: 0 additions & 137 deletions
This file was deleted.

collectors/CookiePopups/scrapeScript.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ function getUniqueSelector(el) {
212212
}
213213

214214
/**
215-
* @returns {import('../CookiePopupCollector').ContentScriptResult}
215+
* @returns {import('../CookiePopupsCollector').ScrapeScriptResult}
216216
*/
217217
function collectPotentialPopups() {
218218
const isFramed = window.top !== window || location.ancestorOrigins?.length > 0;

0 commit comments

Comments
 (0)