Skip to content

Commit 52b9543

Browse files
authored
Detect cookie popups based on full document text (#148)
* Define the desired data types * Collect document-level text and buttons for cookie popup detection * Prevent empty result entries * Annotate buttons in extracted text * Switch to a simpler document text scraping * Add an LLM detection script * Skip some shadowroot elements * Add heuristic to filter common false positives * use document.body when possible * Add regex-based detection * Fix detection script after refactoring * Move all LLM classification into one script * Move button text verification into the llm script * Adapt the rule generation script after the crawler refactoring * Cleanup: move the types into a separate file * cleanup: refactor the generation functions * Lint fixes * fix unit tests * Clean up: split out classification utility functions * Parallelize crawl processing * Classify buttons in the document context * Do not render progress bar in CI * Synchronize autoconsent with the scraping job * More verbose lifecycle logs * Add two columns in clickhouse * populate the new detection columns * Fix typo * Remove unnecessary import * Escape whitespaces in selector components * Handle scrapescript errors better * Defend against DOM clobbering * Clarify the comment * Lint fix * Clarify the TS types * minor code style fix * Fix lint errors * Fix unit test * Add a comment about OpenAI usage costs. * More logs for debugging * Log last crawled site * naive attempt to prevent duplicate script injection * Do context deduplication inside the ContentScriptCollector * Report the current sites/min rate * Fix unit tests * lint fix * Add collector-specific extra timeouts and give Autoconsent more time to finish * Wait for scrape job to finish before waiting for optOut * Lint fix * More timeouts for CPM collector: some rules take a long time to finish. * Prevent unnecessary OpenAI calls
1 parent a04d882 commit 52b9543

File tree

15 files changed

+948
-384
lines changed

15 files changed

+948
-384
lines changed

collectors/BaseCollector.js

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
class BaseCollector {
22

3+
/**
4+
* Override this to increase the total crawl time when this collector is enabled.
5+
*/
6+
collectorExtraTimeMs = 0;
7+
38
id() {
49
return 'base';
510
}
611

712
/**
813
* Called before the crawl begins. Can be async, can throw errors.
9-
*
10-
* @param {CollectorInitOptions} options
14+
*
15+
* @param {CollectorInitOptions} options
1116
*/
1217
// eslint-disable-next-line @typescript-eslint/no-unused-vars
1318
init(options) {

collectors/ContentScriptCollector.js

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
const BaseCollector = require('./BaseCollector');
22

33
const ISOLATED_WORLD_PREFIX = 'iw_for_';
4+
const ISOLATED_WORLD_SEPARATOR = '_frameId_';
45

56
/**
67
* @param {String|Error} e
@@ -60,8 +61,15 @@ class ContentScriptCollector extends BaseCollector {
6061
session.on('Runtime.executionContextCreated', async ({context}) => {
6162
// new isolated world for our content script
6263
if (context.auxData.type === 'isolated' && context.name.startsWith(this.iwPrefix)) {
63-
const pageWorldUniqueId = context.name.slice(this.iwPrefix.length);
64-
this.log(`isolated world created ${context.uniqueId} for ${pageWorldUniqueId}`);
64+
// Chromium will create a new isolated context for each frame in the page, even the ones we already asked for.
65+
// We need to filter those out and ignore.
66+
const pageWorldUniqueId = context.name.slice(this.iwPrefix.length, context.name.indexOf(ISOLATED_WORLD_SEPARATOR));
67+
const intendedFrameId = context.name.slice(context.name.indexOf(ISOLATED_WORLD_SEPARATOR) + ISOLATED_WORLD_SEPARATOR.length);
68+
if (intendedFrameId !== context.auxData.frameId) {
69+
this.log(`Skipping isolated context for fId ${context.auxData.frameId} (waiting for fId ${intendedFrameId})`);
70+
return;
71+
}
72+
this.log(`isolated world created ${context.uniqueId} for cId ${pageWorldUniqueId}: fId ${context.auxData.frameId}`);
6573
this.isolated2pageworld.set(context.uniqueId, pageWorldUniqueId);
6674
this.cdpSessions.set(context.uniqueId, session);
6775
await this.onIsolatedWorldCreated(session, context);
@@ -73,12 +81,12 @@ class ContentScriptCollector extends BaseCollector {
7381
return;
7482
}
7583

76-
this.log(`creating isolated world for ${context.uniqueId}`);
84+
this.log(`creating isolated world for cId ${context.uniqueId} fId ${context.auxData.frameId}`);
7785
// request an isolated world for this frame
7886
try {
7987
await session.send('Page.createIsolatedWorld', {
8088
frameId: context.auxData.frameId,
81-
worldName: `${this.iwPrefix}${context.uniqueId}`,
89+
worldName: `${this.iwPrefix}${context.uniqueId}${ISOLATED_WORLD_SEPARATOR}${context.auxData.frameId}`,
8290
});
8391
} catch (e) {
8492
if (!this.isIgnoredCdpError(e)) {

collectors/CookiePopups/scrapeScript.js

Lines changed: 151 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,29 @@
1-
/* global window, document, HTMLElement, Node, NodeFilter, location */
1+
/* global window, document, HTMLElement, Node, NodeFilter, location, NamedNodeMap, DOMTokenList, DOMException */
2+
3+
const BUTTON_LIKE_ELEMENT_SELECTOR = 'button, input[type="button"], input[type="submit"], a, [role="button"], [class*="button"]';
4+
const LIMIT_TEXT_LENGTH = 150000;
5+
const ELEMENT_TAGS_TO_SKIP = [
6+
'SCRIPT',
7+
'STYLE',
8+
'NOSCRIPT',
9+
'TEMPLATE',
10+
'META',
11+
'LINK',
12+
'SVG',
13+
'CANVAS',
14+
'IFRAME',
15+
'FRAME',
16+
'FRAMESET',
17+
'NOFRAMES',
18+
'NOEMBED',
19+
'AUDIO',
20+
'VIDEO',
21+
'SOURCE',
22+
'TRACK',
23+
'PICTURE',
24+
'IMG',
25+
'MAP',
26+
];
227

328
/**
429
* @param {HTMLElement} node
@@ -89,12 +114,62 @@ function getPopupLikeElements() {
89114
return excludeContainers(found);
90115
}
91116

117+
function getDocumentText() {
118+
/**
119+
* @param {Node} root
120+
*/
121+
function collectShadowDOMText(root) {
122+
const walker = document.createTreeWalker(
123+
root,
124+
NodeFilter.SHOW_ELEMENT,
125+
{
126+
/**
127+
* @param {Node} node
128+
*/
129+
acceptNode(node) {
130+
const element = /** @type {HTMLElement} */ (node);
131+
// Accept elements with shadow roots for special handling
132+
if (element.shadowRoot) {
133+
return NodeFilter.FILTER_ACCEPT;
134+
}
135+
// Skip other elements but continue traversing their children
136+
return NodeFilter.FILTER_SKIP;
137+
}
138+
}
139+
);
140+
141+
let result = '';
142+
let node;
143+
while ((node = walker.nextNode())) {
144+
const element = /** @type {HTMLElement} */ (node);
145+
let shadowText = '';
146+
for (const child of element.shadowRoot.children) {
147+
if (child instanceof HTMLElement && !ELEMENT_TAGS_TO_SKIP.includes(child.tagName)) {
148+
shadowText += ' ' + child.innerText;
149+
}
150+
if (child.shadowRoot) {
151+
shadowText += ' ' + collectShadowDOMText(child);
152+
}
153+
}
154+
if (shadowText.trim()) {
155+
result += ' ' + shadowText.trim();
156+
}
157+
}
158+
159+
return result;
160+
}
161+
162+
const visibleText = (document.body ?? document.documentElement).innerText;
163+
const shadowText = collectShadowDOMText(document.documentElement);
164+
return `${visibleText} ${shadowText}`.trim();
165+
}
166+
92167
/**
93168
* @param {HTMLElement} el
94169
* @returns {HTMLElement[]}
95170
*/
96-
function getButtons(el) {
97-
return Array.from(el.querySelectorAll('button, input[type="button"], input[type="submit"], a, [role="button"], [class*="button"]'));
171+
function getButtonLikeElements(el) {
172+
return Array.from(el.querySelectorAll(BUTTON_LIKE_ELEMENT_SELECTOR));
98173
}
99174

100175
/**
@@ -103,7 +178,7 @@ function getButtons(el) {
103178
* @returns {string}
104179
*/
105180
function insecureEscapeSelectorPart(selector) {
106-
return selector.replace(/[.*+?^${}()|[\]\\"]/g, '\\$&');
181+
return selector.replace(/[ .*+?^${}()|[\]\\"]/g, '\\$&');
107182
}
108183

109184
/**
@@ -142,22 +217,23 @@ function getSelector(el, specificity) {
142217
}
143218

144219
if (specificity.ids) {
145-
if (element.id) {
146-
localSelector += `#${insecureEscapeSelectorPart(element.id)}`;
220+
// use getAttribute() instead of element.id to protect against DOM clobbering
221+
if (element.getAttribute('id')) {
222+
localSelector += `#${insecureEscapeSelectorPart(element.getAttribute('id'))}`;
147223
} else if (!element.hasAttribute('id')) { // do not add it for id attribute without a value
148224
localSelector += `:not([id])`;
149225
}
150226
}
151227

152-
if (specificity.dataAttributes) {
228+
if (specificity.dataAttributes && element.attributes instanceof NamedNodeMap) {
153229
const dataAttributes = Array.from(element.attributes).filter(a => a.name.startsWith('data-'));
154230
dataAttributes.forEach(a => {
155231
const escapedValue = insecureEscapeSelectorPart(a.value);
156232
localSelector += `[${a.name}="${escapedValue}"]`;
157233
});
158234
}
159235

160-
if (specificity.classes) {
236+
if (specificity.classes && element.classList instanceof DOMTokenList) {
161237
const classes = Array.from(element.classList);
162238
if (classes.length > 0) {
163239
localSelector += `.${classes.map(c => insecureEscapeSelectorPart(c)).join('.')}`;
@@ -192,37 +268,52 @@ function getUniqueSelector(el) {
192268
};
193269
let selector = getSelector(el, specificity);
194270

195-
// verify that the selector is unique
196-
if (document.querySelectorAll(selector).length > 1) {
197-
specificity.dataAttributes = true;
198-
selector = getSelector(el, specificity);
199-
}
271+
try {
272+
// verify that the selector is unique
273+
if (document.querySelectorAll(selector).length > 1) {
274+
specificity.dataAttributes = true;
275+
selector = getSelector(el, specificity);
276+
}
200277

201-
if (document.querySelectorAll(selector).length > 1) {
202-
specificity.classes = true;
203-
selector = getSelector(el, specificity);
204-
}
278+
if (document.querySelectorAll(selector).length > 1) {
279+
specificity.classes = true;
280+
selector = getSelector(el, specificity);
281+
}
205282

206-
if (document.querySelectorAll(selector).length > 1) {
207-
specificity.absoluteOrder = true;
208-
selector = getSelector(el, specificity);
283+
if (document.querySelectorAll(selector).length > 1) {
284+
specificity.absoluteOrder = true;
285+
selector = getSelector(el, specificity);
286+
}
287+
} catch (e) {
288+
console.error(`Error getting unique selector for`, el, e);
289+
if (e instanceof DOMException && e.message.includes('is not a valid selector')) {
290+
return 'cookiepopups-collector-selector-error';
291+
}
209292
}
210293

211294
return selector;
212295
}
213296

214297
/**
215-
* @returns {import('../CookiePopupsCollector').ScrapeScriptResult}
298+
* Serialize all actionable buttons on the page
299+
* @param {HTMLElement} el
300+
* @returns {import('../CookiePopupsCollector').ButtonData[]}
216301
*/
217-
function collectPotentialPopups() {
218-
const isFramed = window.top !== window || location.ancestorOrigins?.length > 0;
219-
// do not inspect frames that are more than one level deep
220-
if (isFramed && window.parent && window.parent !== window.top) {
221-
return {
222-
potentialPopups: [],
223-
};
224-
}
302+
function getButtonData(el) {
303+
const actionableButtons = excludeContainers(getButtonLikeElements(el))
304+
.filter(b => isVisible(b) && !isDisabled(b));
225305

306+
return actionableButtons.map(b => ({
307+
text: b.innerText,
308+
selector: getUniqueSelector(b),
309+
}));
310+
}
311+
312+
/**
313+
* @param {boolean} isFramed
314+
* @returns {import('../CookiePopupsCollector').PopupData[]}
315+
*/
316+
function collectPotentialPopups(isFramed) {
226317
let elements = [];
227318
if (!isFramed) {
228319
elements = getPopupLikeElements();
@@ -234,27 +325,48 @@ function collectPotentialPopups() {
234325
}
235326
}
236327

328+
/**
329+
* @type {import('../CookiePopupsCollector').PopupData[]}
330+
*/
237331
const potentialPopups = [];
238332

239333
// for each potential popup, get the buttons
240334
for (const el of elements) {
241-
const buttons = excludeContainers(getButtons(el))
242-
.filter(b => isVisible(b) && !isDisabled(b));
243335
if (el.innerText) {
244336
potentialPopups.push({
245337
text: el.innerText,
246338
selector: getUniqueSelector(el),
247-
buttons: buttons.map(b => ({
248-
text: b.innerText,
249-
selector: getUniqueSelector(b),
250-
})),
251-
isTop: !isFramed,
252-
origin: window.location.origin,
339+
buttons: getButtonData(el),
253340
});
254341
}
255342
}
256343

257-
return { potentialPopups };
344+
return potentialPopups;
345+
}
346+
347+
/**
348+
* @returns {import('../CookiePopupsCollector').ScrapeScriptResult}
349+
*/
350+
function scrapePage() {
351+
const isFramed = window.top !== window || location.ancestorOrigins?.length > 0;
352+
// do not inspect frames that are more than one level deep
353+
if (isFramed && window.parent && window.parent !== window.top) {
354+
return {
355+
isTop: !isFramed,
356+
origin: window.location.origin,
357+
buttons: [],
358+
cleanedText: '',
359+
potentialPopups: [],
360+
};
361+
}
362+
363+
return {
364+
isTop: !isFramed,
365+
origin: window.location.origin,
366+
buttons: getButtonData(document.documentElement),
367+
cleanedText: getDocumentText().slice(0, LIMIT_TEXT_LENGTH),
368+
potentialPopups: collectPotentialPopups(isFramed),
369+
};
258370
}
259371

260-
collectPotentialPopups();
372+
scrapePage();

0 commit comments

Comments
 (0)