Skip to content

Commit 855d483

Browse files
authored
Tweaks in cookie popup scraping (#152)
* temporatily do not look at unsuccessful but detected cases * Use a built-in escape function * Add a note about auto-generated rules * add data-testid * do not scrape buttons with empty text * verify self-test in generated rules * Lint fix * prettify the output
1 parent 52b9543 commit 855d483

File tree

3 files changed

+37
-27
lines changed

3 files changed

+37
-27
lines changed

collectors/CookiePopups/scrapeScript.js

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* global window, document, HTMLElement, Node, NodeFilter, location, NamedNodeMap, DOMTokenList, DOMException */
1+
/* global window, document, HTMLElement, Node, NodeFilter, location, NamedNodeMap, DOMTokenList, DOMException, CSS */
22

33
const BUTTON_LIKE_ELEMENT_SELECTOR = 'button, input[type="button"], input[type="submit"], a, [role="button"], [class*="button"]';
44
const LIMIT_TEXT_LENGTH = 150000;
@@ -172,19 +172,10 @@ function getButtonLikeElements(el) {
172172
return Array.from(el.querySelectorAll(BUTTON_LIKE_ELEMENT_SELECTOR));
173173
}
174174

175-
/**
176-
* Naive selector escaping. Use with caution.
177-
* @param {string} selector
178-
* @returns {string}
179-
*/
180-
function insecureEscapeSelectorPart(selector) {
181-
return selector.replace(/[ .*+?^${}()|[\]\\"]/g, '\\$&');
182-
}
183-
184175
/**
185176
* Get the selector for an element
186177
* @param {HTMLElement} el - The element to get the selector for
187-
* @param {{ order?: boolean, ids?: boolean, dataAttributes?: boolean, classes?: boolean, absoluteOrder?: boolean }} specificity - details to add to the selector
178+
* @param {{ order?: boolean, ids?: boolean, dataAttributes?: boolean, classes?: boolean, absoluteOrder?: boolean, testid?: boolean }} specificity - details to add to the selector
188179
* @returns {string} The selector for the element
189180
*/
190181
function getSelector(el, specificity) {
@@ -216,10 +207,10 @@ function getSelector(el, specificity) {
216207
}
217208
}
218209

219-
if (specificity.ids) {
210+
if (specificity.ids && tagName !== 'body') {
220211
// use getAttribute() instead of element.id to protect against DOM clobbering
221212
if (element.getAttribute('id')) {
222-
localSelector += `#${insecureEscapeSelectorPart(element.getAttribute('id'))}`;
213+
localSelector += `#${CSS.escape(element.getAttribute('id'))}`;
223214
} else if (!element.hasAttribute('id')) { // do not add it for id attribute without a value
224215
localSelector += `:not([id])`;
225216
}
@@ -228,15 +219,21 @@ function getSelector(el, specificity) {
228219
if (specificity.dataAttributes && element.attributes instanceof NamedNodeMap) {
229220
const dataAttributes = Array.from(element.attributes).filter(a => a.name.startsWith('data-'));
230221
dataAttributes.forEach(a => {
231-
const escapedValue = insecureEscapeSelectorPart(a.value);
222+
const escapedValue = CSS.escape(a.value);
232223
localSelector += `[${a.name}="${escapedValue}"]`;
233224
});
225+
} else if (specificity.testid) {
226+
// data-testid is a common attribute used by testing frameworks to identify elements
227+
const testid = element.getAttribute('data-testid');
228+
if (testid) {
229+
localSelector += `[data-testid="${CSS.escape(testid)}"]`;
230+
}
234231
}
235232

236233
if (specificity.classes && element.classList instanceof DOMTokenList) {
237234
const classes = Array.from(element.classList);
238235
if (classes.length > 0) {
239-
localSelector += `.${classes.map(c => insecureEscapeSelectorPart(c)).join('.')}`;
236+
localSelector += `.${classes.map(c => CSS.escape(c)).join('.')}`;
240237
}
241238
}
242239

@@ -257,19 +254,30 @@ function getUniqueSelector(el) {
257254
// We need to strike a balance here. Selector has to be unique, but we want to avoid auto-generated (randomized) identifiers to make the it resilient. Assumptions:
258255
// - Classes are the most common thing to randomize, so we use them as the last resort.
259256
// - The general shape of the DOM doesn't change that much, so order is always preferred
260-
// - data attributes can contain anything, so don't add them by default
261-
// - IDs are often used on the popup containers, so are very useful. And they are definitely less commonly randomized than classes, but it's still possible, so we may want to change this logic later if we see randomized ids in the results.
257+
// - data attributes can contain anything, so don't add them by default (except for data-testid, which is usually fine)
258+
// - IDs are often used on the popup containers, so are very useful. Sometimes they are randomized too, but it's not as common.
262259
const specificity = {
260+
testid: true,
261+
ids: true,
263262
order: true,
264-
ids: true, // consider disabling this by default for auto-generated IDs
265263
dataAttributes: false,
266264
classes: false,
267265
absoluteOrder: false,
268266
};
269267
let selector = getSelector(el, specificity);
270268

269+
// increase specificity until the selector is unique
271270
try {
272-
// verify that the selector is unique
271+
if (document.querySelectorAll(selector).length > 1) {
272+
specificity.order = true;
273+
selector = getSelector(el, specificity);
274+
}
275+
276+
if (document.querySelectorAll(selector).length > 1) {
277+
specificity.ids = true;
278+
selector = getSelector(el, specificity);
279+
}
280+
273281
if (document.querySelectorAll(selector).length > 1) {
274282
specificity.dataAttributes = true;
275283
selector = getSelector(el, specificity);
@@ -301,7 +309,7 @@ function getUniqueSelector(el) {
301309
*/
302310
function getButtonData(el) {
303311
const actionableButtons = excludeContainers(getButtonLikeElements(el))
304-
.filter(b => isVisible(b) && !isDisabled(b));
312+
.filter(b => isVisible(b) && !isDisabled(b) && b.innerText.trim());
305313

306314
return actionableButtons.map(b => ({
307315
text: b.innerText,

post-processing/detect-cookie-popups.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -257,10 +257,10 @@ async function main() {
257257
});
258258

259259
console.log('Done');
260-
console.log(`Sites with LLM detected text (full text page): ${sitesWithPopupsLlm} (${sitesWithPopupsLlm / pages.length * 100}%)`);
261-
console.log(`Sites with regex detected popups (full text page): ${sitesWithPopupsRegex} (${sitesWithPopupsRegex / pages.length * 100}%)`);
262-
console.log(`Sites with LLM detected popups (popup elements): ${sitesWithDetectedPopupLlm} (${sitesWithDetectedPopupLlm / pages.length * 100}%)`);
263-
console.log(`Sites with regex detected popups (popup elements): ${sitesWithDetectedPopupRegex} (${sitesWithDetectedPopupRegex / pages.length * 100}%)`);
260+
console.log(`Sites with LLM detected text (full text page): ${sitesWithPopupsLlm} (${(sitesWithPopupsLlm / pages.length * 100).toFixed(1)}%)`);
261+
console.log(`Sites with regex detected popups (full text page): ${sitesWithPopupsRegex} (${(sitesWithPopupsRegex / pages.length * 100).toFixed(1)}%)`);
262+
console.log(`Sites with LLM detected popups (popup elements): ${sitesWithDetectedPopupLlm} (${(sitesWithDetectedPopupLlm / pages.length * 100).toFixed(1)}%)`);
263+
console.log(`Sites with regex detected popups (popup elements): ${sitesWithDetectedPopupRegex} (${(sitesWithDetectedPopupRegex / pages.length * 100).toFixed(1)}%)`);
264264
console.log(`Reject button texts (${rejectButtonTexts.size}) saved in ${rejectButtonTextsFile}`);
265265
console.log(`Other button texts (${otherButtonTexts.size}) saved in ${otherButtonTextsFile}`);
266266
}

post-processing/generate-autoconsent-rules/main.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ const { generateRulesForSite } = require('./generation.js');
1111
*/
1212
function generateTestFile(ruleName, testUrls, regions) {
1313
return `import generateCMPTests from "../../playwright/runner";
14-
generateCMPTests('${ruleName}', ${JSON.stringify(testUrls)}, {testOptIn: false, testSelfTest: false, onlyRegions: ${JSON.stringify(regions)}});
14+
generateCMPTests('${ruleName}', ${JSON.stringify(testUrls)}, {testOptIn: false, testSelfTest: true, onlyRegions: ${JSON.stringify(regions)}});
1515
`;
1616
}
1717

@@ -55,8 +55,10 @@ function hasKnownCmp(cmps) {
5555
cmps.some(cmp => cmp &&
5656
cmp.name &&
5757
cmp.name.trim() !== '' &&
58-
cmp.succeeded &&
59-
!cmp.name.trim().startsWith('auto_')) // we may override existing autogenerated rules
58+
// TODO: re-enable this once Autoconsent is more reliable in crawls
59+
// cmp.succeeded &&
60+
// we always verify existing autogenerated rules, and override them unless the selector is exactly the same.
61+
!cmp.name.trim().startsWith('auto_'))
6062
);
6163
}
6264

0 commit comments

Comments
 (0)