Skip to content

Commit 653ae51

Browse files
authored
Mark PR notes for manual review (#155)
* Ignore lint PR in git blame * Mark notes that require a manual review * add a note * Require review when updating manually edited rules * Do not flag HTTP->HTTPS protocol changes * do not trigger review on www. redirects * Tweak link formatting * Use the new _metadata key from autoconsent * rename reviewUpdates -> manuallyReviewUpdates * Lint fix * Use both initial and final urls for matching sites, and use initial url for referring to new rules * Lint fix
1 parent fc856d1 commit 653ae51

File tree

4 files changed

+89
-29
lines changed

4 files changed

+89
-29
lines changed

.git-blame-ignore-revs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# .git-blame-ignore-revs
2+
# Switched to a shared DDG ESLint config and introduced Prettier: https://github.com/duckduckgo/tracker-radar-collector/pull/158
3+
fc856d1fa643b409768d62e4f819fb915cf050ad

post-processing/generate-autoconsent-rules/generation.js

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,10 @@ function generateAutoconsentRule(region, url, frame, button) {
7272
const ruleName = `auto_${region}_${topDomain}_${Math.random().toString(36).substring(2, 5)}`;
7373
return {
7474
name: ruleName,
75-
vendorUrl: url,
7675
cosmetic: false,
76+
_metadata: {
77+
vendorUrl: url,
78+
},
7779
runContext: {
7880
main: frame.isTop,
7981
frame: !frame.isTop,
@@ -99,7 +101,7 @@ function generateAutoconsentRule(region, url, frame, button) {
99101
* @param {object} context - The context.
100102
* @param {import('./types').AutoConsentCMPRule} context.newRule
101103
* @param {import('./types').AutoConsentCMPRule[]} context.existingRulesWithSameRegion
102-
* @param {string} context.url
104+
* @param {string} context.initialUrl
103105
* @param {string} context.region
104106
* @param {import('./types').AutoConsentCMPRule[]} context.matchingRules
105107
* @param {import('./types').AutoConsentCMPRule[]} context.rulesToOverride
@@ -109,16 +111,17 @@ function generateAutoconsentRule(region, url, frame, button) {
109111
function overrideExistingRegionRules({
110112
newRule,
111113
existingRulesWithSameRegion,
112-
url,
114+
initialUrl,
113115
region,
114116
matchingRules,
115117
rulesToOverride,
116118
newRules,
117119
reviewNotes,
118120
}) {
119121
if (existingRulesWithSameRegion.length > 1) {
120-
console.warn('Multiple existing rules with the same region found for', url, region);
122+
console.warn('Multiple existing rules with the same region found for', initialUrl, region);
121123
reviewNotes.push({
124+
needsReview: true,
122125
note: 'Multiple existing rules with the same region found, consider removing all but one',
123126
ruleNames: existingRulesWithSameRegion.map((rule) => rule.name),
124127
existingRules: matchingRules.map((rule) => rule.name),
@@ -129,8 +132,9 @@ function overrideExistingRegionRules({
129132
// find an existing rule that we haven't overridden yet
130133
const ruleToOverride = existingRulesWithSameRegion.find((rule) => !rulesToOverride.some((r) => r.name === rule.name));
131134
if (!ruleToOverride) {
132-
console.warn('Already overridden all existing rules for', url, region, 'creating a new one');
135+
console.warn('Already overridden all existing rules for', initialUrl, region, 'creating a new one');
133136
reviewNotes.push({
137+
needsReview: true,
134138
note: 'Already overridden all existing rules, creating a new one',
135139
ruleName: newRule.name,
136140
existingRules: existingRulesWithSameRegion.map((rule) => rule.name),
@@ -141,13 +145,25 @@ function overrideExistingRegionRules({
141145
rulesToOverride.push({
142146
...newRule,
143147
name: ruleToOverride.name, // keep the existing rule name
148+
_metadata: ruleToOverride._metadata, // keep the existing metadata
144149
});
145-
reviewNotes.push({
146-
note: 'Overriding existing rule',
147-
ruleName: ruleToOverride.name,
148-
existingRules: matchingRules.map((rule) => rule.name),
149-
region,
150-
});
150+
if (ruleToOverride._metadata?.manuallyReviewUpdates) {
151+
reviewNotes.push({
152+
needsReview: true,
153+
note: 'Updated rule that has been manually edited before',
154+
ruleName: ruleToOverride.name,
155+
existingRules: matchingRules.map((rule) => rule.name),
156+
region,
157+
});
158+
} else {
159+
reviewNotes.push({
160+
needsReview: false,
161+
note: 'Overriding existing rule',
162+
ruleName: ruleToOverride.name,
163+
existingRules: matchingRules.map((rule) => rule.name),
164+
region,
165+
});
166+
}
151167
}
152168
}
153169

@@ -156,27 +172,29 @@ function overrideExistingRegionRules({
156172
* override an existing one, or do nothing.
157173
* @param {object} context - The context for processing the button.
158174
* @param {string} context.region
159-
* @param {string} context.url
175+
* @param {string} context.initialUrl
176+
* @param {string} context.finalUrl
160177
* @param {import('./types').ScrapeScriptResult} context.frame
161178
* @param {import('./types').ButtonData} context.button
162179
* @param {import('./types').AutoConsentCMPRule[]} context.matchingRules
163180
* @param {import('./types').AutoConsentCMPRule[]} context.newRules
164181
* @param {import('./types').AutoConsentCMPRule[]} context.rulesToOverride
165182
* @param {import('./types').ReviewNote[]} context.reviewNotes
166183
*/
167-
function processRejectButton({ region, url, frame, button, matchingRules, newRules, rulesToOverride, reviewNotes }) {
184+
function processRejectButton({ region, initialUrl, frame, button, matchingRules, newRules, rulesToOverride, reviewNotes }) {
168185
let newRule;
169186
try {
170-
newRule = generateAutoconsentRule(region, url, frame, button);
187+
newRule = generateAutoconsentRule(region, initialUrl, frame, button);
171188
} catch (err) {
172-
console.error(`Error generating rule for ${url} (${frame.origin})`, err);
189+
console.error(`Error generating rule for ${initialUrl} (${frame.origin})`, err);
173190
return;
174191
}
175192

176193
if (matchingRules.length === 0) {
177194
// add the first rule for this site
178195
newRules.push(newRule);
179196
reviewNotes.push({
197+
needsReview: false,
180198
note: 'New rule added',
181199
ruleName: newRule.name,
182200
});
@@ -192,7 +210,7 @@ function processRejectButton({ region, url, frame, button, matchingRules, newRul
192210
overrideExistingRegionRules({
193211
newRule,
194212
existingRulesWithSameRegion,
195-
url,
213+
initialUrl,
196214
region,
197215
matchingRules,
198216
rulesToOverride,
@@ -203,6 +221,7 @@ function processRejectButton({ region, url, frame, button, matchingRules, newRul
203221
// assume it's a new region-specific popup, but flag it for review
204222
newRules.push(newRule);
205223
reviewNotes.push({
224+
needsReview: false,
206225
note: 'New region-specific popup',
207226
ruleName: newRule.name,
208227
existingRules: matchingRules.map((rule) => rule.name),
@@ -214,12 +233,13 @@ function processRejectButton({ region, url, frame, button, matchingRules, newRul
214233
/**
215234
* Analyze existing rules and generate new rules when necessary.
216235
* @param {string} region
217-
* @param {string} url - The URL being processed.
236+
* @param {string} initialUrl - The URL of the initial page.
237+
* @param {string} finalUrl - The URL of the final page (after load redirects).
218238
* @param {import('./types').CookiePopupsCollectorResult} collectorResult - The collector result.
219239
* @param {import('./types').AutoConsentCMPRule[]} matchingRules - Array of existing rules.
220240
* @returns {{newRules: import('./types').AutoConsentCMPRule[], rulesToOverride: import('./types').AutoConsentCMPRule[], reviewNotes: import('./types').ReviewNote[], keptCount: number}}
221241
*/
222-
function generateRulesForSite(region, url, collectorResult, matchingRules) {
242+
function generateRulesForSite(region, initialUrl, finalUrl, collectorResult, matchingRules) {
223243
/** @type {import('./types').AutoConsentCMPRule[]} */
224244
const newRules = [];
225245
/** @type {import('./types').AutoConsentCMPRule[]} */
@@ -230,10 +250,11 @@ function generateRulesForSite(region, url, collectorResult, matchingRules) {
230250

231251
const llmConfirmedPopups = collectorResult.scrapedFrames.flatMap((frame) => frame.potentialPopups).filter((popup) => popup.llmMatch);
232252
if (llmConfirmedPopups.length > 1 || llmConfirmedPopups[0].rejectButtons.length > 1) {
233-
console.warn('Multiple cookie popups or reject buttons found in', url);
253+
console.warn('Multiple cookie popups or reject buttons found in', initialUrl);
234254
reviewNotes.push({
255+
needsReview: false, // it's not a problem by itself, unless this leads to multiple _rules_ generated, but we check that separately.
235256
note: 'Multiple popups or reject buttons found',
236-
url,
257+
url: initialUrl,
237258
region,
238259
});
239260
}
@@ -247,13 +268,14 @@ function generateRulesForSite(region, url, collectorResult, matchingRules) {
247268
keptCount++;
248269
continue;
249270
}
250-
processRejectButton({ region, url, frame, button, matchingRules, newRules, rulesToOverride, reviewNotes });
271+
processRejectButton({ region, initialUrl, finalUrl, frame, button, matchingRules, newRules, rulesToOverride, reviewNotes });
251272
}
252273
}
253274
}
254275

255276
if (newRules.length > 1) {
256277
reviewNotes.push({
278+
needsReview: true,
257279
note: 'Multiple new rules generated',
258280
ruleNames: newRules.map((rule) => rule.name),
259281
});

post-processing/generate-autoconsent-rules/main.js

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,27 @@ generateCMPTests('${ruleName}', ${JSON.stringify(testUrls)}, {testOptIn: false,
1717

1818
/**
1919
* Find existing rules that match a given URL/domain.
20-
* @param {string} url - The URL to match against.
20+
* @param {string} initialUrl - The URL of the initial page.
21+
* @param {string} finalUrl - The URL of the final page (after load redirects).
2122
* @param {import('./types').CookiePopupsCollectorResult} collectorResult - Array of processed cookie popups.
2223
* @param {import('./types').AutoConsentCMPRule[]} existingRules - Array of existing rules.
2324
* @returns {import('./types').AutoConsentCMPRule[]} Array of matching existing rules.
2425
*/
25-
function findMatchingExistingRules(url, collectorResult, existingRules) {
26+
function findMatchingExistingRules(initialUrl, finalUrl, collectorResult, existingRules) {
2627
return existingRules.filter((rule) => {
2728
if (rule.runContext && rule.runContext.urlPattern) {
2829
try {
2930
const pattern = new RegExp(rule.runContext.urlPattern);
3031
// rule is a match iff:
3132
// urlPattern matches the crawled site
3233
return (
33-
pattern.test(url) ||
34+
pattern.test(initialUrl) ||
35+
pattern.test(finalUrl) ||
3436
// OR vendorUrl matches the crawled site (this is more like a heuristic as vendorUrl is not used by Autoconsent)
35-
rule.vendorUrl === url ||
37+
rule.vendorUrl === initialUrl ||
38+
rule.vendorUrl === finalUrl ||
39+
rule._metadata?.vendorUrl === initialUrl ||
40+
rule._metadata?.vendorUrl === finalUrl ||
3641
// OR the rule matched a frame
3742
(rule.runContext?.frame && collectorResult.scrapedFrames.some((frame) => pattern.test(frame.origin)))
3843
);
@@ -120,7 +125,8 @@ async function writeRuleFiles({ rule, url, rulesDir, testDir, autoconsentDir, re
120125
* Process cookie popups for a single site and generate/update rules.
121126
* @param {import('./types').GlobalParams} globalParams
122127
* @param {{
123-
* finalUrl: string, // URL of the site
128+
* finalUrl: string, // final URL of the site
129+
* initialUrl: string, // initial URL of the site
124130
* collectorResult: import('./types').CookiePopupsCollectorResult,
125131
* existingRules: import('./types').AutoConsentCMPRule[], // existing Autoconsent rules
126132
* }} params
@@ -132,7 +138,7 @@ async function writeRuleFiles({ rule, url, rulesDir, testDir, autoconsentDir, re
132138
* updatedExistingRules: import('./types').AutoConsentCMPRule[],
133139
* }>}
134140
*/
135-
async function processCookiePopupsForSite(globalParams, { finalUrl, collectorResult, existingRules }) {
141+
async function processCookiePopupsForSite(globalParams, { finalUrl, initialUrl, collectorResult, existingRules }) {
136142
const { rulesDir, testDir, autoconsentDir, region } = globalParams;
137143

138144
/** @type {import('./types').AutoconsentManifestFileData[]} */
@@ -149,11 +155,37 @@ async function processCookiePopupsForSite(globalParams, { finalUrl, collectorRes
149155
return { newRuleFiles, updatedRuleFiles, keptCount: 0, reviewNotes: [], updatedExistingRules };
150156
}
151157

152-
const matchingRules = findMatchingExistingRules(finalUrl, collectorResult, existingRules);
158+
const matchingRules = findMatchingExistingRules(initialUrl, finalUrl, collectorResult, existingRules);
153159
console.log(
154160
`Detected ${llmConfirmedPopups.length} unhandled cookie popup(s) on ${finalUrl} (matched ${matchingRules.length} existing rules)`,
155161
);
156-
const { newRules, rulesToOverride, reviewNotes, keptCount } = generateRulesForSite(region, finalUrl, collectorResult, matchingRules);
162+
const { newRules, rulesToOverride, reviewNotes, keptCount } = generateRulesForSite(
163+
region,
164+
initialUrl,
165+
finalUrl,
166+
collectorResult,
167+
matchingRules,
168+
);
169+
try {
170+
const initialHost = new URL(initialUrl).host;
171+
const finalHost = new URL(finalUrl).host;
172+
if (
173+
initialHost !== finalHost &&
174+
// ignore redirects from www. to non-www and vice versa
175+
initialHost !== 'www.' + finalHost &&
176+
finalHost !== 'www.' + initialHost
177+
) {
178+
reviewNotes.push({
179+
note: `Site changed host: \`${initialHost}\` -> \`${finalHost}\``,
180+
needsReview: true,
181+
});
182+
}
183+
} catch (err) {
184+
reviewNotes.push({
185+
note: `Failed to parse URL for \`${initialUrl}\` or \`${finalUrl}\`: \`${err.message}\``,
186+
needsReview: true,
187+
});
188+
}
157189

158190
updatedExistingRules.push(...newRules);
159191
rulesToOverride.forEach((rule) => {
@@ -294,6 +326,7 @@ async function processFiles(globalParams, existingRules) {
294326
totalUnhandled++;
295327
const result = await processCookiePopupsForSite(globalParams, {
296328
finalUrl: jsonData.finalUrl,
329+
initialUrl: jsonData.initialUrl,
297330
collectorResult,
298331
existingRules: existingRulesAfter,
299332
});

post-processing/generate-autoconsent-rules/types.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
* data: {
44
* cookiepopups: import('../../collectors/CookiePopupsCollector').CookiePopupsCollectorResult;
55
* };
6+
* initialUrl: string;
67
* finalUrl: string;
78
* }} CrawlData
89
*/
@@ -19,6 +20,7 @@
1920
/**
2021
* @typedef {{
2122
* note: string;
23+
* needsReview: boolean;
2224
* url?: string;
2325
* region?: string;
2426
* ruleName?: string;

0 commit comments

Comments
 (0)