Skip to content

Commit 10d8892

Browse files
committed
Deduplicate Moscary scan results
Deduplication is relatively primitive, but it currently works as follows: for every broker, sort the scan results by source (Moscary results, i.e. those whose `source` is `monitor`, come before `onerep` results), then by `updated_at`, newest first. Remove scan results that point to the same URL, keeping the first one in that sort order. Then keep only the first three. Note that for some brokers, different scan results will have the same URL, so we might want to revisit that in the future.
1 parent 9053012 commit 10d8892

File tree

5 files changed

+300
-10
lines changed

5 files changed

+300
-10
lines changed

src/apiMocks/mockData.ts

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ export type RandomScanResultOptions = Partial<{
4444
status: RemovalStatus;
4545
manually_resolved: boolean;
4646
broker_status: DataBrokerRemovalStatus;
47+
data_broker: string;
48+
source: MoscaryData["ScanResult"]["source"];
49+
url: MoscaryData["ScanResult"]["url"];
50+
updated_at: Date;
4751
}>;
4852

4953
/**
@@ -60,7 +64,7 @@ export function createRandomOnerepScanResult(
6064
options.status === "waiting_for_verification"
6165
? faker.number.int({ min: 1, max: 42 })
6266
: undefined;
63-
const url = faker.internet.url();
67+
const url = options.url ?? faker.internet.url();
6468
return {
6569
id: faker.number.int(),
6670
onerep_scan_result_id: faker.number.int(),
@@ -85,10 +89,10 @@ export function createRandomOnerepScanResult(
8589
emails: [faker.internet.exampleEmail()],
8690
relatives: Array.from({ length: 3 }, () => faker.person.fullName()),
8791
link: url,
88-
data_broker: new URL(url).hostname,
92+
data_broker: options.data_broker ?? new URL(url).hostname,
8993
data_broker_id: faker.number.int(),
9094
created_at: options.createdDate ?? faker.date.recent({ days: 2 }),
91-
updated_at: faker.date.recent({ days: 1 }),
95+
updated_at: options.updated_at ?? faker.date.recent({ days: 1 }),
9296
optout_attempts,
9397
last_optout_at:
9498
typeof optout_attempts === "number" && optout_attempts > 0
@@ -116,7 +120,7 @@ export function createRandomMoscaryScanResult(
116120
options.status === "waiting_for_verification"
117121
? faker.number.int({ min: 1, max: 42 })
118122
: undefined;
119-
const url = faker.internet.url();
123+
const url = options.url ?? faker.internet.url();
120124
return {
121125
id: faker.string.uuid(),
122126
scan_id: faker.string.uuid(),
@@ -140,18 +144,20 @@ export function createRandomMoscaryScanResult(
140144
emails: [faker.internet.exampleEmail()],
141145
relatives: Array.from({ length: 3 }, () => faker.person.fullName()),
142146
link: url,
143-
data_broker: new URL(url).hostname,
147+
data_broker: options.data_broker ?? new URL(url).hostname,
144148
data_broker_id: faker.number.int(),
145149
created_at: (
146150
options.createdDate ?? faker.date.recent({ days: 2 })
147151
).toISOString(),
148-
updated_at: faker.date.recent({ days: 1 }).toISOString(),
152+
updated_at:
153+
options.updated_at?.toISOString() ??
154+
faker.date.recent({ days: 1 }).toISOString(),
149155
optout_attempts,
150156
last_optout_at:
151157
typeof optout_attempts === "number" && optout_attempts > 0
152158
? faker.date.recent({ days: 3 }).toISOString()
153159
: undefined,
154-
url: url,
160+
source: options.source ?? faker.helpers.arrayElement(["monitor", "onerep"]),
155161
verification_attempts: 0,
156162
};
157163
}

src/app/functions/server/moscary.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import { logger } from "./logging";
1313
import { ISO8601DateString } from "../../../utils/parse";
1414
import { StateAbbr } from "../../../utils/states";
1515
import { getSubscriberByFxaUid } from "../../../db/tables/subscribers";
16+
import { dedupeScanResults } from "../universal/dedupeScanResults";
1617

1718
export type MoscaryData = Components["schemas"];
1819

@@ -254,7 +255,11 @@ export async function listScans(
254255
>;
255256
}
256257

257-
export async function listScanResults(
258+
// Note: this function is not exported on purpose.
259+
// `getAllScanResults` does some additional filtering to hide scan results
260+
// that are not relevant to the user, and we want that filter to be applied
261+
// consistently throughout the site, i.e. for that function to be used everywhere.
262+
async function listScanResults(
258263
profileId: NonNullable<SubscriberRow["moscary_id"]>,
259264
options: Partial<{
260265
page: number;
@@ -344,14 +349,15 @@ export async function getScanAndResults(
344349
return { scan, results };
345350
}
346351

347-
export async function getAllScanResults(
352+
async function getAllScanResults(
348353
profileId: NonNullable<SubscriberRow["moscary_id"]>,
349354
): Promise<
350355
Paths["/scan-results"]["get"]["responses"]["200"]["content"]["application/json"]["data"]
351356
> {
352-
return fetchAllPages((page: number) =>
357+
const allScanResults = await fetchAllPages((page: number) =>
353358
listScanResults(profileId, { per_page: 100, page: page }),
354359
);
360+
return dedupeScanResults(allScanResults);
355361
}
356362

357363
export async function resolveScanResult(
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
/* This Source Code Form is subject to the terms of the Mozilla Public
2+
* License, v. 2.0. If a copy of the MPL was not distributed with this
3+
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4+
5+
import { it, expect } from "@jest/globals";
6+
import { dedupeScanResults } from "./dedupeScanResults";
7+
import { createRandomMoscaryScanResult } from "../../../apiMocks/mockData";
8+
9+
it("sorts Moscary results before OneRep results", () => {
10+
const scanResultsToDedupe = [
11+
createRandomMoscaryScanResult({
12+
source: "monitor",
13+
data_broker: "broker-to-dedupe",
14+
updated_at: new Date(2022, 8, 23),
15+
}),
16+
createRandomMoscaryScanResult({
17+
source: "onerep",
18+
data_broker: "broker-to-dedupe",
19+
updated_at: new Date(2022, 8, 22),
20+
}),
21+
];
22+
23+
expect(dedupeScanResults(scanResultsToDedupe)).toStrictEqual([
24+
scanResultsToDedupe[0],
25+
scanResultsToDedupe[1],
26+
]);
27+
});
28+
29+
it("preserves OneRep results if no Moscary result for the same broker is available", () => {
30+
const scanResultsToDedupe = [
31+
createRandomMoscaryScanResult({
32+
source: "onerep",
33+
data_broker: "broker-to-preserve",
34+
}),
35+
createRandomMoscaryScanResult({
36+
source: "monitor",
37+
data_broker: "other-data-broker",
38+
}),
39+
];
40+
41+
expect(dedupeScanResults(scanResultsToDedupe)).toStrictEqual([
42+
scanResultsToDedupe[0],
43+
// Theoretically the order of scan results from different brokers
44+
// is not guaranteed, but in practice, it should maintain the
45+
// order from above (famous last words):
46+
scanResultsToDedupe[1],
47+
]);
48+
});
49+
50+
it("sorts by updated_at date", () => {
51+
const scanResultsToDedupe = [
52+
createRandomMoscaryScanResult({
53+
source: "monitor",
54+
data_broker: "broker-to-dedupe",
55+
updated_at: new Date(2022, 8, 23),
56+
}),
57+
createRandomMoscaryScanResult({
58+
source: "monitor",
59+
data_broker: "broker-to-dedupe",
60+
updated_at: new Date(2022, 8, 24),
61+
}),
62+
];
63+
64+
expect(dedupeScanResults(scanResultsToDedupe)).toStrictEqual([
65+
scanResultsToDedupe[1],
66+
scanResultsToDedupe[0],
67+
]);
68+
});
69+
70+
it("only preserves the latest scan result for a single result URL", () => {
71+
const scanResultsToDedupe = [
72+
createRandomMoscaryScanResult({
73+
source: "monitor",
74+
data_broker: "broker-to-dedupe",
75+
updated_at: new Date(2022, 8, 23),
76+
url: "https://example.com",
77+
}),
78+
createRandomMoscaryScanResult({
79+
source: "monitor",
80+
data_broker: "broker-to-dedupe",
81+
updated_at: new Date(2022, 8, 24),
82+
url: "https://example.com",
83+
}),
84+
];
85+
86+
expect(dedupeScanResults(scanResultsToDedupe)).toStrictEqual([
87+
scanResultsToDedupe[1],
88+
]);
89+
});
90+
91+
it("only preserves the 3 latest scan results per broker", () => {
92+
const scanResultsToDedupe = [
93+
createRandomMoscaryScanResult({
94+
source: "monitor",
95+
data_broker: "broker-to-dedupe",
96+
updated_at: new Date(2022, 8, 23),
97+
}),
98+
createRandomMoscaryScanResult({
99+
source: "monitor",
100+
data_broker: "broker-to-dedupe",
101+
updated_at: new Date(2022, 8, 24),
102+
}),
103+
createRandomMoscaryScanResult({
104+
source: "monitor",
105+
data_broker: "broker-to-dedupe",
106+
updated_at: new Date(2022, 8, 25),
107+
}),
108+
createRandomMoscaryScanResult({
109+
source: "monitor",
110+
data_broker: "broker-to-dedupe",
111+
updated_at: new Date(2022, 8, 26),
112+
}),
113+
createRandomMoscaryScanResult({
114+
source: "monitor",
115+
data_broker: "another-broker",
116+
updated_at: new Date(2022, 8, 27),
117+
}),
118+
];
119+
120+
expect(dedupeScanResults(scanResultsToDedupe)).toStrictEqual([
121+
scanResultsToDedupe[3],
122+
scanResultsToDedupe[2],
123+
scanResultsToDedupe[1],
124+
// Theoretically the order of scan results from different brokers
125+
// is not guaranteed, but in practice, it should maintain the
126+
// order from above (famous last words):
127+
scanResultsToDedupe[4],
128+
]);
129+
});
130+
131+
it("handles an empty list of scan results", () => {
132+
expect(dedupeScanResults([])).toStrictEqual([]);
133+
});
134+
135+
it("handles a list with a single scan result", () => {
136+
const singleScanResult = createRandomMoscaryScanResult();
137+
expect(dedupeScanResults([singleScanResult])).toStrictEqual([
138+
singleScanResult,
139+
]);
140+
});
141+
142+
it("prioritizes Moscary over OneRep even when OneRep is newer", () => {
143+
const scanResultsToDedupe = [
144+
createRandomMoscaryScanResult({
145+
source: "onerep",
146+
data_broker: "broker-to-dedupe",
147+
updated_at: new Date(2022, 8, 25), // Newer
148+
}),
149+
createRandomMoscaryScanResult({
150+
source: "monitor",
151+
data_broker: "broker-to-dedupe",
152+
updated_at: new Date(2022, 8, 23), // Older
153+
}),
154+
];
155+
156+
expect(dedupeScanResults(scanResultsToDedupe)).toStrictEqual([
157+
// Moscary result should come first despite being older
158+
scanResultsToDedupe[1],
159+
scanResultsToDedupe[0],
160+
]);
161+
});
162+
163+
it("keeps the most relevant result when URLs match after sorting", () => {
164+
const scanResultsToDedupe = [
165+
createRandomMoscaryScanResult({
166+
source: "onerep",
167+
data_broker: "broker-to-dedupe",
168+
// Newer but OneRep:
169+
updated_at: new Date(2022, 8, 25),
170+
url: "https://example.com",
171+
}),
172+
createRandomMoscaryScanResult({
173+
source: "monitor",
174+
data_broker: "broker-to-dedupe",
175+
// Older but Moscary:
176+
updated_at: new Date(2022, 8, 23),
177+
url: "https://example.com",
178+
}),
179+
];
180+
181+
expect(dedupeScanResults(scanResultsToDedupe)).toStrictEqual([
182+
scanResultsToDedupe[1],
183+
]);
184+
});
185+
186+
it("handles exactly 3 results per broker", () => {
187+
const scanResultsToDedupe = [
188+
createRandomMoscaryScanResult({
189+
source: "monitor",
190+
data_broker: "broker-to-dedupe",
191+
updated_at: new Date(2022, 8, 23),
192+
}),
193+
createRandomMoscaryScanResult({
194+
source: "monitor",
195+
data_broker: "broker-to-dedupe",
196+
updated_at: new Date(2022, 8, 24),
197+
}),
198+
createRandomMoscaryScanResult({
199+
source: "monitor",
200+
data_broker: "broker-to-dedupe",
201+
updated_at: new Date(2022, 8, 25),
202+
}),
203+
];
204+
205+
const result = dedupeScanResults(scanResultsToDedupe);
206+
207+
expect(result).toHaveLength(3);
208+
});
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/* This Source Code Form is subject to the terms of the Mozilla Public
2+
* License, v. 2.0. If a copy of the MPL was not distributed with this
3+
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4+
5+
import { CONST_MAX_SCAN_RESULTS_PER_BROKER } from "../../../constants";
6+
import { parseIso8601Datetime } from "../../../utils/parse";
7+
import { MoscaryData } from "../server/moscary";
8+
9+
/**
 * Reduces a list of scan results to the most relevant ones per data broker.
 *
 * Results are grouped by their `data_broker` value, each group is passed
 * to `getMostRelevantScanResults`, and the per-broker outcomes are
 * concatenated again.
 *
 * Brokers appear in the output in the order in which they are first
 * encountered in `scanResults`.
 *
 * @param scanResults Scan results for any number of data brokers.
 * @returns A new array; `scanResults` itself is not modified.
 */
export function dedupeScanResults(
  scanResults: MoscaryData["ScanResult"][],
): MoscaryData["ScanResult"][] {
  // A `Map` is used rather than a plain object, because broker names come
  // from data we do not control: keys like "constructor" or "__proto__"
  // would resolve to inherited properties on a plain object (making
  // `.push` throw), and integer-like keys would be reordered when
  // enumerating the object's values.
  const scanResultsByBroker = new Map<string, MoscaryData["ScanResult"][]>();
  for (const scanResult of scanResults) {
    const resultsForBroker = scanResultsByBroker.get(scanResult.data_broker);
    if (resultsForBroker === undefined) {
      scanResultsByBroker.set(scanResult.data_broker, [scanResult]);
    } else {
      resultsForBroker.push(scanResult);
    }
  }

  return Array.from(scanResultsByBroker.values()).flatMap(
    (scanResultsForSingleBroker) =>
      getMostRelevantScanResults(scanResultsForSingleBroker),
  );
}
23+
24+
/**
 * Picks the scan results worth keeping for a single data broker.
 *
 * The results are first ordered with `sortByRelevance`, then passed
 * through `removeIrrelevantResults`, and finally cut down to at most
 * `CONST_MAX_SCAN_RESULTS_PER_BROKER` entries.
 *
 * @param scanResultsForSingleBroker Scan results that all belong to the same broker.
 * @returns The leading results after sorting and filtering.
 */
function getMostRelevantScanResults(
  scanResultsForSingleBroker: MoscaryData["ScanResult"][],
): MoscaryData["ScanResult"][] {
  return removeIrrelevantResults(
    sortByRelevance(scanResultsForSingleBroker),
  ).slice(0, CONST_MAX_SCAN_RESULTS_PER_BROKER);
}
36+
37+
/**
 * Returns a sorted copy of the given scan results, most relevant first.
 *
 * A result whose `source` is "monitor" is placed before one whose
 * `source` is "onerep". For every other pairing, the result with the
 * later `updated_at` comes first.
 *
 * @param scanResults The scan results to order; the array is left untouched.
 * @returns A new, sorted array.
 */
function sortByRelevance(
  scanResults: MoscaryData["ScanResult"][],
): MoscaryData["ScanResult"][] {
  const compareRelevance = (
    first: MoscaryData["ScanResult"],
    second: MoscaryData["ScanResult"],
  ): number => {
    const firstSource = first.source;
    const secondSource = second.source;

    if (firstSource === "monitor" && secondSource === "onerep") {
      return -1;
    }
    if (firstSource === "onerep" && secondSource === "monitor") {
      return 1;
    }

    const firstUpdatedAt = parseIso8601Datetime(first.updated_at);
    const secondUpdatedAt = parseIso8601Datetime(second.updated_at);

    // Descending by update time, i.e. the most recent update wins:
    return secondUpdatedAt.getTime() - firstUpdatedAt.getTime();
  };

  return scanResults.toSorted(compareRelevance);
}
55+
56+
/**
 * Drops scan results whose link has already been seen earlier in the list.
 *
 * Links are compared after trimming surrounding whitespace, and the first
 * result for a given link is the one that is kept. Results without a
 * (truthy) `link` are always kept.
 *
 * @param scanResults The scan results to filter, in order of preference.
 * @returns A new array containing the results that were kept.
 */
function removeIrrelevantResults(scanResults: MoscaryData["ScanResult"][]) {
  const linksAlreadyKept = new Set<string>();
  const keptResults: MoscaryData["ScanResult"][] = [];

  for (const candidate of scanResults) {
    if (!candidate.link) {
      // Nothing to compare against, so the result cannot be a duplicate.
      keptResults.push(candidate);
      continue;
    }

    const normalisedLink = candidate.link.trim();
    if (linksAlreadyKept.has(normalisedLink)) {
      continue;
    }

    linksAlreadyKept.add(normalisedLink);
    keptResults.push(candidate);
  }

  return keptResults;
}

src/constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,4 @@ export const CONST_DATA_BROKER_PROFILE_DETAIL_LIMITS = {
6565
phone_numbers: 10,
6666
addresses: 10,
6767
} as const;
68+
export const CONST_MAX_SCAN_RESULTS_PER_BROKER = 3 as const;

0 commit comments

Comments
 (0)