Skip to content

Commit f4e98fb

Browse files
committed
Update the dupe URL count by the number of aggregated entries per crawl when aggregating revisit sizes
1 parent 53e8656 commit f4e98fb

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

src/util/state.ts

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -411,11 +411,11 @@ export class RedisDedupeIndex {
411411
}
412412

413413
// COUNT STATS
414-
incrDeduped(pipe: ChainableCommander, key: string, value: number) {
414+
incrDeduped(pipe: ChainableCommander, key: string, value: number, count = 1) {
415415
if (value > 0) {
416416
pipe.hincrby(key, "conservedSize", value);
417417
}
418-
pipe.hincrby(key, "dupeUrls", 1);
418+
pipe.hincrby(key, "dupeUrls", count);
419419
}
420420

421421
incrTotalUrls(pipe: ChainableCommander, key: string) {
@@ -507,7 +507,7 @@ export class RedisDedupeIndex {
507507
}
508508

509509
async matchRevisitSize(hash: string, origSize: number) {
510-
const incrMap: Record<string, number> = {};
510+
const incrMap: Record<string, { size: number; count: number }> = {};
511511

512512
const length = 25;
513513
let start = 0;
@@ -521,7 +521,13 @@ export class RedisDedupeIndex {
521521

522522
for (const entry of sizeEntries) {
523523
const { size, crawlId } = JSON.parse(entry);
524-
incrMap[crawlId] = (incrMap[crawlId] || 0) + (origSize - size);
524+
let res = incrMap[crawlId];
525+
if (!res) {
526+
res = { size: 0, count: 0 };
527+
incrMap[crawlId] = res;
528+
}
529+
res.size += origSize - size;
530+
res.count += 1;
525531
}
526532

527533
if (sizeEntries.length < length) {
@@ -532,9 +538,9 @@ export class RedisDedupeIndex {
532538

533539
const pipe = this.dedupeRedis.pipeline();
534540

535-
for (const [crawlId, value] of Object.entries(incrMap)) {
536-
this.incrDeduped(pipe, `h:${crawlId}:counts`, value);
537-
this.incrDeduped(pipe, DUPE_ALL_COUNTS, value);
541+
for (const [crawlId, { size, count }] of Object.entries(incrMap)) {
542+
this.incrDeduped(pipe, `h:${crawlId}:counts`, size, count);
543+
this.incrDeduped(pipe, DUPE_ALL_COUNTS, size, count);
538544
}
539545

540546
pipe.del(`rev:${hash}`);

0 commit comments

Comments
 (0)