Skip to content

Commit c89b4aa

Browse files
committed
Compute the total size of revisits per crawl:
conservedSize += (origSize * number of revisits) - (sum of revisit sizes)
1 parent 898ca46 commit c89b4aa

File tree

1 file changed

+33
-25
lines changed

1 file changed

+33
-25
lines changed

src/util/state.ts

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,12 @@ export class RedisDedupeIndex {
496496

497497
// if orig size is not known, queue for later to resolve
498498
if (!origRecSize) {
499-
pipe.lpush(`rev:${hash}`, JSON.stringify({ size, crawlId }));
499+
//pipe.lpush(`rev:${hash}`, JSON.stringify({ size, crawlId }));
500+
// pipe.hincrby("rev", `${hash}:s`, size);
501+
// pipe.hincrby("rev", `${hash}:c`, 1);
502+
503+
pipe.hincrby(`rev:${hash}:s`, crawlId, size);
504+
pipe.hincrby(`rev:${hash}:c`, crawlId, 1);
500505
}
501506

502507
// incr dedupe count, and size if known
@@ -510,37 +515,40 @@ export class RedisDedupeIndex {
510515
}
511516

512517
async matchRevisitSize(hash: string, origSize: number) {
513-
const incrMap: Record<string, number> = {};
518+
const revCounts = await this.dedupeRedis.hgetall(`rev:${hash}:c`);
519+
if (!revCounts || !Object.keys(revCounts).length) {
520+
return;
521+
}
522+
const revSizes = await this.dedupeRedis.hgetall(`rev:${hash}:s`);
514523

515-
const length = 25;
516-
let start = 0;
524+
const pipe = this.dedupeRedis.pipeline();
517525

518-
while (true) {
519-
const sizeEntries = await this.dedupeRedis.lrange(
520-
`rev:${hash}`,
521-
start,
522-
start + length,
523-
);
526+
let totalSize = 0;
527+
let totalCount = 0;
524528

525-
for (const entry of sizeEntries) {
526-
const { size, crawlId } = JSON.parse(entry);
527-
incrMap[crawlId] = (incrMap[crawlId] || 0) + (origSize - size);
528-
}
529+
// compute size saved per crawl, and add totals
530+
for (const [crawlId, count] of Object.entries(revCounts)) {
531+
const size = Number(revSizes[crawlId]);
532+
totalSize += size;
529533

530-
if (sizeEntries.length < length) {
531-
break;
532-
}
533-
start += length;
534+
this.incrDeduped(
535+
pipe,
536+
`h:${crawlId}:counts`,
537+
Number(count) * origSize - size,
538+
0,
539+
);
540+
totalCount += Number(count);
534541
}
535542

536-
const pipe = this.dedupeRedis.pipeline();
537-
538-
for (const [crawlId, size] of Object.entries(incrMap)) {
539-
this.incrDeduped(pipe, `h:${crawlId}:counts`, size, 0);
540-
this.incrDeduped(pipe, DUPE_ALL_COUNTS, size, 0);
541-
}
543+
// incr total size saved
544+
this.incrDeduped(
545+
pipe,
546+
DUPE_ALL_COUNTS,
547+
totalCount * origSize - totalSize,
548+
0,
549+
);
542550

543-
pipe.del(`rev:${hash}`);
551+
pipe.del(`rev:${hash}:c`, `rev:${hash}:s`);
544552

545553
await pipe.exec();
546554
}

0 commit comments

Comments
 (0)