@@ -496,7 +496,12 @@ export class RedisDedupeIndex {
496496
497497 // if orig size is not known, queue for later to resolve
498498 if ( ! origRecSize ) {
499- pipe . lpush ( `rev:${ hash } ` , JSON . stringify ( { size, crawlId } ) ) ;
499+ //pipe.lpush(`rev:${hash}`, JSON.stringify({ size, crawlId }));
500+ // pipe.hincrby("rev", `${hash}:s`, size);
501+ // pipe.hincrby("rev", `${hash}:c`, 1);
502+
503+ pipe . hincrby ( `rev:${ hash } :s` , crawlId , size ) ;
504+ pipe . hincrby ( `rev:${ hash } :c` , crawlId , 1 ) ;
500505 }
501506
502507 // incr dedupe count, and size if known
@@ -510,37 +515,40 @@ export class RedisDedupeIndex {
510515 }
511516
/**
 * Settle the revisit bookkeeping queued for `hash` now that the size of the
 * original record is known.
 *
 * While the original size is unknown, revisits are accumulated per crawl in
 * two Redis hashes keyed by crawl id: `rev:${hash}:c` (number of revisits)
 * and `rev:${hash}:s` (sum of revisit record sizes). This method reads both,
 * computes the size saved per crawl as `count * origSize - size`, forwards
 * each value to `incrDeduped` for the crawl's `h:${crawlId}:counts` key,
 * forwards the grand total for `DUPE_ALL_COUNTS`, and finally deletes both
 * hashes. All writes go through a single pipeline.
 *
 * @param hash - content hash whose queued revisits should be resolved
 * @param origSize - size of the original record for this hash
 */
async matchRevisitSize(hash: string, origSize: number) {
  // Build each key once so the read and the delete can never diverge.
  const countKey = `rev:${hash}:c`;
  const sizeKey = `rev:${hash}:s`;

  const revCounts = await this.dedupeRedis.hgetall(countKey);
  if (!revCounts || !Object.keys(revCounts).length) {
    // nothing queued for this hash
    return;
  }
  const revSizes = (await this.dedupeRedis.hgetall(sizeKey)) ?? {};

  const pipe = this.dedupeRedis.pipeline();

  let totalSize = 0;
  let totalCount = 0;

  // compute size saved per crawl, and add totals
  for (const [crawlId, rawCount] of Object.entries(revCounts)) {
    const count = Number(rawCount);
    const size = Number(revSizes[crawlId]);

    // The two hashes are written and read by separate commands, so a crawl id
    // may be present in one and missing from the other. Number(undefined) is
    // NaN, which would corrupt this crawl's increment and, via totalSize, the
    // increment for every crawl. Skip entries that cannot be resolved.
    if (!Number.isFinite(count) || !Number.isFinite(size)) {
      continue;
    }

    totalSize += size;
    totalCount += count;

    // last argument is 0, as before — presumably the count increment; verify
    // against incrDeduped
    this.incrDeduped(pipe, `h:${crawlId}:counts`, count * origSize - size, 0);
  }

  // incr total size saved
  this.incrDeduped(
    pipe,
    DUPE_ALL_COUNTS,
    totalCount * origSize - totalSize,
    0,
  );

  // NOTE(review): revisits recorded between the reads above and this delete
  // are dropped without being counted — confirm this window is acceptable.
  pipe.del(countKey, sizeKey);

  await pipe.exec();
}
0 commit comments