@@ -22,12 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2222SOFTWARE.
2323*/
2424
25+ import XXH from 'xxhashjs'
2526import BaseFilter from '../base-filter'
2627import { AutoExportable , Field , Parameter } from '../exportable'
2728import { HashableInput , allocateArray } from '../utils'
2829
// 2^32, computed once as a module-level constant because the HyperLogLog
// estimator refers to it several times
const TWO_POW_32 = 2 ** 32
3132
3233/**
3334 * Estimate the bias-correction constant, denoted alpha in the algorithm, based on the number of registers.
@@ -36,15 +37,17 @@ const TWO_POW_32 = Math.pow(2, 32)
3637 * @return The estimated bias-correction constant
3738 */
function computeAlpha(m: number): number {
  // Maps the number of registers `m` to the bias-correction constant.
  // Thresholds are half-open ranges, so for power-of-two register counts
  // 16, 32 and 64 select 0.673, 0.697 and 0.709 respectively.
  if (m < 16) {
    // fewer than 16 registers: no tabulated constant, so apply no correction
    return 1
  }
  if (m < 32) {
    return 0.673
  }
  if (m < 64) {
    return 0.697
  }
  if (m < 128) {
    return 0.709
  }
  // m >= 128: closed-form approximation of the constant
  return 0.7213 / (1.0 + 1.079 / m)
}
5053
@@ -85,8 +88,11 @@ export default class HyperLogLog extends BaseFilter {
8588 */
constructor(@Parameter('_nbRegisters') nbRegisters: number) {
  super()
  // The register count must be a strictly positive integer power of two.
  // The bit trick `n & (n - 1)` alone is not enough: it also evaluates to 0
  // for 0, and it truncates fractional or NaN operands to 32-bit integers,
  // so those cases are rejected explicitly first.
  if (
    !Number.isInteger(nbRegisters) ||
    nbRegisters <= 0 ||
    (nbRegisters & (nbRegisters - 1)) !== 0
  ) {
    throw new Error('The number of registers should be a power of 2')
  }
  this._nbRegisters = nbRegisters
  // Number of hash bits used to select a register, i.e. log2(nbRegisters).
  // nbRegisters is a power of two here, so the logarithm is mathematically an
  // integer; rounding to nearest absorbs floating-point error in either direction.
  this._nbBytesPerHash = Math.round(Math.log2(nbRegisters))
  this._correctionBias = computeAlpha(nbRegisters)
  // presumably yields nbRegisters entries all set to 0 — confirm in '../utils'
  this._registers = allocateArray(this._nbRegisters, 0)
}
@@ -103,23 +109,28 @@ export default class HyperLogLog extends BaseFilter {
103109 * @param element - Element to add
104110 */
public update(element: HashableInput): void {
  // Hash the element to 64 bits and render it as a fixed-width binary string
  // so that bit positions can be addressed by character index.
  const hashBits = XXH.h64(element, this.seed)
    .toString(2)
    .padStart(64, '0')
  // The last `_nbBytesPerHash` bits select the register; the bits before them
  // are scanned for their first 1-bit.
  const split = 64 - this._nbBytesPerHash
  // With a single register no bits are reserved for the index: the slice is
  // empty and parseInt('') is NaN, so fall back to register 0 explicitly.
  const registerIndex =
    split === 64 ? 0 : parseInt(hashBits.slice(split), 2)
  // Zero-based position of the leftmost 1-bit in the leading part of the hash;
  // when that part contains no 1-bit, its last position is used instead.
  const leading = hashBits.slice(0, split)
  const firstOne = leading.indexOf('1')
  const leftmostPos = firstOne === -1 ? split - 1 : firstOne
  // NOTE(review): positions are zero-based, so a hash whose first bit is 1
  // stores 0 — the same value as a register that was never written, assuming
  // registers start at 0. Confirm this is intended.
  // A register only ever grows: keep the largest position observed so far.
  this._registers[registerIndex] = Math.max(
    this._registers[registerIndex],
    leftmostPos
  )
}
125136
@@ -129,28 +140,38 @@ export default class HyperLogLog extends BaseFilter {
129140 */
public count(round = false): number {
  // Standard HyperLogLog estimator with small- and large-range corrections.
  // `round` — when true, the estimate is rounded to the nearest integer.
  const m = this._nbRegisters
  // Z is the sum of 2^(-register) over all registers.
  const Z = this._registers.reduce(
    (acc: number, value: number) => acc + Math.pow(2, -value),
    0
  )
  // Raw estimate: alpha * m^2 * 2 / Z.
  // NOTE(review): the factor 2 appears to compensate for `update` storing
  // zero-based bit positions — confirm against `update`.
  const rawEstimate = (this._correctionBias * m * m * 2) / Z

  let estimate: number
  if (rawEstimate <= (5 / 2) * m) {
    // Small range: if some registers are still 0, use linear counting on the
    // number of zero registers; otherwise keep the raw estimate.
    const zeroRegisters = this._registers.filter(value => value === 0).length
    estimate =
      zeroRegisters > 0 ? m * Math.log(m / zeroRegisters) : rawEstimate
  } else if (rawEstimate <= TWO_POW_32 / 30) {
    // Middle range: no correction.
    estimate = rawEstimate
  } else {
    // Large range (rawEstimate > 2^32 / 30).
    // NOTE(review): this correction is expressed in terms of 2^32 while
    // `update` hashes to 64 bits, and the logarithm is non-finite once
    // rawEstimate >= 2^32 — confirm whether it should still apply.
    estimate = -TWO_POW_32 * Math.log(1 - rawEstimate / TWO_POW_32)
  }
  return round ? Math.round(estimate) : estimate
}
155176
156177 /**
@@ -173,7 +194,7 @@ export default class HyperLogLog extends BaseFilter {
173194 )
174195 }
175196 const newSketch = new HyperLogLog ( this . nbRegisters )
176- for ( let i = 0 ; i < this . nbRegisters - 1 ; i ++ ) {
197+ for ( let i = 0 ; i < this . nbRegisters ; i ++ ) {
177198 newSketch . _registers [ i ] = Math . max (
178199 this . _registers [ i ] ,
179200 other . _registers [ i ]
0 commit comments