Skip to content

Commit e841a7f

Browse files
authored
[v3.0.2] Fix hyperloglog + seedrandom typings (#73)
* fix seedrandom typing after update * fix seedrandom typing after update * fix hyperloglog * prettier * use next/* branches * use original seedrandom.PRNG type
1 parent ae96027 commit e841a7f

File tree

8 files changed

+596
-473
lines changed

8 files changed

+596
-473
lines changed

.github/workflows/npm_test_doc.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
name: 🔎 Tests
22
on:
33
push:
4-
branches: [ master, develop ]
4+
branches: [ master, develop, next/* ]
55
pull_request:
6-
branches: [ master, develop ]
6+
branches: [ master, develop, next/* ]
77
jobs:
88
ubuntu_build:
99
runs-on: ubuntu-latest

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "bloom-filters",
3-
"version": "3.0.1",
3+
"version": "3.0.2",
44
"description": "JS implementation of probabilistic data structures: Bloom Filter (and its derived), HyperLogLog, Count-Min Sketch, Top-K and MinHash",
55
"main": "dist/api.js",
66
"type": "commonjs",
@@ -52,7 +52,7 @@
5252
"@types/lodash.eq": "^4.0.X",
5353
"@types/lodash.indexof": "^4.0.X",
5454
"@types/node": "^17.0.17",
55-
"@types/seedrandom": "^3.0.2",
55+
"@types/seedrandom": "^3.0.8",
5656
"@types/xxhashjs": "^0.2.X",
5757
"@typescript-eslint/eslint-plugin": "^5.11.0",
5858
"@typescript-eslint/parser": "^5.11.0",

src/base-filter.ts

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,35 +22,23 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2222
SOFTWARE.
2323
*/
2424

25-
import seedrandom from 'seedrandom'
25+
import seedrandom, {PRNG} from 'seedrandom'
2626
import Hashing from './hashing/hashing'
2727
import {getDefaultSeed} from './utils'
2828

29-
/**
30-
* Exported prng type because it is not from seedrandom
31-
* Orignal type can be found in: @types/seedrandom
32-
*/
33-
export interface prng {
34-
(): number
35-
double(): number
36-
int32(): number
37-
quick(): number
38-
state(): seedrandom.State
39-
}
40-
4129
/**
4230
* A base class for implementing probabilistic filters
4331
* @author Thomas Minier
4432
* @author Arnaud Grall
4533
*/
4634
export default abstract class BaseFilter {
4735
public _seed: number
48-
public _rng: prng
36+
public _rng: PRNG
4937
public _hashing: Hashing
5038

5139
constructor() {
5240
this._seed = getDefaultSeed()
53-
this._rng = seedrandom(`${this._seed}`) as prng
41+
this._rng = seedrandom(`${this._seed}`)
5442
this._hashing = new Hashing()
5543
}
5644

@@ -67,14 +55,14 @@ export default abstract class BaseFilter {
6755
*/
6856
public set seed(seed: number) {
6957
this._seed = seed
70-
this._rng = seedrandom(`${this._seed}`) as prng
58+
this._rng = seedrandom(`${this._seed}`)
7159
}
7260

7361
/**
7462
* Get a function used to draw random number
7563
* @return A factory function used to draw random integer
7664
*/
77-
public get random(): prng {
65+
public get random() {
7866
return this._rng
7967
}
8068

src/bloom/scalable-bloom-filter.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ SOFTWARE.
2323
*/
2424

2525
import ClassicFilter from '../interfaces/classic-filter'
26-
import BaseFilter, {prng} from '../base-filter'
26+
import BaseFilter from '../base-filter'
2727
import {AutoExportable, Field, Parameter} from '../exportable'
2828
import {HashableInput} from '../utils'
2929
import PartitionBloomFilter from './partitioned-bloom-filter'
@@ -118,7 +118,7 @@ export default class ScalableBloomFilter
118118
*/
119119
public set seed(seed: number) {
120120
this._seed = seed
121-
this._rng = seedrandom(`${this._seed}`) as prng
121+
this._rng = seedrandom(`${this._seed}`)
122122
this._filters.forEach((filter: PartitionBloomFilter) => {
123123
filter.seed = this.seed
124124
})

src/sketch/hyperloglog.ts

Lines changed: 61 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2222
SOFTWARE.
2323
*/
2424

25+
import XXH from 'xxhashjs'
2526
import BaseFilter from '../base-filter'
2627
import {AutoExportable, Field, Parameter} from '../exportable'
2728
import {HashableInput, allocateArray} from '../utils'
2829

2930
// 2^32, computed as a constant as we use it a lot in the HyperLogLog algorithm
30-
const TWO_POW_32 = Math.pow(2, 32)
31+
const TWO_POW_32 = 2 ** 32
3132

3233
/**
3334
* Estimate the bias-correction constant, denoted alpha in the algorithm, based on the number of registers.
@@ -36,15 +37,17 @@ const TWO_POW_32 = Math.pow(2, 32)
3637
* @return The estimated bias-correction constant
3738
*/
3839
function computeAlpha(m: number): number {
39-
switch (m) {
40-
case 16:
41-
return 0.673
42-
case 32:
43-
return 0.697
44-
case 64:
45-
return 0.709
46-
default:
47-
return 0.7213 / (1.0 + 1.079 / m)
40+
if (m < 16) {
41+
return 1
42+
} else if (m < 32) {
43+
return 0.673
44+
} else if (m < 64) {
45+
return 0.697
46+
} else if (m < 128) {
47+
return 0.709
48+
} else {
49+
// >= 128
50+
return 0.7213 / (1.0 + 1.079 / m)
4851
}
4952
}
5053

@@ -85,8 +88,11 @@ export default class HyperLogLog extends BaseFilter {
8588
*/
8689
constructor(@Parameter('_nbRegisters') nbRegisters: number) {
8790
super()
91+
if ((nbRegisters & (nbRegisters - 1)) !== 0) {
92+
throw new Error('The number of registers should be a power of 2')
93+
}
8894
this._nbRegisters = nbRegisters
89-
this._nbBytesPerHash = Math.round(Math.log2(nbRegisters))
95+
this._nbBytesPerHash = Math.ceil(Math.log2(nbRegisters))
9096
this._correctionBias = computeAlpha(nbRegisters)
9197
this._registers = allocateArray(this._nbRegisters, 0)
9298
}
@@ -103,23 +109,28 @@ export default class HyperLogLog extends BaseFilter {
103109
* @param element - Element to add
104110
*/
105111
public update(element: HashableInput): void {
106-
// const hashedValue = Buffer.from(hashAsString(element, this.seed))
107-
const hashedValue = this._hashing.hashAsInt(element, this.seed).toString(2)
108-
const registerIndex =
109-
1 + parseInt(hashedValue.slice(0, this._nbBytesPerHash - 1), 2)
112+
const hashedValue = XXH.h64(element, this.seed)
113+
.toString(2)
114+
.padStart(64, '0')
115+
const k = 64 - this._nbBytesPerHash
116+
const registerIndex = parseInt(hashedValue.slice(k), 2)
110117
// find the left most 1-bit in the second part of the buffer
111-
const secondPart = hashedValue.slice(this._nbBytesPerHash)
112-
let posLeftMost = 0
113-
while (
114-
secondPart[posLeftMost] !== '1' &&
115-
posLeftMost < secondPart.length - 1
116-
) {
117-
posLeftMost++
118+
const second = hashedValue.slice(0, k)
119+
let leftmost_pos = k - 1
120+
let found = false
121+
let i = 0
122+
while (!found && i < second.length) {
123+
if (second[i] === '1') {
124+
found = true
125+
leftmost_pos = i
126+
} else {
127+
i++
128+
}
118129
}
119130
// update the register
120131
this._registers[registerIndex] = Math.max(
121132
this._registers[registerIndex],
122-
posLeftMost
133+
leftmost_pos
123134
)
124135
}
125136

@@ -129,28 +140,38 @@ export default class HyperLogLog extends BaseFilter {
129140
*/
130141
public count(round = false): number {
131142
// Use the standard HyperLogLog estimator
132-
const harmonicMean = this._registers.reduce(
143+
const Z = this._registers.reduce(
133144
(acc: number, value: number) => acc + Math.pow(2, -value),
134145
0
135146
)
136-
let estimation =
137-
(this._correctionBias * Math.pow(this._nbRegisters, 2)) / harmonicMean
138-
139-
// use linear counting to correct the estimation if E < 5m/2 and some registers are set to zero
140-
/*if (estimation < ((5/2) * this._nbRegisters) && this._registers.some(value => value === 0)) {
141-
const nbZeroRegisters = this._registers.filter(value => value === 0).length
142-
estimation = this._nbRegisters * Math.log(this._nbRegisters / nbZeroRegisters)
143-
}*/
144-
145-
// correct the estimation for very large registers
146-
if (estimation > TWO_POW_32 / 30) {
147-
estimation = -TWO_POW_32 * Math.log(1 - estimation / TWO_POW_32)
147+
const raw_estimation =
148+
(this._correctionBias * this._nbRegisters * this._nbRegisters * 2) / Z
149+
150+
let corrected_estimation
151+
152+
if (raw_estimation <= (5 / 2) * this._nbRegisters) {
153+
// use linear counting to correct the estimation if E <= 5m/2 and some registers are set to zero
154+
const V = this._registers.filter(value => value === 0).length
155+
if (V > 0) {
156+
// small range correction: linear counting
157+
corrected_estimation =
158+
this._nbRegisters * Math.log(this._nbRegisters / V)
159+
} else {
160+
corrected_estimation = raw_estimation
161+
}
162+
} else if (raw_estimation <= TWO_POW_32 / 30) {
163+
// middle range correction; no correction
164+
corrected_estimation = raw_estimation
165+
} else {
166+
// raw_estimation > TWO_POW_32 / 30
167+
// large range correction
168+
corrected_estimation =
169+
-TWO_POW_32 * Math.log(1 - raw_estimation / TWO_POW_32)
148170
}
149-
// round if required
150171
if (round) {
151-
estimation = Math.round(estimation)
172+
return Math.round(corrected_estimation)
152173
}
153-
return estimation
174+
return corrected_estimation
154175
}
155176

156177
/**
@@ -173,7 +194,7 @@ export default class HyperLogLog extends BaseFilter {
173194
)
174195
}
175196
const newSketch = new HyperLogLog(this.nbRegisters)
176-
for (let i = 0; i < this.nbRegisters - 1; i++) {
197+
for (let i = 0; i < this.nbRegisters; i++) {
177198
newSketch._registers[i] = Math.max(
178199
this._registers[i],
179200
other._registers[i]

test/hyperloglog-test.js

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,22 +28,52 @@ const {HyperLogLog} = require('../dist/api.js')
2828
describe('HyperLogLog', () => {
2929
describe('#update', () => {
3030
it('should support update and cardinality estimations (count) operations', () => {
31-
const nbDistinct = 100
32-
const sketch = new HyperLogLog(110)
31+
const m = 2 ** 8
32+
const n = 10e4
33+
const sketch = new HyperLogLog(m)
3334
// populate the sketch with some values
34-
for (let i = 0; i < 10e3; i++) {
35-
sketch.update(`${i % nbDistinct}`)
35+
for (let i = 0; i < n; i++) {
36+
sketch.update(i.toString())
3637
}
37-
sketch
38-
.count(true)
39-
.should.be.closeTo(nbDistinct, nbDistinct * sketch.accuracy())
40-
})
38+
39+
// Citation:
40+
// "Let σ ≈ 1.04/√m represent the standard error; the estimates provided by HYPERLOGLOG
41+
// are expected to be within σ, 2σ, 3σ of the exact count in respectively 65%, 95%, 99% of all
42+
// the cases"
43+
const exact_count = sketch.count()
44+
const relative_error = sketch.accuracy()
45+
46+
let error
47+
const relative_errors = [
48+
relative_error,
49+
2 * relative_error,
50+
3 * relative_error,
51+
]
52+
53+
for (const relative_err of relative_errors) {
54+
try {
55+
Math.abs(n - exact_count).should.be.below(n * relative_err)
56+
error = false
57+
break
58+
} catch (e) {
59+
error = e
60+
}
61+
}
62+
63+
if (error) {
64+
throw new Error(
65+
`should be withing σ, 2σ or 3σ: ${relative_errors
66+
.map(e => e * n)
67+
.toString()}: ${error.toString()}`
68+
)
69+
}
70+
}).timeout(0)
4171
})
4272

4373
describe('#merge', () => {
4474
it('should peforms the union of two HyperLogLog sketches', () => {
45-
const first = new HyperLogLog(10)
46-
const second = new HyperLogLog(10)
75+
const first = new HyperLogLog(8)
76+
const second = new HyperLogLog(8)
4777
first.update('alice')
4878
first.update('bob')
4979
second.update('carol')
@@ -59,8 +89,8 @@ describe('HyperLogLog', () => {
5989
})
6090

6191
it('should reject the union of two sketches with different number of registers', done => {
62-
const first = new HyperLogLog(10)
63-
const second = new HyperLogLog(20)
92+
const first = new HyperLogLog(8)
93+
const second = new HyperLogLog(32)
6494
try {
6595
first.merge(second)
6696
done(
@@ -76,8 +106,8 @@ describe('HyperLogLog', () => {
76106

77107
describe('#equals', () => {
78108
it('should returns True when two HyperLogLog sketches are equals', () => {
79-
const first = new HyperLogLog(10)
80-
const second = new HyperLogLog(10)
109+
const first = new HyperLogLog(8)
110+
const second = new HyperLogLog(8)
81111
first.update('alice')
82112
first.update('bob')
83113
second.update('alice')
@@ -86,14 +116,14 @@ describe('HyperLogLog', () => {
86116
})
87117

88118
it('should returns False when two sketches have different number of registers', () => {
89-
const first = new HyperLogLog(10)
90-
const second = new HyperLogLog(11)
119+
const first = new HyperLogLog(8)
120+
const second = new HyperLogLog(16)
91121
first.equals(second).should.equal(false)
92122
})
93123

94124
it('should returns False when two sketches have different content in their registers', () => {
95-
const first = new HyperLogLog(10)
96-
const second = new HyperLogLog(11)
125+
const first = new HyperLogLog(8)
126+
const second = new HyperLogLog(16)
97127
first.update('alice')
98128
first.update('bob')
99129
second.update('carol')
@@ -103,7 +133,7 @@ describe('HyperLogLog', () => {
103133
})
104134

105135
describe('#saveAsJSON', () => {
106-
const sketch = new HyperLogLog(10)
136+
const sketch = new HyperLogLog(8)
107137
sketch.update('alice')
108138
sketch.update('bob')
109139

test/iblt-test.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ describe('Invertible Bloom Lookup Tables', () => {
174174
invalids.forEach(json => {
175175
;(function () {
176176
InvertibleBloomFilter.fromJSON(json)
177-
}.should.throw(Error))
177+
}).should.throw(Error)
178178
})
179179
})
180180

@@ -187,7 +187,7 @@ describe('Invertible Bloom Lookup Tables', () => {
187187
_elements: [],
188188
_seed: 1,
189189
})
190-
}.should.not.throw(Error))
190+
}).should.not.throw(Error)
191191
})
192192
})
193193

0 commit comments

Comments
 (0)