Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Unreleased

## sketch

- Add the `sketch.NUnique` class. It was previously in the `stats` module. This sketch estimates the number of unique elements in a stream.

## stats

- Added `update_many` method to `stats.PearsonCorr`.
- Moved `stats.NUnique` to the `sketch` module, as it is more of a sketch than a statistical indicator.
3 changes: 2 additions & 1 deletion river/sketch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .counter import Counter
from .heavy_hitters import HeavyHitters
from .histogram import Histogram
from .n_unique import NUnique
from .set import Set

__all__ = ["Counter", "HeavyHitters", "Histogram", "Set"]
__all__ = ["Counter", "HeavyHitters", "Histogram", "NUnique", "Set"]
26 changes: 13 additions & 13 deletions river/stats/n_unique.py → river/sketch/n_unique.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

import numpy as np

from river import stats
from river import base


class NUnique(stats.base.Univariate):
class NUnique(base.Base):
"""Approximate number of unique values counter.

This is basically an implementation of the HyperLogLog algorithm. Adapted from
Expand All @@ -32,10 +32,10 @@ class NUnique(stats.base.Univariate):
--------

>>> import string
>>> from river import stats
>>> from river import sketch

>>> alphabet = string.ascii_lowercase
>>> n_unique = stats.NUnique(error_rate=0.2, seed=42)
>>> n_unique = sketch.NUnique(error_rate=0.2, seed=42)

>>> n_unique.update('a')
>>> n_unique.get()
Expand All @@ -52,7 +52,7 @@ class NUnique(stats.base.Univariate):

Lowering the `error_rate` parameter will increase the precision.

>>> n_unique = stats.NUnique(error_rate=0.01, seed=42)
>>> n_unique = sketch.NUnique(error_rate=0.01, seed=42)
>>> for letter in alphabet:
... n_unique.update(letter)
>>> n_unique.get()
Expand All @@ -67,7 +67,7 @@ class NUnique(stats.base.Univariate):

P32 = 2**32

def __init__(self, error_rate=0.01, seed: int | None = None):
def __init__(self, error_rate: float = 0.01, seed: int | None = None):
self.error_rate = error_rate
self.seed = seed

Expand All @@ -77,20 +77,20 @@ def __init__(self, error_rate=0.01, seed: int | None = None):
self._salt = np.random.RandomState(seed).bytes(hashlib.blake2s.SALT_SIZE)

@property
def name(self):
def name(self) -> str:
return "n_unique"

def _hash(self, x):
def _hash(self, x: str) -> int:
hexa = hashlib.blake2s(bytes(x, encoding="utf8"), salt=self._salt).hexdigest()
return int(hexa, 16)

def update(self, x):
x = self._hash(x)
i = x & NUnique.P32 - 1 >> 32 - self.n_bits
z = 35 - len(bin(NUnique.P32 - 1 & x << self.n_bits | 1 << self.n_bits - 1))
def update(self, x: str) -> None:
h = self._hash(x)
i = h & NUnique.P32 - 1 >> 32 - self.n_bits
z = 35 - len(bin(NUnique.P32 - 1 & h << self.n_bits | 1 << self.n_bits - 1))
self.buckets[i] = max(self.buckets[i], z)

def get(self):
def get(self) -> int:
a = (
{16: 0.673, 32: 0.697, 64: 0.709}[self.n_buckets]
if self.n_buckets <= 64
Expand Down
2 changes: 0 additions & 2 deletions river/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from .mean import BayesianMean, Mean
from .minimum import Min, RollingMin
from .mode import Mode, RollingMode
from .n_unique import NUnique
from .pearson import PearsonCorr
from .ptp import PeakToPeak, RollingPeakToPeak
from .quantile import Quantile, RollingQuantile
Expand Down Expand Up @@ -47,7 +46,6 @@
"Mean",
"Min",
"Mode",
"NUnique",
"PeakToPeak",
"PearsonCorr",
"Quantile",
Expand Down
2 changes: 0 additions & 2 deletions river/stats/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@ def test_pickling_value(stat):
for i in range(10):
if isinstance(stat, stats.base.Bivariate):
stat.update(i, i)
elif isinstance(stat, stats.NUnique): # takes string in input
stat.update(str(i))
else:
stat.update(i)

Expand Down