Use Sample type with the RingBuffer (#186)

mathias-baumann-frequenz · web-flow · commit 30e3a730f3ec · 2023-02-01T18:52:22.000+01:00
fixes #165
diff --git a/benchmarks/timeseries/benchmark_ringbuffer.py b/benchmarks/timeseries/benchmark_ringbuffer.py
@@ -6,10 +6,11 @@
 import random
 import timeit
 from datetime import datetime, timedelta
-from typing import Any, TypeVar
+from typing import Any, Dict, TypeVar
 
 import numpy as np
 
+from frequenz.sdk.timeseries import Sample
 from frequenz.sdk.timeseries._ringbuffer import OrderedRingBuffer
 
 MINUTES_IN_A_DAY = 24 * 60
@@ -19,28 +20,22 @@
 T = TypeVar("T")
 
 
-def fill_buffer(
-    days: int, buffer: OrderedRingBuffer[T, Any], element_type: type
-) -> None:
+def fill_buffer(days: int, buffer: OrderedRingBuffer[Any]) -> None:
     """Fill the given buffer up to the given amount of days, one sample per minute."""
     random.seed(0)
     basetime = datetime(2022, 1, 1)
+    print("..filling", end="", flush=True)
 
     for day in range(days):
         # Push in random order
         for i in random.sample(range(MINUTES_IN_A_DAY), MINUTES_IN_A_DAY):
             buffer.update(
-                basetime + timedelta(days=day, minutes=i, seconds=i % 3),
-                element_type(i),
+                Sample(basetime + timedelta(days=day, minutes=i, seconds=i % 3))
             )
 
 
-def test_days(days: int, buffer: OrderedRingBuffer[int, Any]) -> None:
-    """Fills a buffer completely up and then gets the data for each of the 29 days."""
-    print(".", end="", flush=True)
-
-    fill_buffer(days, buffer, int)
-
+def test_days(days: int, buffer: OrderedRingBuffer[Any]) -> None:
+    """Gets the data for each of the 29 days."""
     basetime = datetime(2022, 1, 1)
 
     for day in range(days):
@@ -50,119 +45,166 @@ def test_days(days: int, buffer: OrderedRingBuffer[int, Any]) -> None:
         )
 
 
-def test_slices(days: int, buffer: OrderedRingBuffer[T, Any]) -> None:
+def test_slices(days: int, buffer: OrderedRingBuffer[Any], median: bool) -> None:
     """Benchmark slicing.
 
     Takes a buffer, fills it up and then excessively gets
     the data for each day to calculate the average/median.
     """
-    print(".", end="", flush=True)
-    fill_buffer(days, buffer, float)
-
-    # Chose uneven starting point so that for the first/last window data has to
-    # be copied
-    basetime = datetime(2022, 1, 1, 0, 5, 13, 88)
+    basetime = datetime(2022, 1, 1)
 
-    total_avg = 0.0
-    total_median = 0.0
+    total = 0.0
 
-    for _ in range(5):
+    for _ in range(3):
         for day in range(days):
             minutes = buffer.window(
                 basetime + timedelta(days=day), basetime + timedelta(days=day + 1)
             )
 
-            total_avg += float(np.average(minutes))
-            total_median += float(np.median(minutes))
+            if median:
+                total += float(np.median(minutes))
+            else:
+                total += float(np.average(minutes))
 
 
-def test_29_days_list() -> None:
+def test_29_days_list(num_runs: int) -> Dict[str, float]:
     """Run the 29 day test on the list backend."""
-    test_days(29, OrderedRingBuffer([0] * MINUTES_IN_29_DAYS, timedelta(minutes=1)))
+    days = 29
+    buffer = OrderedRingBuffer([0] * MINUTES_IN_29_DAYS, timedelta(minutes=1))
+
+    fill_time = timeit.Timer(lambda: fill_buffer(days, buffer)).timeit(number=1)
+    test_time = timeit.Timer(lambda: test_days(days, buffer)).timeit(number=num_runs)
+    return {"fill": fill_time, "test": test_time}
 
 
-def test_29_days_array() -> None:
+def test_29_days_array(num_runs: int) -> Dict[str, float]:
     """Run the 29 day test on the array backend."""
-    test_days(
-        29,
-        OrderedRingBuffer(
-            np.empty(
-                shape=MINUTES_IN_29_DAYS,
-            ),
-            timedelta(minutes=1),
+    days = 29
+    buffer = OrderedRingBuffer(
+        np.empty(
+            shape=MINUTES_IN_29_DAYS,
         ),
+        timedelta(minutes=1),
     )
 
+    fill_time = timeit.Timer(lambda: fill_buffer(days, buffer)).timeit(number=1)
+    test_time = timeit.Timer(lambda: test_days(days, buffer)).timeit(number=num_runs)
+    return {"fill": fill_time, "test": test_time}
 
-def test_29_days_slicing_list() -> None:
+
+def test_29_days_slicing_list(num_runs: int) -> Dict[str, float]:
     """Run slicing tests on list backend."""
-    test_slices(29, OrderedRingBuffer([0] * MINUTES_IN_29_DAYS, timedelta(minutes=1)))
+    days = 29
+    buffer = OrderedRingBuffer([0] * MINUTES_IN_29_DAYS, timedelta(minutes=1))
+
+    fill_time = timeit.Timer(lambda: fill_buffer(days, buffer)).timeit(number=1)
+    median_test_time = timeit.Timer(
+        lambda: test_slices(days, buffer, median=True)
+    ).timeit(number=num_runs)
+    avg_test_time = timeit.Timer(
+        lambda: test_slices(days, buffer, median=False)
+    ).timeit(number=num_runs)
 
+    return {"fill": fill_time, "median": median_test_time, "avg": avg_test_time}
 
-def test_29_days_slicing_array() -> None:
+
+def test_29_days_slicing_array(num_runs: int) -> Dict[str, float]:
     """Run slicing tests on array backend."""
-    test_slices(
-        29,
-        OrderedRingBuffer(
-            np.empty(
-                shape=MINUTES_IN_29_DAYS,
-            ),
-            timedelta(minutes=1),
+    days = 29
+    buffer = OrderedRingBuffer(
+        np.empty(
+            shape=MINUTES_IN_29_DAYS,
         ),
+        timedelta(minutes=1),
     )
 
+    fill_time = timeit.Timer(lambda: fill_buffer(days, buffer)).timeit(number=1)
+    median_test_time = timeit.Timer(
+        lambda: test_slices(days, buffer, median=True)
+    ).timeit(number=num_runs)
+    avg_test_time = timeit.Timer(
+        lambda: test_slices(days, buffer, median=False)
+    ).timeit(number=num_runs)
+
+    return {"fill": fill_time, "median": median_test_time, "avg": avg_test_time}
+
 
 def main() -> None:
     """Run benchmark.
 
     Result of previous run:
 
-    Date: Do 22. Dez 15:03:05 CET 2022
+    Date: Mi 1. Feb 17:19:51 CET 2023
     Result:
 
-           =========================================
-    Array: ........................................
-    List:  ........................................
+           =====================
+    Array: ..filling
+    List:  ..filling
     Time to fill 29 days with data:
-    Array: 0.09411649959984061 seconds
-    List:  0.0906366748000437 seconds
-    Diff:  0.0034798247997969156
-           =========================================
-    Array: ........................................
-    List:  ........................................
-    Filling 29 days and running average & mean on every day:
-    Array: 0.09842290654996759 seconds
-    List:  0.1316629376997298 seconds
-    Diff:  -0.03324003114976222
+            Array: 7.214875044999644 seconds
+            List:  7.174421657982748 seconds
+            Diff:  0.04045338701689616
+    Day-Slices into 29 days with data:
+            Array: 0.0001304591482039541 seconds
+            List:  0.00019963659869972616 seconds
+            Diff:  -6.917745049577205e-05
+           =====================
+    Array: ..filling
+    List:  ..filling
+    Avg of windows of 29 days and running average & mean on every day:
+            Array: 0.0007829780981410295 seconds
+            List:  0.0042931242496706545 seconds
+            Diff:  -0.0035101461515296252
+    Median of windows of 29 days and running average & mean on every day:
+            Array: 0.0021195551002165304 seconds
+            List:  0.00501448459981475 seconds
+            Diff:  -0.00289492949959822
     """
-    num_runs = 40
+    num_runs = 20
 
     print(f"       {''.join(['='] * (num_runs + 1))}")
     print("Array: ", end="")
-    duration_array = timeit.Timer(test_29_days_array).timeit(number=num_runs)
+    array_times = test_29_days_array(num_runs)
+
     print("\nList:  ", end="")
-    duration_list = timeit.Timer(test_29_days_list).timeit(number=num_runs)
+
+    list_times = test_29_days_list(num_runs)
     print("")
 
     print(
         "Time to fill 29 days with data:\n\t"
-        + f"Array: {duration_array/num_runs} seconds\n\t"
-        + f"List:  {duration_list/num_runs} seconds\n\t"
-        + f"Diff:  {duration_array/num_runs -  duration_list/num_runs}"
+        + f"Array: {array_times['fill']} seconds\n\t"
+        + f"List:  {list_times['fill']} seconds\n\t"
+        + f"Diff:  {array_times['fill'] -  list_times['fill']}"
+    )
+
+    print(
+        "Day-Slices into 29 days with data:\n\t"
+        + f"Array: {array_times['test']/num_runs} seconds\n\t"
+        + f"List:  {list_times['test']/num_runs} seconds\n\t"
+        + f"Diff:  {array_times['test']/num_runs -  list_times['test']/num_runs}"
     )
 
     print(f"       {''.join(['='] * (num_runs + 1))}")
     print("Array: ", end="")
-    duration_array = timeit.Timer(test_29_days_slicing_array).timeit(number=num_runs)
+    slicing_array_times = test_29_days_slicing_array(num_runs)
     print("\nList:  ", end="")
-    duration_list = timeit.Timer(test_29_days_slicing_list).timeit(number=num_runs)
+    slicing_list_times = test_29_days_slicing_list(num_runs)
     print("")
 
     print(
-        "Filling 29 days and running average & mean on every day:\n\t"
-        + f"Array: {duration_array/num_runs} seconds\n\t"
-        + f"List:  {duration_list/num_runs} seconds\n\t"
-        + f"Diff:  {duration_array/num_runs -  duration_list/num_runs}"
+        "Avg of windows of 29 days and running average & mean on every day:\n\t"
+        + f"Array: {slicing_array_times['avg']/num_runs} seconds\n\t"
+        + f"List:  {slicing_list_times['avg']/num_runs} seconds\n\t"
+        + f"Diff:  {slicing_array_times['avg']/num_runs -  slicing_list_times['avg']/num_runs}"
+    )
+
+    print(
+        "Median of windows of 29 days and running average & mean on every day:\n\t"
+        + f"Array: {slicing_array_times['median']/num_runs} seconds\n\t"
+        + f"List:  {slicing_list_times['median']/num_runs} seconds\n\t"
+        + "Diff:  "
+        + f"{slicing_array_times['median']/num_runs -  slicing_list_times['median']/num_runs}"
     )
 
 
diff --git a/src/frequenz/sdk/timeseries/_ringbuffer.py b/src/frequenz/sdk/timeseries/_ringbuffer.py
@@ -11,8 +11,9 @@
 from typing import Any, Generic, List, Sequence, TypeVar
 
 import numpy as np
+from numpy.lib import math
 
-T = TypeVar("T")
+from frequenz.sdk.timeseries import Sample
 
 Container = TypeVar("Container", list, np.ndarray)
 
@@ -41,7 +42,7 @@ def contains(self, timestamp: datetime):
         return False
 
 
-class OrderedRingBuffer(Generic[T, Container]):
+class OrderedRingBuffer(Generic[Container]):
     """Time aware ringbuffer that keeps its entries sorted by time."""
 
     def __init__(
@@ -105,28 +106,25 @@ def maxlen(self) -> int:
         """
         return len(self._buffer)
 
-    def update(self, timestamp: datetime, value: T, missing: bool = False) -> None:
+    def update(self, sample: Sample) -> None:
         """Update the buffer with a new value for the given timestamp.
 
+        Missing values are written as NaN. Be advised that when
+        `update()` is called with samples newer than the current time
+        + `sampling_period` (as is the case for loading historical data
+        after an app restart), a gap of missing data exists. This gap
+        does not contain NaN values but simply the old invalid values.
+        The list of gaps returned by `gaps()` will reflect this and
+        should be used as the only source of truth for unwritten data.
+
         Args:
-            timestamp: Timestamp of the new value.
-            value: value to add.
-            missing: if true, the given timestamp will be recorded as missing.
-                The value will still be written.
-
-                Note: When relying on your own values to mark missing data (e.g.
-                `math.nan`), be advised that when `update()` is called with
-                timestamps newer than the current time + `sampling_period`
-                (as is the case for loading historical data after an app
-                restart), a gap of missing data exists.
-                This gap does not contain NaN values but simply the old invalid values.
-                The list of gaps returned by `gaps()` will reflect this.
+            sample: Sample to add to the ringbuffer
 
         Raises:
             IndexError: When the timestamp to be added is too old.
         """
         # adjust timestamp to be exactly on the sample period time point
-        timestamp = self._normalize_timestamp(timestamp)
+        timestamp = self._normalize_timestamp(sample.timestamp)
 
         # Don't add outdated entries
         if timestamp < self._datetime_oldest and self._datetime_oldest != datetime.max:
@@ -140,9 +138,10 @@ def update(self, timestamp: datetime, value: T, missing: bool = False) -> None:
         self._datetime_oldest = self._datetime_newest - self._time_range
 
         # Update data
+        value: float = math.nan if sample.value is None else sample.value
         self._buffer[self.datetime_to_index(timestamp)] = value
 
-        self._update_gaps(timestamp, prev_newest, missing)
+        self._update_gaps(timestamp, prev_newest, sample.value is None)
 
     def datetime_to_index(
         self, timestamp: datetime, allow_outside_range: bool = False
@@ -387,7 +386,7 @@ def _wrap(self, index: int) -> int:
         """
         return index % self.maxlen
 
-    def __setitem__(self, index_or_slice: int | slice, value: T) -> None:
+    def __setitem__(self, index_or_slice: int | slice, value: float) -> None:
         """Set item or slice at requested position.
 
         No wrapping of the index will be done.
@@ -399,7 +398,7 @@ def __setitem__(self, index_or_slice: int | slice, value: T) -> None:
         """
         self._buffer.__setitem__(index_or_slice, value)
 
-    def __getitem__(self, index_or_slice: int | slice) -> T | Sequence[T]:
+    def __getitem__(self, index_or_slice: int | slice) -> float | Sequence[float]:
         """Get item or slice at requested position.
 
         No wrapping of the index will be done.
diff --git a/tests/timeseries/test_ringbuffer.py b/tests/timeseries/test_ringbuffer.py