Use 99th percentile and standard deviation to dynamically tune peer timeout (#1553)

pipermerriam · web-flow · commit 7d085bd3742c · 2018-12-10T11:19:55.000-07:00
* implement percentile tracking for peer round trip request/response times

* Use dynamic timeout values for peer requests based on historical performance

* linting

* Convert Percentile and StandardDeviation to operate on a historical window

* PR feedback
diff --git a/tests/trinity/core/utils/test_percentile.py b/tests/trinity/core/utils/test_percentile.py
@@ -0,0 +1,19 @@
+import pytest
+
+from trinity.utils.percentile import Percentile
+
+
+@pytest.mark.parametrize(
+    'data,percentile,window_size,expected',
+    (
+        (range(6), 0.2, 6, 1),
+        (range(11), 0.4, 11, 4),
+        (range(11), 0.2, 6, 6),
+    ),
+)
+def test_percentile_class(data, percentile, window_size, expected):
+    percentile = Percentile(percentile=percentile, window_size=window_size)
+    for value in data:
+        percentile.update(value)
+
+    assert percentile.value == expected
diff --git a/tests/trinity/core/utils/test_standard_deviation.py b/tests/trinity/core/utils/test_standard_deviation.py
@@ -0,0 +1,22 @@
+import pytest
+
+from trinity.utils.stddev import StandardDeviation
+
+
+@pytest.mark.parametrize(
+    "data,expected",
+    (
+        ((4, 2, 5, 8, 6), 2.23606),
+        ((1.5, 1.8, 7, 1.2, 1.35), 2.4863),
+        ((2, 2, 2, 2, 2), 0),
+        ((1, 3, 5, 7, 9), 3.1622),
+        ((100, 200, 300, 400, 500, 1, 3, 5, 7, 9), 3.1622),
+    ),
+)
+def test_standard_deviation(data, expected):
+    stddev = StandardDeviation(window_size=5)
+
+    for value in data:
+        stddev.update(value)
+
+    assert abs(stddev.value - expected) < 0.01
diff --git a/trinity/protocol/common/managers.py b/trinity/protocol/common/managers.py
@@ -79,29 +79,45 @@ async def payload_candidates(
         To mark a response as valid, use `complete_request`. After that call, payload
         candidates will stop arriving.
         """
-        if timeout is None:
-            timeout = self.response_timeout
+        outer_timeout = self.response_timeout if timeout is None else timeout
 
         start_at = time.perf_counter()
 
         # The _lock ensures that we never have two concurrent requests to a
         # single peer for a single command pair in flight.
         try:
-            await self.wait(self._lock.acquire(), timeout=timeout)
+            await self.wait(self._lock.acquire(), timeout=outer_timeout)
         except TimeoutError:
             raise AlreadyWaiting(
                 f"Timed out waiting for {self.response_msg_name} request lock "
                 f"or peer: {self._peer}"
             )
 
+        if timeout is not None or tracker.total_msgs < 20:
+            inner_timeout = outer_timeout
+        else:
+            # We compute a timeout based on the historical performance
+            # of the peer defined as three standard deviations above
+            # the response time for the 99th percentile of requests.
+            try:
+                rtt_99th = tracker.round_trip_99th.value
+                rtt_stddev = tracker.round_trip_stddev.value
+            except ValueError:
+                inner_timeout = outer_timeout
+            else:
+                inner_timeout = rtt_99th + 3 * rtt_stddev
+
         try:
             self._request(request)
             while self._is_pending():
-                timeout_remaining = max(0, timeout - (time.perf_counter() - start_at))
+                timeout_remaining = max(0, outer_timeout - (time.perf_counter() - start_at))
+
+                payload_timeout = min(inner_timeout, timeout_remaining)
+
                 try:
-                    yield await self._get_payload(timeout_remaining)
+                    yield await self._get_payload(payload_timeout)
                 except TimeoutError:
-                    tracker.record_timeout(timeout)
+                    tracker.record_timeout()
                     raise
         finally:
             self._lock.release()
diff --git a/trinity/protocol/common/trackers.py b/trinity/protocol/common/trackers.py
@@ -13,6 +13,8 @@
 
 from trinity.utils.ema import EMA
 from trinity.utils.logging import HasTraceLogger
+from trinity.utils.percentile import Percentile
+from trinity.utils.stddev import StandardDeviation
 from .constants import ROUND_TRIP_TIMEOUT
 from .types import (
     TResult,
@@ -35,8 +37,10 @@ def __init__(self) -> None:
         # empty responses.
         self.response_quality_ema = EMA(initial_value=0, smoothing_factor=0.05)
 
-        # an EMA of the round trip request/response time
+        # Metrics for the round trip request/response time
         self.round_trip_ema = EMA(initial_value=ROUND_TRIP_TIMEOUT, smoothing_factor=0.05)
+        self.round_trip_99th = Percentile(percentile=0.99, window_size=200)
+        self.round_trip_stddev = StandardDeviation(window_size=200)
 
         # an EMA of the items per second
         self.items_per_second_ema = EMA(initial_value=0, smoothing_factor=0.05)
@@ -76,39 +80,43 @@ def get_stats(self) -> str:
         """
         if not self.total_msgs:
             return 'None'
-        avg_rtt = self.total_response_time / self.total_msgs
-        if not self.total_response_time:
-            items_per_second = 0.0
-        else:
-            items_per_second = self.total_items / self.total_response_time
+
+        try:
+            rt99 = self.round_trip_99th.value
+        except ValueError:
+            rt99 = 0
+
+        try:
+            rt_stddev = self.round_trip_stddev.value
+        except ValueError:
+            rt_stddev = 0
 
         # msgs: total number of messages
         # items: total number of items
-        # rtt: round-trip-time (avg/ema)
-        # ips: items-per-second (avg/ema)
+        # rtt: round-trip-time (ema/99th/stddev)
+        # ips: items-per-second (ema)
         # timeouts: total number of timeouts
         # missing: total number of missing response items
         # quality: 0-100 for how complete responses are
         return (
-            'msgs=%d  items=%d  rtt=%.2f/%.2f  ips=%.5f/%.5f  '
+            'msgs=%d  items=%d  rtt=%.2f/%.2f/%.2f  ips=%.5f  '
             'timeouts=%d  quality=%d'
         ) % (
             self.total_msgs,
             self.total_items,
-            avg_rtt,
             self.round_trip_ema.value,
-            items_per_second,
+            rt99,
+            rt_stddev,
             self.items_per_second_ema.value,
             self.total_timeouts,
             int(self.response_quality_ema.value),
         )
 
-    def record_timeout(self, timeout: float) -> None:
+    def record_timeout(self) -> None:
         self.total_msgs += 1
         self.total_timeouts += 1
         self.response_quality_ema.update(0)
         self.items_per_second_ema.update(0)
-        self.round_trip_ema.update(timeout)
 
     def record_response(self,
                         elapsed: float,
@@ -148,7 +156,10 @@ def record_response(self,
 
         self.total_items += num_items
         self.total_response_time += elapsed
+
         self.round_trip_ema.update(elapsed)
+        self.round_trip_99th.update(elapsed)
+        self.round_trip_stddev.update(elapsed)
 
         if elapsed > 0:
             throughput = num_items / elapsed
diff --git a/trinity/sync/full/constants.py b/trinity/sync/full/constants.py
@@ -1,3 +1,4 @@
 # How old (in seconds) must our local head be to cause us to start with a
 # fast-sync before we switch to regular-sync.
 FAST_SYNC_CUTOFF = 60 * 60 * 24
+FAST_SYNC_CUTOFF = 60
diff --git a/trinity/utils/percentile.py b/trinity/utils/percentile.py
@@ -0,0 +1,53 @@
+import bisect
+import collections
+import math
+from typing import List, Union, Deque
+
+
+class Percentile:
+    """
+    Track a specific percentile across a window of recent data.
+
+    https://en.wikipedia.org/wiki/Percentile
+    """
+    def __init__(self, percentile: float, window_size: int) -> None:
+        if percentile < 0 or percentile > 1:
+            raise ValueError("Invalid: percentile must be in the range [0, 1]")
+        self.window: List[Union[int, float]] = []
+        self.history: Deque[Union[int, float]] = collections.deque()
+        self.percentile = percentile
+        self.window_size = window_size
+
+    @property
+    def value(self) -> float:
+        """
+        The current approximation for the tracked percentile.
+        """
+        if not self.window:
+            raise ValueError("No data for percentile calculation")
+
+        idx = (len(self.window) - 1) * self.percentile
+        if int(idx) == idx:
+            return self.window[int(idx)]
+
+        left = int(math.floor(idx))
+        right = int(math.ceil(idx))
+
+        left_part = self.window[int(left)] * (right - idx)
+        right_part = self.window[int(right)] * (idx - left)
+
+        return left_part + right_part
+
+    def update(self, value: Union[int, float]) -> None:
+        bisect.insort(self.window, value)
+        self.history.append(value)
+
+        while len(self.history) > self.window_size:
+            to_discard = self.history.popleft()
+            window_idx = bisect.bisect_left(self.window, to_discard)
+            discarded = self.window.pop(window_idx)
+            if discarded != to_discard:
+                raise ValueError(
+                    "The value popped from the `window` does not match the "
+                    "expected value"
+                )
diff --git a/trinity/utils/stddev.py b/trinity/utils/stddev.py
@@ -0,0 +1,35 @@
+import collections
+import math
+from typing import Union, Deque
+
+
+class StandardDeviation:
+    """
+    https://stackoverflow.com/questions/5543651/computing-standard-deviation-in-a-stream
+
+    Tracks standard deviation on a stream of data.
+    """
+    def __init__(self, window_size: int) -> None:
+        self.window: Deque[Union[int, float]] = collections.deque()
+        self.window_size = window_size
+
+    def update(self, value: Union[int, float]) -> None:
+        self.window.append(value)
+
+        while len(self.window) > self.window_size:
+            self.window.popleft()
+
+    @property
+    def value(self) -> float:
+        num_values = len(self.window)
+
+        if num_values < 2:
+            raise ValueError("No data")
+
+        sum_of_values = sum(self.window)
+        sum_of_squared_values = sum(item * item for item in self.window)
+
+        return math.sqrt(
+            (num_values * sum_of_squared_values - sum_of_values ** 2) /
+            (num_values * (num_values - 1))
+        )