fix(sampling): ensure the rate limiter operates on positive time intervals (#9416)

mabdinur · web-flow · commit b5588d1a89e3 · 2024-06-07T18:23:52.000Z
## Motivation Currently the RateLimiter samples the first span in a trace by taking `Span.start_ns` and evaluating this timestamp against the last seen timestamp. [RateLimiter.is_allowed(...)](https://github.com/DataDog/dd-trace-py/blob/v2.9.0rc7/ddtrace/internal/rate_limiter.py#L60) works as expected if it receives monotonically increasing timestamps. However, if this method receives a timestamp that is less than a previous value, it will compute a [negative time window](https://github.com/DataDog/dd-trace-py/blob/v2.9.0rc7/ddtrace/internal/rate_limiter.py#L126) and then set an [incorrect rate_limit](https://github.com/DataDog/dd-trace-py/blob/v2.9.0rc7/ddtrace/internal/rate_limiter.py#L136). ddtrace v2.8.0 introduced support for lazy sampling. With this feature, sample rates and rate limits are no longer applied on span start. This increased the frequency of this bug: 9707da1. ## Description This PR resolves this issue by: - Deprecating the timestamp argument in `RateLimiter.is_allowed`. The current time will always be used to compute span rate limits (instead of `Span.start_ns`). This will ensure rate limits are computed ONLY on increasing time intervals. - Ensuring a lock is acquired when computing rate limits and updating rate counts. Currently we only acquire a lock to compute `RateLimiter._replenish`. This is not sufficient. ## Reproduction - This bug can be reproduced by generating two spans with different start times but the same end time. The span with the earliest start time should be finished last. 
Failing regression test: https://app.circleci.com/pipelines/github/DataDog/dd-trace-py/62701/workflows/915c8cc5-6968-4069-a379-84929b239df8/jobs/3906251 ## Checklist - [x] Change(s) are motivated and described in the PR description - [x] Testing strategy is described if automated tests are not included in the PR - [x] Risks are described (performance impact, potential for breakage, maintainability) - [x] Change is maintainable (easy to change, telemetry, documentation) - [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed or label `changelog/no-changelog` is set - [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)) - [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) - [x] If this PR changes the public interface, I've notified `@DataDog/apm-tees`. ## Reviewer Checklist - [x] Title is accurate - [x] All changes are related to the pull request's stated goal - [x] Description motivates each change - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - [x] Testing strategy adequately addresses listed risks - [x] Change is maintainable (easy to change, telemetry, documentation) - [x] Release note makes sense to a user of the library - [x] Author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
diff --git a/ddtrace/appsec/_processor.py b/ddtrace/appsec/_processor.py
@@ -374,7 +374,7 @@ def _waf_action(
         if waf_results.data or blocked:
             # We run the rate limiter only if there is an attack, its goal is to limit the number of collected asm
             # events
-            allowed = self._rate_limiter.is_allowed(span.start_ns)
+            allowed = self._rate_limiter.is_allowed()
             if not allowed:
                 # TODO: add metric collection to keep an eye (when it's name is clarified)
                 return waf_results
diff --git a/ddtrace/internal/core/_core.pyi b/ddtrace/internal/core/_core.pyi
@@ -29,12 +29,20 @@ class RateLimiter:
         :param time_window: The time window where the rate limit applies in nanoseconds. default value is 1 second.
         :type time_window: :obj:`float`
         """
-    def is_allowed(self, timestamp_ns: int) -> bool:
+    def is_allowed(self, timestamp_ns: typing.Optional[int] = None) -> bool:
         """
         Check whether the current request is allowed or not
 
         This method will also reduce the number of available tokens by 1
 
+        :param int timestamp_ns: timestamp in nanoseconds for the current request. [deprecated]
+        :returns: Whether the current request is allowed or not
+        :rtype: :obj:`bool`
+        """
+    def _is_allowed(self, timestamp_ns: int) -> bool:
+        """
+        Internal method to check whether the current request is allowed or not
+
         :param int timestamp_ns: timestamp in nanoseconds for the current request.
         :returns: Whether the current request is allowed or not
         :rtype: :obj:`bool`
diff --git a/ddtrace/internal/rate_limiter.py b/ddtrace/internal/rate_limiter.py
@@ -8,6 +8,9 @@
 
 import attr
 
+from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
+from ddtrace.vendor.debtcollector import deprecate
+
 from ..internal import compat
 from ..internal.constants import DEFAULT_SAMPLING_RATE_LIMIT
 from .core import RateLimiter as _RateLimiter
@@ -18,6 +21,17 @@ class RateLimiter(_RateLimiter):
     def _has_been_configured(self):
         return self.rate_limit != DEFAULT_SAMPLING_RATE_LIMIT
 
+    def is_allowed(self, timestamp_ns: Optional[int] = None) -> bool:
+        if timestamp_ns is not None:
+            deprecate(
+                "The `timestamp_ns` parameter is deprecated and will be removed in a future version."
+                "Ratelimiter will use the current time.",
+                category=DDTraceDeprecationWarning,
+            )
+        # rate limits are tested and mocked in pytest so we need to compute the timestamp here
+        # (or move the unit tests to rust)
+        return self._is_allowed(compat.monotonic_ns())
+
 
 class RateLimitExceeded(Exception):
     pass
diff --git a/ddtrace/internal/sampling.py b/ddtrace/internal/sampling.py
@@ -147,7 +147,7 @@ def __init__(
     def sample(self, span):
         # type: (Span) -> bool
         if self._sample(span):
-            if self._limiter.is_allowed(span.start_ns):
+            if self._limiter.is_allowed():
                 self.apply_span_sampling_tags(span)
                 return True
         return False
@@ -310,7 +310,7 @@ def _apply_rate_limit(span, sampled, limiter):
     # type: (Span, bool, RateLimiter) -> bool
     allowed = True
     if sampled:
-        allowed = limiter.is_allowed(span.start_ns)
+        allowed = limiter.is_allowed()
         if not allowed:
             _set_priority(span, USER_REJECT)
     if limiter._has_been_configured:
diff --git a/releasenotes/notes/rate-limiting-fix-06e1952610b246f1.yaml b/releasenotes/notes/rate-limiting-fix-06e1952610b246f1.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    tracing: Ensures spans are rate limited at the expected rate (100 spans per second by default). Previously, long-running spans could cause the rate limiter to compute an invalid time window, which could cause the next trace to be dropped.
diff --git a/src/core/rate_limiter.rs b/src/core/rate_limiter.rs
@@ -31,7 +31,7 @@ impl RateLimiter {
         }
     }
 
-    pub fn is_allowed(&mut self, timestamp_ns: f64) -> bool {
+    pub fn _is_allowed(&mut self, timestamp_ns: f64) -> bool {
         let mut _lock = self._lock.lock().unwrap();
 
         let allowed = (|| -> bool {
@@ -43,7 +43,11 @@ impl RateLimiter {
             }
 
             if self.tokens < self.max_tokens {
-                let elapsed: f64 = (timestamp_ns - self.last_update_ns) / self.time_window;
+                let mut elapsed: f64 = (timestamp_ns - self.last_update_ns) / self.time_window;
+                if elapsed < 0.0 {
+                    // Note - this should never happen, but if it does, we should reset the elapsed time to avoid negative tokens.
+                    elapsed = 0.0
+                }
                 self.tokens += elapsed * self.max_tokens;
                 if self.tokens > self.max_tokens {
                     self.tokens = self.max_tokens;
@@ -114,8 +118,8 @@ impl RateLimiterPy {
         }
     }
 
-    pub fn is_allowed(&mut self, py: Python<'_>, timestamp_ns: f64) -> bool {
-        py.allow_threads(|| self.rate_limiter.is_allowed(timestamp_ns))
+    pub fn _is_allowed(&mut self, py: Python<'_>, timestamp_ns: f64) -> bool {
+        py.allow_threads(|| self.rate_limiter._is_allowed(timestamp_ns))
     }
 
     #[getter]
diff --git a/tests/integration/test_sampling.py b/tests/integration/test_sampling.py
@@ -1,3 +1,4 @@
+import mock
 import pytest
 
 from ddtrace import config
@@ -297,3 +298,48 @@ def test_extended_sampling_float_special_case_match_star(writer, tracer):
     tracer.configure(sampler=sampler, writer=writer)
     with tracer.trace(name="should_send") as span:
         span.set_tag("tag", 20.1)
+
+
+def test_rate_limiter_on_spans(tracer):
+    """
+    Ensure that the rate limiter is applied to spans
+    """
+    tracer.configure(sampler=DatadogSampler(rate_limit=10))
+    spans = []
+    # Generate 10 spans with the start and finish time in same second
+    for x in range(10):
+        start_time = x / 10
+        span = tracer.trace(name=f"span {start_time}")
+        span.start = start_time
+        span.finish(1 - start_time)
+        spans.append(span)
+    # Generate 11th span in the same second
+    dropped_span = tracer.trace(name=f"span {start_time}")
+    dropped_span.start = 0.8
+    dropped_span.finish(0.9)
+    # Spans are sampled on flush
+    tracer.flush()
+    # Since the rate limiter is set to 10, first ten spans should be kept
+    for span in spans:
+        assert span.context.sampling_priority > 0
+    # 11th span should be dropped
+    assert dropped_span.context.sampling_priority < 0
+
+
+def test_rate_limiter_on_long_running_spans(tracer):
+    """
+    Ensure that the rate limiter is applied on increasing time intervals
+    """
+    tracer.configure(sampler=DatadogSampler(rate_limit=5))
+
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=1617333414):
+        span_m30 = tracer.trace(name="march 30")
+        span_m30.start = 1622347257  # Mar 30 2021
+        span_m30.finish(1617333414)  # April 2 2021
+
+        span_m29 = tracer.trace(name="march 29")
+        span_m29.start = 1616999414  # Mar 29 2021
+        span_m29.finish(1617333414)  # April 2 2021
+
+    assert span_m29.context.sampling_priority > 0
+    assert span_m30.context.sampling_priority > 0
diff --git a/tests/tracer/test_rate_limiter.py b/tests/tracer/test_rate_limiter.py
@@ -33,7 +33,8 @@ def test_rate_limiter_rate_limit_0(time_window):
     now_ns = compat.monotonic_ns()
     for i in nanoseconds(10000, time_window):
         # Make sure the time is different for every check
-        assert limiter.is_allowed(now_ns + i) is False
+        with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=now_ns + i):
+            assert limiter.is_allowed() is False
 
 
 @pytest.mark.parametrize("time_window", [1e3, 1e6, 1e9])
@@ -46,30 +47,32 @@ def test_rate_limiter_rate_limit_negative(time_window):
     now_ns = compat.monotonic_ns()
     for i in nanoseconds(10000, time_window):
         # Make sure the time is different for every check
-        assert limiter.is_allowed(now_ns + i) is True
+        with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=now_ns + i):
+            assert limiter.is_allowed() is True
 
 
 @pytest.mark.parametrize("rate_limit", [1, 10, 50, 100, 500, 1000])
 @pytest.mark.parametrize("time_window", [1e3, 1e6, 1e9])
 def test_rate_limiter_is_allowed(rate_limit, time_window):
     limiter = RateLimiter(rate_limit=rate_limit, time_window=time_window)
 
-    def check_limit(time_ns):
+    def check_limit():
         # Up to the allowed limit is allowed
         for _ in range(rate_limit):
-            assert limiter.is_allowed(time_ns) is True
+            assert limiter.is_allowed() is True
 
         # Any over the limit is disallowed
         for _ in range(1000):
-            assert limiter.is_allowed(time_ns) is False
+            assert limiter.is_allowed() is False
 
     # Start time
     now = compat.monotonic_ns()
 
     # Check the limit for 5 time frames
     for i in nanoseconds(5, time_window):
         # Keep the same timeframe
-        check_limit(now + i)
+        with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=now + i):
+            check_limit()
 
 
 @pytest.mark.parametrize("time_window", [1e3, 1e6, 1e9])
@@ -79,12 +82,14 @@ def test_rate_limiter_is_allowed_large_gap(time_window):
     # Start time
     now_ns = compat.monotonic_ns()
     # Keep the same timeframe
-    for _ in range(100):
-        assert limiter.is_allowed(now_ns) is True
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=now_ns):
+        for _ in range(100):
+            assert limiter.is_allowed() is True
 
     # Large gap before next call to `is_allowed()`
-    for _ in range(100):
-        assert limiter.is_allowed(now_ns + (time_window * 100)) is True
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=now_ns + (time_window * 100)):
+        for _ in range(100):
+            assert limiter.is_allowed() is True
 
 
 @pytest.mark.parametrize("time_window", [1e3, 1e6, 1e9])
@@ -98,8 +103,8 @@ def test_rate_limiter_is_allowed_small_gaps(time_window):
     for i in nanoseconds(10000, time_window):
         # Keep the same timeframe
         time_ns = now_ns + (gap * i)
-
-        assert limiter.is_allowed(time_ns) is True
+        with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=time_ns):
+            assert limiter.is_allowed() is True
 
 
 @pytest.mark.parametrize("time_window", [1e3, 1e6, 1e9])
@@ -108,30 +113,31 @@ def test_rate_liimter_effective_rate_rates(time_window):
 
     # Static rate limit window
     starting_window_ns = compat.monotonic_ns()
-    for _ in range(100):
-        assert limiter.is_allowed(starting_window_ns) is True
-        assert limiter.effective_rate == 1.0
-        assert limiter.current_window_ns == starting_window_ns
-
-    for i in range(1, 101):
-        assert limiter.is_allowed(starting_window_ns) is False
-        rate = 100 / (100 + i)
-        assert limiter.effective_rate == rate
-        assert limiter.current_window_ns == starting_window_ns
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=starting_window_ns):
+        for _ in range(100):
+            assert limiter.is_allowed() is True
+            assert limiter.effective_rate == 1.0
+            assert limiter.current_window_ns == starting_window_ns
+
+        for i in range(1, 101):
+            assert limiter.is_allowed() is False
+            rate = 100 / (100 + i)
+            assert limiter.effective_rate == rate
+            assert limiter.current_window_ns == starting_window_ns
 
     prev_rate = 0.5
     window_ns = starting_window_ns + time_window
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=window_ns):
+        for _ in range(100):
+            assert limiter.is_allowed() is True
+            assert limiter.effective_rate == 0.75
+            assert limiter.current_window_ns == window_ns
 
-    for _ in range(100):
-        assert limiter.is_allowed(window_ns) is True
-        assert limiter.effective_rate == 0.75
-        assert limiter.current_window_ns == window_ns
-
-    for i in range(1, 101):
-        assert limiter.is_allowed(window_ns) is False
-        rate = 100 / (100 + i)
-        assert limiter.effective_rate == (rate + prev_rate) / 2
-        assert limiter.current_window_ns == window_ns
+        for i in range(1, 101):
+            assert limiter.is_allowed() is False
+            rate = 100 / (100 + i)
+            assert limiter.effective_rate == (rate + prev_rate) / 2
+            assert limiter.current_window_ns == window_ns
 
 
 @pytest.mark.parametrize("time_window", [1e3, 1e6, 1e9])
@@ -150,47 +156,51 @@ def test_rate_limiter_effective_rate_starting_rate(time_window):
     assert limiter.prev_window_rate is None
 
     # Calling `.is_allowed()` updates the values
-    assert limiter.is_allowed(now_ns) is True
-    assert limiter.effective_rate == 1.0
-    assert limiter.current_window_ns == now_ns
-    assert limiter.prev_window_rate is None
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=now_ns):
+        assert limiter.is_allowed() is True
+        assert limiter.effective_rate == 1.0
+        assert limiter.current_window_ns == now_ns
+        assert limiter.prev_window_rate is None
 
     # Gap of 0.9999 seconds, same window
     time_ns = now_ns + (0.9999 * time_window)
-    assert limiter.is_allowed(time_ns) is False
-    # DEV: We have rate_limit=1 set
-    assert limiter.effective_rate == 0.5
-    assert limiter.current_window_ns == now_ns
-    assert limiter.prev_window_rate is None
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=time_ns):
+        assert limiter.is_allowed() is False
+        # DEV: We have rate_limit=1 set
+        assert limiter.effective_rate == 0.5
+        assert limiter.current_window_ns == now_ns
+        assert limiter.prev_window_rate is None
 
     # Gap of 1.0 seconds, new window
     time_ns = now_ns + time_window
-    assert limiter.is_allowed(time_ns) is True
-    assert limiter.effective_rate == 0.75
-    assert limiter.current_window_ns == (now_ns + time_window)
-    assert limiter.prev_window_rate == 0.5
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=time_ns):
+        assert limiter.is_allowed() is True
+        assert limiter.effective_rate == 0.75
+        assert limiter.current_window_ns == (now_ns + time_window)
+        assert limiter.prev_window_rate == 0.5
 
     # Gap of 1.9999 seconds, same window
     time_ns = now_ns + (1.9999 * time_window)
-    assert limiter.is_allowed(time_ns) is False
-    assert limiter.effective_rate == 0.5
-    assert limiter.current_window_ns == (now_ns + time_window)  # Same as old window
-    assert limiter.prev_window_rate == 0.5
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=time_ns):
+        assert limiter.is_allowed() is False
+        assert limiter.effective_rate == 0.5
+        assert limiter.current_window_ns == (now_ns + time_window)  # Same as old window
+        assert limiter.prev_window_rate == 0.5
 
     # Large gap of 100 seconds, new window
     time_ns = now_ns + (100.0 * time_window)
-    assert limiter.is_allowed(time_ns) is True
-    assert limiter.effective_rate == 0.75
-    assert limiter.current_window_ns == (now_ns + (100.0 * time_window))
-    assert limiter.prev_window_rate == 0.5
+    with mock.patch("ddtrace.internal.rate_limiter.compat.monotonic_ns", return_value=time_ns):
+        assert limiter.is_allowed() is True
+        assert limiter.effective_rate == 0.75
+        assert limiter.current_window_ns == (now_ns + (100.0 * time_window))
+        assert limiter.prev_window_rate == 0.5
 
 
 def test_rate_limiter_3():
     limiter = RateLimiter(rate_limit=2)
 
-    now_ns = compat.monotonic_ns()
     for i in range(3):
-        decision = limiter.is_allowed(now_ns)
+        decision = limiter.is_allowed()
         # the first two should be allowed, the third should not
         if i < 2:
             assert decision is True
diff --git a/tests/tracer/test_single_span_sampling_rules.py b/tests/tracer/test_single_span_sampling_rules.py
@@ -336,7 +336,7 @@ def test_max_per_sec_with_is_allowed_check():
     tracer = DummyTracer(rule)
     while True:
         span = traced_function(rule, tracer)
-        if not rule._limiter.is_allowed(span.start_ns):
+        if not rule._limiter.is_allowed():
             break
         assert_sampling_decision_tags(span, limit=2)
 

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +---
 +fixes:
 +  - |
 +    tracing: Ensures spans are rate limited at the expected rate (100 spans per second by default). Previously, long-running spans could cause the rate limiter to compute an invalid time window, which could cause the next trace to be dropped.
Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ impl RateLimiter {`
`31`	`31`	`}`
`32`	`32`	`}`
`33`	`33`
`34`		`- pub fn is_allowed(&mut self, timestamp_ns: f64) -> bool {`
	`34`	`+ pub fn _is_allowed(&mut self, timestamp_ns: f64) -> bool {`
`35`	`35`	`let mut _lock = self._lock.lock().unwrap();`
`36`	`36`
`37`	`37`	`let allowed = (\|\| -> bool {`
`@@ -43,7 +43,11 @@ impl RateLimiter {`
`43`	`43`	`}`
`44`	`44`
`45`	`45`	`if self.tokens < self.max_tokens {`
`46`		`- let elapsed: f64 = (timestamp_ns - self.last_update_ns) / self.time_window;`
	`46`	`+ let mut elapsed: f64 = (timestamp_ns - self.last_update_ns) / self.time_window;`
	`47`	`+ if elapsed < 0.0 {`
	`48`	`+ // Note - this should never happen, but if it does, we should reset the elapsed time to avoid negative tokens.`
	`49`	`+ elapsed = 0.0`
	`50`	`+ }`
`47`	`51`	`self.tokens += elapsed * self.max_tokens;`
`48`	`52`	`if self.tokens > self.max_tokens {`
`49`	`53`	`self.tokens = self.max_tokens;`
`@@ -114,8 +118,8 @@ impl RateLimiterPy {`
`114`	`118`	`}`
`115`	`119`	`}`
`116`	`120`
`117`		`- pub fn is_allowed(&mut self, py: Python<'_>, timestamp_ns: f64) -> bool {`
`118`		`- py.allow_threads(\|\| self.rate_limiter.is_allowed(timestamp_ns))`
	`121`	`+ pub fn _is_allowed(&mut self, py: Python<'_>, timestamp_ns: f64) -> bool {`
	`122`	`+ py.allow_threads(\|\| self.rate_limiter._is_allowed(timestamp_ns))`
`119`	`123`	`}`
`120`	`124`
`121`	`125`	`#[getter]`