Skip to content

Commit dfa2b5b

Browse files
authored
fix(tracing): resolve concurrency bug in health metrics (backport #7413 to 1.20) (#7640)
Backports: #7413 ## Checklist - [x] Change(s) are motivated and described in the PR description. - [x] Testing strategy is described if automated tests are not included in the PR. - [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`. - [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)). - [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Title is accurate. - [x] No unnecessary changes are introduced. - [x] Description motivates each change. - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [x] Testing strategy adequately addresses listed risk(s). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] Release note makes sense to a user of the library. - [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment. - [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) - [x] If this PR touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`. - [ ] This PR doesn't touch any of that.
1 parent 77c6566 commit dfa2b5b

File tree

5 files changed

+173
-168
lines changed

5 files changed

+173
-168
lines changed

ddtrace/internal/writer/writer.py

Lines changed: 26 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,9 @@
4444

4545

4646
if TYPE_CHECKING: # pragma: no cover
47-
from typing import Tuple
48-
4947
from ddtrace import Span
5048

51-
from .agent import ConnectionType
49+
from ..agent import ConnectionType
5250

5351

5452
log = get_logger(__name__)
@@ -178,7 +176,7 @@ def __init__(
178176

179177
self._clients = clients
180178
self.dogstatsd = dogstatsd
181-
self._metrics_reset()
179+
self._metrics = defaultdict(int) # type: Dict[str, int]
182180
self._drop_sma = SimpleMovingAverage(DEFAULT_SMA_WINDOW)
183181
self._sync_mode = sync_mode
184182
self._conn = None # type: Optional[ConnectionType]
@@ -213,36 +211,22 @@ def _intake_url(self, client=None):
213211
return client._intake_url
214212
return self.intake_url
215213

216-
def _metrics_dist(self, name, count=1, tags=tuple()):
217-
# type: (str, int, Tuple) -> None
218-
if tags in self._metrics[name]:
219-
self._metrics[name][tags] += count
220-
else:
221-
self._metrics[name][tags] = count
222-
223-
def _metrics_reset(self):
224-
# type: () -> None
225-
self._metrics = defaultdict(dict) # type: Dict[str, Dict[Tuple[str,...], int]]
214+
def _metrics_dist(self, name, count=1, tags=None):
215+
# type: (str, int, Optional[List]) -> None
216+
if config.health_metrics_enabled and self.dogstatsd:
217+
self.dogstatsd.distribution("datadog.%s.%s" % (self.STATSD_NAMESPACE, name), count, tags=tags)
226218

227219
def _set_drop_rate(self):
228-
dropped = sum(
229-
counts
230-
for metric in ("encoder.dropped.traces", "buffer.dropped.traces", "http.dropped.traces")
231-
for _tags, counts in self._metrics[metric].items()
232-
)
233-
accepted = sum(counts for _tags, counts in self._metrics["writer.accepted.traces"].items())
234-
235-
if dropped > accepted:
236-
# Sanity check, we cannot drop more traces than we accepted.
237-
log.debug(
238-
"dropped.traces metric is greater than accepted.traces metric"
239-
"This difference may be reconciled in future metric uploads (dropped.traces: %d, accepted.traces: %d)",
240-
dropped,
241-
accepted,
242-
)
243-
accepted = dropped
244-
220+
# type: () -> None
221+
accepted = self._metrics["accepted_traces"]
222+
sent = self._metrics["sent_traces"]
223+
encoded = sum([len(client.encoder) for client in self._clients])
224+
# The number of dropped traces is the number of accepted traces minus the number of sent traces and the traces still in the encoder
225+
# This calculation is a best effort. Due to race conditions it may result in a slight underestimate.
226+
dropped = max(accepted - sent - encoded, 0)  # dropped traces should never be negative
245227
self._drop_sma.set(dropped, accepted)
228+
self._metrics["sent_traces"] = 0 # reset sent traces for the next interval
229+
self._metrics["accepted_traces"] = encoded # sets accepted traces to number of spans in encoders
246230

247231
def _set_keep_rate(self, trace):
248232
if trace:
@@ -307,9 +291,10 @@ def _send_payload(self, payload, count, client):
307291
response = self._put(payload, headers, client, no_trace=True)
308292

309293
if response.status >= 400:
310-
self._metrics_dist("http.errors", tags=("type:%s" % response.status,))
294+
self._metrics_dist("http.errors", tags=["type:%s" % response.status])
311295
else:
312296
self._metrics_dist("http.sent.bytes", len(payload))
297+
self._metrics["sent_traces"] += count
313298

314299
if response.status not in (404, 415) and response.status >= 400:
315300
msg = "failed to send traces to intake at %s: HTTP error status %s, reason %s"
@@ -353,6 +338,7 @@ def _write_with_client(self, client, spans=None):
353338
pass
354339

355340
self._metrics_dist("writer.accepted.traces")
341+
self._metrics["accepted_traces"] += 1
356342
self._set_keep_rate(spans)
357343

358344
try:
@@ -364,8 +350,8 @@ def _write_with_client(self, client, spans=None):
364350
payload_size,
365351
client.encoder.max_item_size,
366352
)
367-
self._metrics_dist("buffer.dropped.traces", 1, tags=("reason:t_too_big",))
368-
self._metrics_dist("buffer.dropped.bytes", payload_size, tags=("reason:t_too_big",))
353+
self._metrics_dist("buffer.dropped.traces", 1, tags=["reason:t_too_big"])
354+
self._metrics_dist("buffer.dropped.bytes", payload_size, tags=["reason:t_too_big"])
369355
except BufferFull as e:
370356
payload_size = e.args[0]
371357
log.warning(
@@ -376,10 +362,10 @@ def _write_with_client(self, client, spans=None):
376362
payload_size,
377363
self.status.value,
378364
)
379-
self._metrics_dist("buffer.dropped.traces", 1, tags=("reason:full",))
380-
self._metrics_dist("buffer.dropped.bytes", payload_size, tags=("reason:full",))
365+
self._metrics_dist("buffer.dropped.traces", 1, tags=["reason:full"])
366+
self._metrics_dist("buffer.dropped.bytes", payload_size, tags=["reason:full"])
381367
except NoEncodableSpansError:
382-
self._metrics_dist("buffer.dropped.traces", 1, tags=("reason:incompatible",))
368+
self._metrics_dist("buffer.dropped.traces", 1, tags=["reason:incompatible"])
383369
else:
384370
self._metrics_dist("buffer.accepted.traces", 1)
385371
self._metrics_dist("buffer.accepted.spans", len(spans))
@@ -390,7 +376,6 @@ def flush_queue(self, raise_exc=False):
390376
self._flush_queue_with_client(client, raise_exc=raise_exc)
391377
finally:
392378
self._set_drop_rate()
393-
self._metrics_reset()
394379

395380
def _flush_queue_with_client(self, client, raise_exc=False):
396381
# type: (WriterClientBase, bool) -> None
@@ -411,7 +396,7 @@ def _flush_queue_with_client(self, client, raise_exc=False):
411396
try:
412397
self._send_payload_with_backoff(encoded, n_traces, client)
413398
except Exception:
414-
self._metrics_dist("http.errors", tags=("type:err",))
399+
self._metrics_dist("http.errors", tags=["type:err"])
415400
self._metrics_dist("http.dropped.bytes", len(encoded))
416401
self._metrics_dist("http.dropped.traces", n_traces)
417402
if raise_exc:
@@ -424,17 +409,8 @@ def _flush_queue_with_client(self, client, raise_exc=False):
424409
self.RETRY_ATTEMPTS,
425410
)
426411
finally:
427-
if config.health_metrics_enabled and self.dogstatsd:
428-
namespace = self.STATSD_NAMESPACE
429-
# Note that we cannot use the batching functionality of dogstatsd because
430-
# it's not thread-safe.
431-
# https://github.com/DataDog/datadogpy/issues/439
432-
# This really isn't ideal as now we're going to do a ton of socket calls.
433-
self.dogstatsd.distribution("datadog.%s.http.sent.bytes" % namespace, len(encoded))
434-
self.dogstatsd.distribution("datadog.%s.http.sent.traces" % namespace, n_traces)
435-
for name, metric_tags in self._metrics.items():
436-
for tags, count in metric_tags.items():
437-
self.dogstatsd.distribution("datadog.%s.%s" % (namespace, name), count, tags=list(tags))
412+
self._metrics_dist("http.sent.bytes", len(encoded))
413+
self._metrics_dist("http.sent.traces", n_traces)
438414

439415
def periodic(self):
440416
self.flush_queue(raise_exc=False)

docker-compose.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ services:
141141
- DD_POOL_TRACE_CHECK_FAILURES=true
142142
- DD_DISABLE_ERROR_RESPONSES=true
143143
- ENABLED_CHECKS=trace_content_length,trace_stall,meta_tracer_version_header,trace_count_header,trace_peer_service,trace_dd_service
144+
- SNAPSHOT_IGNORED_ATTRS=span_id,trace_id,parent_id,duration,start,metrics.system.pid,metrics.system.process_id,metrics.process_id,meta.runtime-id,meta._dd.p.tid,meta.pathway.hash,metrics._dd.tracer_kr
144145
vertica:
145146
image: sumitchawla/vertica
146147
environment:
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
tracing: Fixes an issue where the thread responsible for sending traces is killed due to concurrent dictionary modification.

tests/integration/test_integration.py

Lines changed: 62 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from tests.integration.utils import send_invalid_payload_and_get_logs
1919
from tests.integration.utils import skip_if_testagent
2020
from tests.utils import call_program
21-
from tests.utils import override_global_config
2221

2322

2423
FOUR_KB = 1 << 12
@@ -235,79 +234,89 @@ def test_child_spans_do_not_cause_warning_logs():
235234
log.error.assert_not_called()
236235

237236

238-
def _test_metrics(
239-
tracer,
240-
http_sent_traces=-1,
241-
writer_accepted_traces=-1,
242-
buffer_accepted_traces=-1,
243-
buffer_accepted_spans=-1,
244-
http_requests=-1,
245-
http_sent_bytes=-1,
246-
):
237+
@parametrize_with_all_encodings(env={"DD_TRACE_HEALTH_METRICS_ENABLED": "true"})
238+
def test_metrics():
239+
import mock
240+
241+
from ddtrace import tracer as t
242+
from tests.utils import AnyInt
243+
from tests.utils import override_global_config
244+
245+
assert t._partial_flush_min_spans == 500
246+
247247
with override_global_config(dict(health_metrics_enabled=True)):
248248
statsd_mock = mock.Mock()
249-
tracer._writer.dogstatsd = statsd_mock
249+
t._writer.dogstatsd = statsd_mock
250250
with mock.patch("ddtrace.internal.writer.writer.log") as log:
251-
for _ in range(5):
251+
for _ in range(2):
252252
spans = []
253-
for _ in range(3000):
254-
spans.append(tracer.trace("op"))
253+
for _ in range(600):
254+
spans.append(t.trace("op"))
255255
for s in spans:
256256
s.finish()
257257

258-
tracer.shutdown()
258+
t.shutdown()
259259
log.warning.assert_not_called()
260260
log.error.assert_not_called()
261261

262-
for metric_name, metric_value, check_tags in (
263-
("datadog.tracer.http.sent.traces", http_sent_traces, False),
264-
("datadog.tracer.writer.accepted.traces", writer_accepted_traces, True),
265-
("datadog.tracer.buffer.accepted.traces", buffer_accepted_traces, True),
266-
("datadog.tracer.buffer.accepted.spans", buffer_accepted_spans, True),
267-
("datadog.tracer.http.requests", http_requests, True),
268-
("datadog.tracer.http.sent.bytes", http_sent_bytes, True),
269-
):
270-
if metric_value != -1:
271-
kwargs = {"tags": []} if check_tags else {}
272-
statsd_mock.distribution.assert_has_calls(
273-
[mock.call(metric_name, metric_value, **kwargs)], any_order=True
274-
)
275-
276-
277-
@parametrize_with_all_encodings(env={"DD_TRACE_HEALTH_METRICS_ENABLED": "true"})
278-
def test_metrics():
279-
from ddtrace import tracer as t
280-
from tests.integration.test_integration import _test_metrics
281-
from tests.utils import AnyInt
282-
283-
assert t._partial_flush_min_spans == 500
284-
_test_metrics(
285-
t,
286-
http_sent_bytes=AnyInt(),
287-
http_sent_traces=30,
288-
writer_accepted_traces=30,
289-
buffer_accepted_traces=30,
290-
buffer_accepted_spans=15000,
291-
http_requests=1,
262+
statsd_mock.distribution.assert_has_calls(
263+
[
264+
mock.call("datadog.tracer.writer.accepted.traces", 1, tags=None),
265+
mock.call("datadog.tracer.buffer.accepted.traces", 1, tags=None),
266+
mock.call("datadog.tracer.buffer.accepted.spans", 500, tags=None),
267+
mock.call("datadog.tracer.buffer.accepted.spans", 100, tags=None),
268+
mock.call("datadog.tracer.buffer.accepted.spans", 500, tags=None),
269+
mock.call("datadog.tracer.buffer.accepted.spans", 100, tags=None),
270+
mock.call("datadog.tracer.http.requests", 1, tags=None),
271+
mock.call("datadog.tracer.http.sent.bytes", AnyInt(), tags=None),
272+
mock.call("datadog.tracer.http.sent.bytes", AnyInt(), tags=None),
273+
mock.call("datadog.tracer.http.sent.traces", 4, tags=None),
274+
],
275+
any_order=True,
292276
)
293277

294278

295-
@skip_if_testagent
296279
@parametrize_with_all_encodings(env={"DD_TRACE_HEALTH_METRICS_ENABLED": "true"})
297280
def test_metrics_partial_flush_disabled():
281+
import mock
282+
298283
from ddtrace import tracer as t
299-
from tests.integration.test_integration import _test_metrics
300284
from tests.utils import AnyInt
285+
from tests.utils import override_global_config
301286

302287
t.configure(
303288
partial_flush_enabled=False,
304289
)
305-
_test_metrics(
306-
t,
307-
http_sent_bytes=AnyInt(),
308-
buffer_accepted_traces=5,
309-
buffer_accepted_spans=15000,
310-
http_requests=1,
290+
291+
with override_global_config(dict(health_metrics_enabled=True)):
292+
statsd_mock = mock.Mock()
293+
t._writer.dogstatsd = statsd_mock
294+
with mock.patch("ddtrace.internal.writer.writer.log") as log:
295+
for _ in range(2):
296+
spans = []
297+
for _ in range(600):
298+
spans.append(t.trace("op"))
299+
for s in spans:
300+
s.finish()
301+
302+
t.shutdown()
303+
log.warning.assert_not_called()
304+
log.error.assert_not_called()
305+
306+
statsd_mock.distribution.assert_has_calls(
307+
[
308+
mock.call("datadog.tracer.writer.accepted.traces", 1, tags=None),
309+
mock.call("datadog.tracer.buffer.accepted.traces", 1, tags=None),
310+
mock.call("datadog.tracer.buffer.accepted.spans", 600, tags=None),
311+
mock.call("datadog.tracer.writer.accepted.traces", 1, tags=None),
312+
mock.call("datadog.tracer.buffer.accepted.traces", 1, tags=None),
313+
mock.call("datadog.tracer.buffer.accepted.spans", 600, tags=None),
314+
mock.call("datadog.tracer.http.requests", 1, tags=None),
315+
mock.call("datadog.tracer.http.sent.bytes", AnyInt(), tags=None),
316+
mock.call("datadog.tracer.http.sent.bytes", AnyInt(), tags=None),
317+
mock.call("datadog.tracer.http.sent.traces", 2, tags=None),
318+
],
319+
any_order=True,
311320
)
312321

313322

0 commit comments

Comments (0)