fix(tracing): resolves issues encoding and sending large trace payloads [backport 2.0] (#7162)

github-actions[bot] · mabdinur · web-flow · commit c242dfbe6a7d · 2023-10-04T18:28:55.000Z
Backport 4646688 from #6943 to 2.0. Increases the default maximum payload size and buffer size from 8MB to 20MB. Also lowers the default ``DD_TRACE_PARTIAL_FLUSH_MIN_SPANS`` from 500 to 300, which decreases the maximum number of spans in trace chunks when ``DD_TRACE_PARTIAL_FLUSH_ENABLED=True``. This ensures large traces are correctly encoded and submitted. This should decrease the occurrence of "failed to send traces" error logs. ## Checklist - [x] Change(s) are motivated and described in the PR description. - [x] Testing strategy is described if automated tests are not included in the PR. - [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`. - [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)). - [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Title is accurate. - [x] No unnecessary changes are introduced. - [x] Description motivates each change. - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [x] Testing strategy adequately addresses listed risk(s). - [x] Change is maintainable (easy to change, telemetry, documentation). - [x] Release note makes sense to a user of the library. - [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment. 
- [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) - [x] If this PR touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`. - [x] This PR doesn't touch any of that. Co-authored-by: Munir Abdinur <munir.abdinur@datadoghq.com>
diff --git a/ddtrace/internal/constants.py b/ddtrace/internal/constants.py
@@ -31,8 +31,8 @@
 SPAN_API_DATADOG = "datadog"
 SPAN_API_OTEL = "otel"
 SPAN_API_OPENTRACING = "opentracing"
-DEFAULT_BUFFER_SIZE = 8 << 20  # 8 MB
-DEFAULT_MAX_PAYLOAD_SIZE = 8 << 20  # 8 MB
+DEFAULT_BUFFER_SIZE = 20 << 20  # 20 MB
+DEFAULT_MAX_PAYLOAD_SIZE = 20 << 20  # 20 MB
 DEFAULT_PROCESSING_INTERVAL = 1.0
 DEFAULT_REUSE_CONNECTIONS = False
 BLOCKED_RESPONSE_HTML = """
diff --git a/ddtrace/settings/config.py b/ddtrace/settings/config.py
@@ -228,7 +228,7 @@ def __init__(self):
         self._trace_rate_limit = int(os.getenv("DD_TRACE_RATE_LIMIT", default=DEFAULT_SAMPLING_RATE_LIMIT))
         self._trace_sampling_rules = os.getenv("DD_TRACE_SAMPLING_RULES")
         self._partial_flush_enabled = asbool(os.getenv("DD_TRACE_PARTIAL_FLUSH_ENABLED", default=True))
-        self._partial_flush_min_spans = int(os.getenv("DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", default=500))
+        self._partial_flush_min_spans = int(os.getenv("DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", default=300))
         self._priority_sampling = asbool(os.getenv("DD_PRIORITY_SAMPLING", default=True))
 
         header_tags = parse_tags_str(os.getenv("DD_TRACE_HEADER_TAGS", ""))
diff --git a/releasenotes/notes/fix-trace-buffer-size-issues-807e63bfd8a3c57f.yaml b/releasenotes/notes/fix-trace-buffer-size-issues-807e63bfd8a3c57f.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    tracing: Increases the default maximum payload size and buffer size from 8MB to 20MB. Also lowers the default ``DD_TRACE_PARTIAL_FLUSH_MIN_SPANS`` from 500 to 300, which decreases the maximum number of spans in trace chunks when ``DD_TRACE_PARTIAL_FLUSH_ENABLED=True``. This ensures large traces are correctly encoded and submitted.
+    This should decrease the occurrence of "failed to send traces" error logs.
diff --git a/tests/commands/test_runner.py b/tests/commands/test_runner.py
@@ -361,7 +361,7 @@ def test_info_no_configs():
     Health metrics enabled: False
     Priority sampling enabled: True
     Partial flushing enabled: True
-    Partial flush minimum number of spans: 500
+    Partial flush minimum number of spans: 300
     WAF timeout: 5.0 msecs
     \x1b[92m\x1b[1mTagging:\x1b[0m
     DD Service: None
diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py
@@ -280,13 +280,13 @@ def test_metrics():
     from tests.integration.test_integration import _test_metrics
     from tests.utils import AnyInt
 
-    assert t._partial_flush_min_spans == 500
+    assert t._partial_flush_min_spans == 300
     _test_metrics(
         t,
         http_sent_bytes=AnyInt(),
-        http_sent_traces=30,
-        writer_accepted_traces=30,
-        buffer_accepted_traces=30,
+        http_sent_traces=50,
+        writer_accepted_traces=50,
+        buffer_accepted_traces=50,
         buffer_accepted_spans=15000,
         http_requests=1,
     )
@@ -346,7 +346,9 @@ def test_single_trace_too_large():
 
 
 @skip_if_testagent
-@parametrize_with_all_encodings(env={"DD_TRACE_PARTIAL_FLUSH_ENABLED": "false"})
+@parametrize_with_all_encodings(
+    env={"DD_TRACE_PARTIAL_FLUSH_ENABLED": "false", "DD_TRACE_WRITER_BUFFER_SIZE_BYTES": str(8 << 20)}
+)
 def test_single_trace_too_large_partial_flush_disabled():
     import mock
 
@@ -647,8 +649,8 @@ def test_writer_configured_correctly_from_env():
 def test_writer_configured_correctly_from_env_defaults():
     import ddtrace
 
-    assert ddtrace.tracer._writer._encoder.max_size == 8 << 20
-    assert ddtrace.tracer._writer._encoder.max_item_size == 8 << 20
+    assert ddtrace.tracer._writer._encoder.max_size == 20 << 20
+    assert ddtrace.tracer._writer._encoder.max_item_size == 20 << 20
     assert ddtrace.tracer._writer._interval == 1.0
 
 
@@ -676,8 +678,8 @@ def test_writer_configured_correctly_from_env_defaults_under_ddtrace_run(ddtrace
         """
 import ddtrace
 
-assert ddtrace.tracer._writer._encoder.max_size == 8 << 20
-assert ddtrace.tracer._writer._encoder.max_item_size == 8 << 20
+assert ddtrace.tracer._writer._encoder.max_size == 20 << 20
+assert ddtrace.tracer._writer._encoder.max_item_size == 20 << 20
 assert ddtrace.tracer._writer._interval == 1.0
 """,
     )
diff --git a/tests/integration/test_integration_snapshots.py b/tests/integration/test_integration_snapshots.py
@@ -14,6 +14,7 @@
 from ddtrace.internal.writer import AgentWriter
 from tests.integration.utils import mark_snapshot
 from tests.integration.utils import parametrize_with_all_encodings
+from tests.utils import override_global_config
 from tests.utils import snapshot
 
 from .test_integration import AGENT_VERSION
@@ -180,15 +181,15 @@ def test_wrong_span_name_type_not_sent():
 @snapshot()
 def test_trace_with_wrong_meta_types_not_sent(encoding, meta, monkeypatch):
     """Wrong meta types should raise TypeErrors during encoding and fail to send to the agent."""
-    monkeypatch.setenv("DD_TRACE_API_VERSION", encoding)
-    tracer = Tracer()
-    with mock.patch("ddtrace.span.log") as log:
-        with tracer.trace("root") as root:
-            root._meta = meta
-            for _ in range(499):
-                with tracer.trace("child") as child:
-                    child._meta = meta
-        log.exception.assert_called_once_with("error closing trace")
+    with override_global_config(dict(_trace_api=encoding)):
+        tracer = Tracer()
+        with mock.patch("ddtrace.span.log") as log:
+            with tracer.trace("root") as root:
+                root._meta = meta
+                for _ in range(299):
+                    with tracer.trace("child") as child:
+                        child._meta = meta
+            log.exception.assert_called_once_with("error closing trace")
 
 
 @pytest.mark.parametrize(
@@ -203,15 +204,15 @@ def test_trace_with_wrong_meta_types_not_sent(encoding, meta, monkeypatch):
 @snapshot()
 def test_trace_with_wrong_metrics_types_not_sent(encoding, metrics, monkeypatch):
     """Wrong metric types should raise TypeErrors during encoding and fail to send to the agent."""
-    monkeypatch.setenv("DD_TRACE_API_VERSION", encoding)
-    tracer = Tracer()
-    with mock.patch("ddtrace.span.log") as log:
-        with tracer.trace("root") as root:
-            root._metrics = metrics
-            for _ in range(499):
-                with tracer.trace("child") as child:
-                    child._metrics = metrics
-        log.exception.assert_called_once_with("error closing trace")
+    with override_global_config(dict(_trace_api=encoding)):
+        tracer = Tracer()
+        with mock.patch("ddtrace.span.log") as log:
+            with tracer.trace("root") as root:
+                root._metrics = metrics
+                for _ in range(299):
+                    with tracer.trace("child") as child:
+                        child._metrics = metrics
+            log.exception.assert_called_once_with("error closing trace")
 
 
 @snapshot()
diff --git a/tests/telemetry/test_writer.py b/tests/telemetry/test_writer.py
@@ -106,7 +106,7 @@ def test_app_started_event(telemetry_writer, test_agent_session, mock_time):
             },
             {"name": "DD_TRACE_OTEL_ENABLED", "origin": "unknown", "value": False},
             {"name": "DD_TRACE_PARTIAL_FLUSH_ENABLED", "origin": "unknown", "value": True},
-            {"name": "DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", "origin": "unknown", "value": 500},
+            {"name": "DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", "origin": "unknown", "value": 300},
             {"name": "DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED", "origin": "unknown", "value": False},
             {"name": "DD_TRACE_PEER_SERVICE_MAPPING", "origin": "unknown", "value": ""},
             {"name": "DD_TRACE_PROPAGATION_STYLE_EXTRACT", "origin": "unknown", "value": "tracecontext,datadog"},
@@ -117,9 +117,9 @@ def test_app_started_event(telemetry_writer, test_agent_session, mock_time):
             {"name": "DD_TRACE_SAMPLING_RULES", "origin": "unknown", "value": None},
             {"name": "DD_TRACE_SPAN_ATTRIBUTE_SCHEMA", "origin": "unknown", "value": "v0"},
             {"name": "DD_TRACE_STARTUP_LOGS", "origin": "unknown", "value": False},
-            {"name": "DD_TRACE_WRITER_BUFFER_SIZE_BYTES", "origin": "unknown", "value": 8388608},
+            {"name": "DD_TRACE_WRITER_BUFFER_SIZE_BYTES", "origin": "unknown", "value": 20 << 20},
             {"name": "DD_TRACE_WRITER_INTERVAL_SECONDS", "origin": "unknown", "value": 1.0},
-            {"name": "DD_TRACE_WRITER_MAX_PAYLOAD_SIZE_BYTES", "origin": "unknown", "value": 8388608},
+            {"name": "DD_TRACE_WRITER_MAX_PAYLOAD_SIZE_BYTES", "origin": "unknown", "value": 20 << 20},
             {"name": "DD_TRACE_WRITER_REUSE_CONNECTIONS", "origin": "unknown", "value": False},
             {"name": "ddtrace_auto_used", "origin": "unknown", "value": False},
             {"name": "ddtrace_bootstrapped", "origin": "unknown", "value": False},
diff --git a/tests/tracer/test_writer.py b/tests/tracer/test_writer.py
@@ -92,7 +92,7 @@ def test_metrics_bad_endpoint(self):
 
     def test_metrics_trace_too_big(self):
         statsd = mock.Mock()
-        with override_global_config(dict(health_metrics_enabled=True)):
+        with override_global_config(dict(health_metrics_enabled=True, _trace_writer_buffer_size=8 << 20)):
             writer = self.WRITER_CLASS("http://asdf:1234", dogstatsd=statsd)
             for i in range(10):
                 writer.write([Span(name="name", trace_id=i, span_id=j, parent_id=j - 1 or None) for j in range(5)])
@@ -225,7 +225,7 @@ def test_drop_reason_trace_too_big(self):
             for i in range(10):
                 writer.write([Span(name="name", trace_id=i, span_id=j, parent_id=j - 1 or None) for j in range(5)])
             writer.write(
-                [Span(name="a" * 5000, trace_id=i, span_id=j, parent_id=j - 1 or None) for j in range(2 ** 10)]
+                [Span(name="a" * 5000 * i, trace_id=i, span_id=j, parent_id=j - 1 or None) for j in range(2 ** 10)]
             )
             writer.stop()
             writer.join()
@@ -284,7 +284,7 @@ def test_keep_rate(self):
         writer_run_periodic = mock.Mock()
         writer_put = mock.Mock()
         writer_put.return_value = Response(status=200)
-        with override_global_config(dict(health_metrics_enabled=False)):
+        with override_global_config(dict(health_metrics_enabled=False, _trace_writer_buffer_size=8 << 20)):
             writer = self.WRITER_CLASS("http://asdf:1234", dogstatsd=statsd)
             writer.run_periodic = writer_run_periodic
             writer._put = writer_put

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +fixes:
 +  - |
 +    tracing: Increases the default maximum payload size and buffer size from 8MB to 20MB. Also lowers the default ``DD_TRACE_PARTIAL_FLUSH_MIN_SPANS`` from 500 to 300, which decreases the maximum number of spans in trace chunks when ``DD_TRACE_PARTIAL_FLUSH_ENABLED=True``. This ensures large traces are correctly encoded and submitted.
 +    This should decrease the occurrence of "failed to send traces" error logs.