Skip to content

Commit c242dfb

Browse files
fix(tracing): resolves issues encoding and sending large trace payloads [backport 2.0] (#7162)
Backport 4646688 from #6943 to 2.0.

Increases the maximum payload size and buffer size from 8MB to 20MB. Also decreases the maximum number of spans in trace chunks when ``DD_TRACE_PARTIAL_FLUSH_ENABLED=True``. This ensures large traces are correctly encoded and submitted. This should decrease the occurrence of "failed to send traces" error logs.

## Checklist

- [x] Change(s) are motivated and described in the PR description.
- [x] Testing strategy is described if automated tests are not included in the PR.
- [x] Risk is outlined (performance impact, potential for breakage, maintainability, etc).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`.
- [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)).
- [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist

- [x] Title is accurate.
- [x] No unnecessary changes are introduced.
- [x] Description motivates each change.
- [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary.
- [x] Testing strategy adequately addresses listed risk(s).
- [x] Change is maintainable (easy to change, telemetry, documentation).
- [x] Release note makes sense to a user of the library.
- [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment.
- [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
- [x] If this PR touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`.
- [x] This PR doesn't touch any of that.

Co-authored-by: Munir Abdinur <[email protected]>
1 parent f5f02d0 commit c242dfb

File tree

8 files changed

+45
-37
lines changed

8 files changed

+45
-37
lines changed

ddtrace/internal/constants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@
3131
SPAN_API_DATADOG = "datadog"
3232
SPAN_API_OTEL = "otel"
3333
SPAN_API_OPENTRACING = "opentracing"
34-
DEFAULT_BUFFER_SIZE = 8 << 20 # 8 MB
35-
DEFAULT_MAX_PAYLOAD_SIZE = 8 << 20 # 8 MB
34+
DEFAULT_BUFFER_SIZE = 20 << 20 # 20 MB
35+
DEFAULT_MAX_PAYLOAD_SIZE = 20 << 20 # 20 MB
3636
DEFAULT_PROCESSING_INTERVAL = 1.0
3737
DEFAULT_REUSE_CONNECTIONS = False
3838
BLOCKED_RESPONSE_HTML = """

ddtrace/settings/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def __init__(self):
228228
self._trace_rate_limit = int(os.getenv("DD_TRACE_RATE_LIMIT", default=DEFAULT_SAMPLING_RATE_LIMIT))
229229
self._trace_sampling_rules = os.getenv("DD_TRACE_SAMPLING_RULES")
230230
self._partial_flush_enabled = asbool(os.getenv("DD_TRACE_PARTIAL_FLUSH_ENABLED", default=True))
231-
self._partial_flush_min_spans = int(os.getenv("DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", default=500))
231+
self._partial_flush_min_spans = int(os.getenv("DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", default=300))
232232
self._priority_sampling = asbool(os.getenv("DD_PRIORITY_SAMPLING", default=True))
233233

234234
header_tags = parse_tags_str(os.getenv("DD_TRACE_HEADER_TAGS", ""))
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
fixes:
3+
- |
4+
tracing: Increases the maximum payload size and buffer size from 8MB to 20MB. Also decreases the maximum number of spans in trace chunks when ``DD_TRACE_PARTIAL_FLUSH_ENABLED=True`` by lowering the default of ``DD_TRACE_PARTIAL_FLUSH_MIN_SPANS`` from 500 to 300. This ensures large traces are correctly encoded and submitted.
5+
This should decrease the occurrence of "failed to send traces" error logs.

tests/commands/test_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ def test_info_no_configs():
361361
Health metrics enabled: False
362362
Priority sampling enabled: True
363363
Partial flushing enabled: True
364-
Partial flush minimum number of spans: 500
364+
Partial flush minimum number of spans: 300
365365
WAF timeout: 5.0 msecs
366366
\x1b[92m\x1b[1mTagging:\x1b[0m
367367
DD Service: None

tests/integration/test_integration.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -280,13 +280,13 @@ def test_metrics():
280280
from tests.integration.test_integration import _test_metrics
281281
from tests.utils import AnyInt
282282

283-
assert t._partial_flush_min_spans == 500
283+
assert t._partial_flush_min_spans == 300
284284
_test_metrics(
285285
t,
286286
http_sent_bytes=AnyInt(),
287-
http_sent_traces=30,
288-
writer_accepted_traces=30,
289-
buffer_accepted_traces=30,
287+
http_sent_traces=50,
288+
writer_accepted_traces=50,
289+
buffer_accepted_traces=50,
290290
buffer_accepted_spans=15000,
291291
http_requests=1,
292292
)
@@ -346,7 +346,9 @@ def test_single_trace_too_large():
346346

347347

348348
@skip_if_testagent
349-
@parametrize_with_all_encodings(env={"DD_TRACE_PARTIAL_FLUSH_ENABLED": "false"})
349+
@parametrize_with_all_encodings(
350+
env={"DD_TRACE_PARTIAL_FLUSH_ENABLED": "false", "DD_TRACE_WRITER_BUFFER_SIZE_BYTES": str(8 << 20)}
351+
)
350352
def test_single_trace_too_large_partial_flush_disabled():
351353
import mock
352354

@@ -647,8 +649,8 @@ def test_writer_configured_correctly_from_env():
647649
def test_writer_configured_correctly_from_env_defaults():
648650
import ddtrace
649651

650-
assert ddtrace.tracer._writer._encoder.max_size == 8 << 20
651-
assert ddtrace.tracer._writer._encoder.max_item_size == 8 << 20
652+
assert ddtrace.tracer._writer._encoder.max_size == 20 << 20
653+
assert ddtrace.tracer._writer._encoder.max_item_size == 20 << 20
652654
assert ddtrace.tracer._writer._interval == 1.0
653655

654656

@@ -676,8 +678,8 @@ def test_writer_configured_correctly_from_env_defaults_under_ddtrace_run(ddtrace
676678
"""
677679
import ddtrace
678680
679-
assert ddtrace.tracer._writer._encoder.max_size == 8 << 20
680-
assert ddtrace.tracer._writer._encoder.max_item_size == 8 << 20
681+
assert ddtrace.tracer._writer._encoder.max_size == 20 << 20
682+
assert ddtrace.tracer._writer._encoder.max_item_size == 20 << 20
681683
assert ddtrace.tracer._writer._interval == 1.0
682684
""",
683685
)

tests/integration/test_integration_snapshots.py

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from ddtrace.internal.writer import AgentWriter
1515
from tests.integration.utils import mark_snapshot
1616
from tests.integration.utils import parametrize_with_all_encodings
17+
from tests.utils import override_global_config
1718
from tests.utils import snapshot
1819

1920
from .test_integration import AGENT_VERSION
@@ -180,15 +181,15 @@ def test_wrong_span_name_type_not_sent():
180181
@snapshot()
181182
def test_trace_with_wrong_meta_types_not_sent(encoding, meta, monkeypatch):
182183
"""Wrong meta types should raise TypeErrors during encoding and fail to send to the agent."""
183-
monkeypatch.setenv("DD_TRACE_API_VERSION", encoding)
184-
tracer = Tracer()
185-
with mock.patch("ddtrace.span.log") as log:
186-
with tracer.trace("root") as root:
187-
root._meta = meta
188-
for _ in range(499):
189-
with tracer.trace("child") as child:
190-
child._meta = meta
191-
log.exception.assert_called_once_with("error closing trace")
184+
with override_global_config(dict(_trace_api=encoding)):
185+
tracer = Tracer()
186+
with mock.patch("ddtrace.span.log") as log:
187+
with tracer.trace("root") as root:
188+
root._meta = meta
189+
for _ in range(299):
190+
with tracer.trace("child") as child:
191+
child._meta = meta
192+
log.exception.assert_called_once_with("error closing trace")
192193

193194

194195
@pytest.mark.parametrize(
@@ -203,15 +204,15 @@ def test_trace_with_wrong_meta_types_not_sent(encoding, meta, monkeypatch):
203204
@snapshot()
204205
def test_trace_with_wrong_metrics_types_not_sent(encoding, metrics, monkeypatch):
205206
"""Wrong metric types should raise TypeErrors during encoding and fail to send to the agent."""
206-
monkeypatch.setenv("DD_TRACE_API_VERSION", encoding)
207-
tracer = Tracer()
208-
with mock.patch("ddtrace.span.log") as log:
209-
with tracer.trace("root") as root:
210-
root._metrics = metrics
211-
for _ in range(499):
212-
with tracer.trace("child") as child:
213-
child._metrics = metrics
214-
log.exception.assert_called_once_with("error closing trace")
207+
with override_global_config(dict(_trace_api=encoding)):
208+
tracer = Tracer()
209+
with mock.patch("ddtrace.span.log") as log:
210+
with tracer.trace("root") as root:
211+
root._metrics = metrics
212+
for _ in range(299):
213+
with tracer.trace("child") as child:
214+
child._metrics = metrics
215+
log.exception.assert_called_once_with("error closing trace")
215216

216217

217218
@snapshot()

tests/telemetry/test_writer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def test_app_started_event(telemetry_writer, test_agent_session, mock_time):
106106
},
107107
{"name": "DD_TRACE_OTEL_ENABLED", "origin": "unknown", "value": False},
108108
{"name": "DD_TRACE_PARTIAL_FLUSH_ENABLED", "origin": "unknown", "value": True},
109-
{"name": "DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", "origin": "unknown", "value": 500},
109+
{"name": "DD_TRACE_PARTIAL_FLUSH_MIN_SPANS", "origin": "unknown", "value": 300},
110110
{"name": "DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED", "origin": "unknown", "value": False},
111111
{"name": "DD_TRACE_PEER_SERVICE_MAPPING", "origin": "unknown", "value": ""},
112112
{"name": "DD_TRACE_PROPAGATION_STYLE_EXTRACT", "origin": "unknown", "value": "tracecontext,datadog"},
@@ -117,9 +117,9 @@ def test_app_started_event(telemetry_writer, test_agent_session, mock_time):
117117
{"name": "DD_TRACE_SAMPLING_RULES", "origin": "unknown", "value": None},
118118
{"name": "DD_TRACE_SPAN_ATTRIBUTE_SCHEMA", "origin": "unknown", "value": "v0"},
119119
{"name": "DD_TRACE_STARTUP_LOGS", "origin": "unknown", "value": False},
120-
{"name": "DD_TRACE_WRITER_BUFFER_SIZE_BYTES", "origin": "unknown", "value": 8388608},
120+
{"name": "DD_TRACE_WRITER_BUFFER_SIZE_BYTES", "origin": "unknown", "value": 20 << 20},
121121
{"name": "DD_TRACE_WRITER_INTERVAL_SECONDS", "origin": "unknown", "value": 1.0},
122-
{"name": "DD_TRACE_WRITER_MAX_PAYLOAD_SIZE_BYTES", "origin": "unknown", "value": 8388608},
122+
{"name": "DD_TRACE_WRITER_MAX_PAYLOAD_SIZE_BYTES", "origin": "unknown", "value": 20 << 20},
123123
{"name": "DD_TRACE_WRITER_REUSE_CONNECTIONS", "origin": "unknown", "value": False},
124124
{"name": "ddtrace_auto_used", "origin": "unknown", "value": False},
125125
{"name": "ddtrace_bootstrapped", "origin": "unknown", "value": False},

tests/tracer/test_writer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def test_metrics_bad_endpoint(self):
9292

9393
def test_metrics_trace_too_big(self):
9494
statsd = mock.Mock()
95-
with override_global_config(dict(health_metrics_enabled=True)):
95+
with override_global_config(dict(health_metrics_enabled=True, _trace_writer_buffer_size=8 << 20)):
9696
writer = self.WRITER_CLASS("http://asdf:1234", dogstatsd=statsd)
9797
for i in range(10):
9898
writer.write([Span(name="name", trace_id=i, span_id=j, parent_id=j - 1 or None) for j in range(5)])
@@ -225,7 +225,7 @@ def test_drop_reason_trace_too_big(self):
225225
for i in range(10):
226226
writer.write([Span(name="name", trace_id=i, span_id=j, parent_id=j - 1 or None) for j in range(5)])
227227
writer.write(
228-
[Span(name="a" * 5000, trace_id=i, span_id=j, parent_id=j - 1 or None) for j in range(2 ** 10)]
228+
[Span(name="a" * 5000 * i, trace_id=i, span_id=j, parent_id=j - 1 or None) for j in range(2 ** 10)]
229229
)
230230
writer.stop()
231231
writer.join()
@@ -284,7 +284,7 @@ def test_keep_rate(self):
284284
writer_run_periodic = mock.Mock()
285285
writer_put = mock.Mock()
286286
writer_put.return_value = Response(status=200)
287-
with override_global_config(dict(health_metrics_enabled=False)):
287+
with override_global_config(dict(health_metrics_enabled=False, _trace_writer_buffer_size=8 << 20)):
288288
writer = self.WRITER_CLASS("http://asdf:1234", dogstatsd=statsd)
289289
writer.run_periodic = writer_run_periodic
290290
writer._put = writer_put

0 commit comments

Comments
 (0)