
Commit cf23e14

feat: Increase trace payload limit from 2 MiB to 50 MiB (#902)
# This PR

Change the trace request limit from 2 MiB to 50 MiB.

# Motivation

When the Node.js tracer layer sends a request to the Lambda extension that's between 2 MiB and 50 MiB, the extension closes the HTTP connection, so the tracer gets an `EPIPE` error and breaks. (Maybe the tracer should handle the error better, but that's out of scope for this PR.)

According to @rochdev:

> the agent is supposed to have a limit of 50mb

So let's change the limit on the agent side to match that expectation.

# Testing

Tested with a Node.js 22 Lambda with this handler:

```
import tracer from 'dd-trace';
import crypto from 'crypto';

tracer.init();

function randomGarbage(len) {
  // low-compressibility payload (random bytes -> base64)
  return crypto.randomBytes(len).toString('base64');
}

export const handler = async (event) => {
  const SPANS = 3000;
  const TAG_BYTES_PER_SPAN = 20_000; // ~20 KB per span tag (base64 expands a bit)

  const root = tracer.startSpan('repro.root');
  root.setTag('dd.repro', 'true');

  for (let i = 0; i < SPANS; i++) {
    console.log(`Sending the ${i}-th span`);
    const span = tracer.startSpan('repro.child', { childOf: root });
    span.setTag('blob', randomGarbage(TAG_BYTES_PER_SPAN));
    span.finish();
  }

  root.finish();

  const response = {
    statusCode: 200,
    body: JSON.stringify('Hello from Lambda!'),
  };
  return response;
};
```

### Before

There are errors like:

```
Error: write EPIPE
    at WriteWrap.onWriteComplete [as oncomplete] (node:internal/stream_base_commons:95:16)
    at WriteWrap.callbackTrampoline (node:internal/async_hooks:130:17)
```

```
LAMBDA_RUNTIME Failed to post handler success response. Http response code: 403.
{"errorMessage":"State transition from Ready to InvocationErrorResponse failed for runtime. Error: State transition is not allowed","errorType":"InvalidStateTransition"}
```

### After

When the Lambda's memory is 1024 MB, the error no longer happens. When the Lambda's memory is 512 MB, the invocation can fail due to OOM, but I think that's a legitimate error: we can ask customers to increase the memory limit for high-volume workloads like this.

# Notes

cc @astuyve, who set a `MAX_CONTENT_LENGTH` of 10 MiB in #294. This PR increases it to 50 MiB as well.

Thanks @dougqh @duncanista @lucaspimentel @rochdev for the discussion.

#899

Jira: https://datadoghq.atlassian.net/browse/SVLS-7777
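For context: axum applies its own default request-body limit of roughly 2 MiB, so raising the limit takes two steps, disabling that default (`DefaultBodyLimit::disable()`) and adding a `tower-http` `RequestBodyLimitLayer` with the larger value. Below is a minimal sketch of that pattern; the route path and handler are illustrative only, not bottlecap's actual code:

```rust
use axum::{Router, body::Bytes, extract::DefaultBodyLimit, routing::post};
use tower_http::limit::RequestBodyLimitLayer;

// Mirrors the constant introduced in trace_agent.rs in the diff below.
const TRACE_REQUEST_BODY_LIMIT: usize = 50 * 1024 * 1024; // 50 MiB

// Hypothetical handler standing in for the trace endpoints: it just buffers the body.
async fn put_traces(body: Bytes) -> String {
    format!("received {} bytes", body.len())
}

fn trace_router() -> Router {
    Router::new()
        .route("/v0.4/traces", post(put_traces).put(put_traces))
        // Allow bodies up to 50 MiB on the trace routes...
        .layer(RequestBodyLimitLayer::new(TRACE_REQUEST_BODY_LIMIT))
        // ...and turn off axum's built-in ~2 MiB default, which would
        // otherwise still reject anything larger than 2 MiB.
        .layer(DefaultBodyLimit::disable())
}
```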
1 parent c5ee4fc · commit cf23e14

File tree (3 files changed, +13 -2 lines):

- bottlecap/Cargo.lock
- bottlecap/Cargo.toml
- bottlecap/src/traces/trace_agent.rs


bottlecap/Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

bottlecap/Cargo.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -49,6 +49,7 @@ opentelemetry-semantic-conventions = { version = "0.30", features = ["semconv_ex
 rustls-native-certs = { version = "0.8.1", optional = true }
 axum = { version = "0.8.4", default-features = false, features = ["default"] }
 ustr = { version = "1.0.0", default-features = false }
+tower-http = { version = "0.6.6", default-features = false, features = ["limit"] }
 # If you are adding or updating a datadog-owned code dependency, please ensure
 # that it has a clippy.toml rule for disallowing the reqwest::Client::builder
 # method in favor of our
```

bottlecap/src/traces/trace_agent.rs

Lines changed: 10 additions & 2 deletions
```diff
@@ -3,7 +3,7 @@
 
 use axum::{
     Router,
-    extract::{Request, State},
+    extract::{DefaultBodyLimit, Request, State},
     http::StatusCode,
     response::{IntoResponse, Response},
     routing::{any, post},
@@ -18,6 +18,7 @@ use tokio::sync::{
     mpsc::{self, Receiver, Sender},
 };
 use tokio_util::sync::CancellationToken;
+use tower_http::limit::RequestBodyLimitLayer;
 use tracing::{debug, error};
 
 use crate::traces::trace_processor::SendingTraceProcessor;
@@ -72,7 +73,9 @@ const INSTRUMENTATION_INTAKE_PATH: &str = "/api/v2/apmtelemetry";
 
 const TRACER_PAYLOAD_CHANNEL_BUFFER_SIZE: usize = 10;
 const STATS_PAYLOAD_CHANNEL_BUFFER_SIZE: usize = 10;
-pub const MAX_CONTENT_LENGTH: usize = 10 * 1024 * 1024;
+pub const TRACE_REQUEST_BODY_LIMIT: usize = 50 * 1024 * 1024;
+pub const DEFAULT_REQUEST_BODY_LIMIT: usize = 2 * 1024 * 1024;
+pub const MAX_CONTENT_LENGTH: usize = 50 * 1024 * 1024;
 const LAMBDA_LOAD_SPAN: &str = "aws.lambda.load";
 
 #[derive(Clone)]
@@ -231,10 +234,12 @@ impl TraceAgent {
                 V5_TRACE_ENDPOINT_PATH,
                 post(Self::v05_traces).put(Self::v05_traces),
             )
+            .layer(RequestBodyLimitLayer::new(TRACE_REQUEST_BODY_LIMIT))
             .with_state(trace_state);
 
         let stats_router = Router::new()
             .route(STATS_ENDPOINT_PATH, post(Self::stats).put(Self::stats))
+            .layer(RequestBodyLimitLayer::new(DEFAULT_REQUEST_BODY_LIMIT))
             .with_state(stats_state);
 
         let proxy_router = Router::new()
@@ -254,6 +259,7 @@ impl TraceAgent {
                 INSTRUMENTATION_ENDPOINT_PATH,
                 post(Self::instrumentation_proxy),
             )
+            .layer(RequestBodyLimitLayer::new(DEFAULT_REQUEST_BODY_LIMIT))
             .with_state(proxy_state);
 
         let info_router = Router::new().route(INFO_ENDPOINT_PATH, any(Self::info));
@@ -264,6 +270,8 @@ impl TraceAgent {
             .merge(proxy_router)
             .merge(info_router)
             .fallback(handler_not_found)
+            // Disable the default body limit so we can use our own limit
+            .layer(DefaultBodyLimit::disable())
     }
 
     async fn graceful_shutdown(shutdown_token: CancellationToken) {
```
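As a rough illustration of what the new layering does (a sketch, not part of this commit): with `DefaultBodyLimit` disabled and a `RequestBodyLimitLayer` in place, a handler that buffers the body through axum's `Bytes` extractor rejects an oversized request with `413 Payload Too Large`. The route, handler, and test below are hypothetical:

```rust
use axum::{
    Router,
    body::{Body, Bytes},
    extract::DefaultBodyLimit,
    http::{Request, StatusCode},
    routing::post,
};
use tower::ServiceExt; // for `oneshot` (requires tower's "util" feature)
use tower_http::limit::RequestBodyLimitLayer;

// Mirrors DEFAULT_REQUEST_BODY_LIMIT from trace_agent.rs.
const DEFAULT_REQUEST_BODY_LIMIT: usize = 2 * 1024 * 1024; // 2 MiB

#[tokio::test]
async fn oversized_body_is_rejected() {
    // Hypothetical route standing in for the stats endpoint.
    let app = Router::new()
        .route("/stats", post(|body: Bytes| async move { body.len().to_string() }))
        .layer(RequestBodyLimitLayer::new(DEFAULT_REQUEST_BODY_LIMIT))
        .layer(DefaultBodyLimit::disable());

    // One byte over the limit.
    let too_big = vec![0u8; DEFAULT_REQUEST_BODY_LIMIT + 1];
    let response = app
        .oneshot(Request::post("/stats").body(Body::from(too_big)).unwrap())
        .await
        .unwrap();

    // Buffering the limited body surfaces the length-limit error as 413.
    assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE);
}
```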

0 commit comments

Comments
 (0)