Skip to content

Commit 8ebb122

Browse files
authored
ekump/APMSP-2162 emit missing payload health metrics (#1147)
* add datadog.tracer.http.sent.bytes health metric to trace exporter * add datadog.tracer.http.dropped.bytes health metric to trace exporter * add datadog.tracer.http.sent.traces health metric to trace exporter * Add a health metrics doc to the exporter * add http.requests health metric to the trace exporter * add dropped traces metric and add type to send.traces.errors * Standardize naming convention for health metrics and move MD to rustdoc
1 parent 330c52e commit 8ebb122

File tree

3 files changed

+589
-60
lines changed

3 files changed

+589
-60
lines changed
Lines changed: 178 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,188 @@
11
// Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/
22
// SPDX-License-Identifier: Apache-2.0
33

4-
//! health_metrics holds data to emit info about the health of the data-pipeline
4+
//! # Trace Exporter Health Metrics
5+
//!
6+
//! This module defines all health metrics emitted by the libdatadog trace exporter. These metrics
7+
//! are sent to DogStatsD to provide visibility for Datadog support.
8+
//!
9+
//! ## Overview
10+
//!
11+
//! Health metrics help monitor the trace exporter's behavior, including successful operations,
12+
//! error conditions, and performance characteristics. They are emitted via DogStatsD and follow
13+
//! consistent naming conventions.
14+
//!
15+
//! ## Metric Types
16+
//!
17+
//! - **Count**: Incremental counters that track the number of occurrences
18+
//! - **Distribution**: Value distributions that track sizes, durations, or quantities
19+
//!
20+
//! ## Naming Convention
21+
//!
22+
//! The metrics follow a hierarchical naming pattern:
23+
//!
24+
//! - `datadog.tracer.exporter.*`: All trace exporter metrics
25+
//! - `datadog.tracer.exporter.deserialize.*`: Trace deserialization metrics
26+
//! - `datadog.tracer.exporter.serialize.*`: Trace serialization metrics
27+
//! - `datadog.tracer.exporter.transport.*`: Network transport metrics
28+
//! - `datadog.tracer.exporter.transport.traces.*`: Trace-specific transport metrics
29+
//! - `datadog.tracer.exporter.transport.traces.sent`: All trace send attempts
30+
//! - `datadog.tracer.exporter.transport.traces.successful`: Successful trace sends
31+
//! - `datadog.tracer.exporter.transport.traces.failed`: Failed trace sends
32+
//! - `datadog.tracer.exporter.transport.traces.dropped`: Dropped traces due to errors
33+
//!
34+
//! ## Error Handling Patterns
35+
//!
36+
//! ### HTTP Status Code Handling
37+
//!
38+
//! - **Success (2xx)**: Emit `transport.traces.successful`, `transport.sent.bytes`,
39+
//! `transport.traces.sent`
40+
//! - **Client Errors (4xx)**: Emit `transport.traces.failed`, `transport.sent.bytes`,
41+
//! `transport.traces.sent`, and conditionally `transport.dropped.bytes`,
42+
//! `transport.traces.dropped`
43+
//! - **Server Errors (5xx)**: Emit `transport.traces.failed`, `transport.sent.bytes`,
44+
//! `transport.traces.sent`, `transport.dropped.bytes`, `transport.traces.dropped`
45+
//! - **Network Errors**: Emit `transport.traces.failed`, `transport.sent.bytes`,
46+
//! `transport.traces.sent`, `transport.dropped.bytes`, `transport.traces.dropped`
47+
//!
48+
//! ### Special Status Code Exclusions
49+
//!
50+
//! The following HTTP status codes do NOT trigger `transport.dropped.bytes` or
51+
//! `transport.traces.dropped` emission:
52+
//! - **404 Not Found**: Indicates endpoint not available (agent configuration issue)
53+
//! - **415 Unsupported Media Type**: Indicates format negotiation issue
54+
//!
55+
//! These exclusions prevent false alarms for configuration issues rather than actual payload drops.
56+
//!
57+
//! ## Tags
58+
//!
59+
//! All metrics include the following standard tags:
60+
//! - `libdatadog_version`: Version of the libdatadog library
61+
//!
62+
//! Additional conditional tags:
63+
//! - `type:<status_code>`: HTTP status code for error metrics (e.g., `type:400`, `type:404`,
64+
//! `type:500`)
65+
//! - `type:<error_type>`: Error type classification for non-HTTP errors (e.g., `type:network`,
66+
//! `type:timeout`, `type:response_body`, `type:build`, `type:unknown`)
567
6-
pub(crate) const STAT_SEND_TRACES: &str = "datadog.libdatadog.send.traces";
7-
pub(crate) const STAT_SEND_TRACES_ERRORS: &str = "datadog.libdatadog.send.traces.errors";
8-
pub(crate) const STAT_DESER_TRACES: &str = "datadog.libdatadog.deser_traces";
9-
pub(crate) const STAT_DESER_TRACES_ERRORS: &str = "datadog.libdatadog.deser_traces.errors";
68+
// =============================================================================
69+
// Trace Processing Metrics
70+
// =============================================================================
71+
72+
/// Number of trace chunks successfully deserialized from input.
73+
///
74+
/// **Type**: Count
75+
/// **When Emitted**: After successful deserialization of trace data from msgpack format
76+
/// **Tags**: `libdatadog_version`
77+
pub(crate) const DESERIALIZE_TRACES: &str = "datadog.tracer.exporter.deserialize.traces";
78+
79+
/// Number of trace deserialization errors.
80+
///
81+
/// **Type**: Count
82+
/// **When Emitted**: When msgpack deserialization fails due to invalid format or corrupted data
83+
/// **Tags**: `libdatadog_version`
84+
pub(crate) const DESERIALIZE_TRACES_ERRORS: &str = "datadog.tracer.exporter.deserialize.errors";
85+
86+
/// Number of trace serialization errors.
87+
///
88+
/// **Type**: Count
89+
/// **When Emitted**: Currently unused but reserved for future trace serialization error tracking
90+
/// **Tags**: `libdatadog_version`
91+
/// **Status**: Dead code (marked with `#[allow(dead_code)]`)
1092
#[allow(dead_code)] // TODO (APMSP-1584) Add support for health metrics when using trace utils
11-
pub(crate) const STAT_SER_TRACES_ERRORS: &str = "datadog.libdatadog.ser_traces.errors";
93+
pub(crate) const SERIALIZE_TRACES_ERRORS: &str = "datadog.tracer.exporter.serialize.errors";
94+
95+
// =============================================================================
96+
// Transport - Trace Metrics
97+
// =============================================================================
98+
99+
/// Number of trace chunks included in HTTP requests to the agent (all attempts).
100+
///
101+
/// **Type**: Distribution
102+
/// **When Emitted**: Always emitted for every send attempt, regardless of success or failure
103+
/// **Tags**: `libdatadog_version`
104+
pub(crate) const TRANSPORT_TRACES_SENT: &str = "datadog.tracer.exporter.transport.traces.sent";
105+
106+
/// Number of trace chunks successfully sent to the agent.
107+
///
108+
/// **Type**: Count
109+
/// **When Emitted**: After successful HTTP response from the agent (2xx status codes)
110+
/// **Tags**: `libdatadog_version`
111+
pub(crate) const TRANSPORT_TRACES_SUCCESSFUL: &str =
112+
"datadog.tracer.exporter.transport.traces.successful";
113+
114+
/// Number of errors encountered while sending traces to the agent.
115+
///
116+
/// **Type**: Count
117+
/// **When Emitted**:
118+
/// - HTTP error responses (4xx, 5xx status codes)
119+
/// - Network/connection errors
120+
/// - Request timeout errors
121+
///
122+
/// **Tags**: `libdatadog_version`, `type:<status_code>` (for HTTP errors), `type:<error_type>` (for
123+
/// other errors)
124+
///
125+
/// **Error Types**:
126+
/// - `type:<status_code>`: HTTP status codes (e.g., `type:400`, `type:404`, `type:500`)
127+
/// - `type:network`: Network/connection errors
128+
/// - `type:timeout`: Request timeout errors
129+
/// - `type:response_body`: Response body read errors
130+
/// - `type:build`: Request build errors
131+
/// - `type:unknown`: Fallback for unrecognized error types
132+
pub(crate) const TRANSPORT_TRACES_FAILED: &str = "datadog.tracer.exporter.transport.traces.failed";
133+
134+
/// Number of trace chunks dropped due to errors.
135+
///
136+
/// **Type**: Distribution
137+
/// **When Emitted**:
138+
/// - HTTP error responses (excluding 404 Not Found and 415 Unsupported Media Type)
139+
/// - Network/connection errors
140+
/// - Request timeout errors
141+
///
142+
/// **Tags**: `libdatadog_version`
143+
///
144+
/// **Note**: 404 and 415 status codes are excluded because they indicate endpoint/format issues
145+
/// rather than payload drops. Such traces are not counted here, though they may still be lost.
146+
pub(crate) const TRANSPORT_TRACES_DROPPED: &str =
147+
"datadog.tracer.exporter.transport.traces.dropped";
148+
149+
// =============================================================================
150+
// Transport - Payload Metrics
151+
// =============================================================================
152+
153+
/// Size in bytes of HTTP payloads sent to the agent.
154+
///
155+
/// **Type**: Distribution
156+
/// **When Emitted**: Always emitted for every send attempt, regardless of success or failure
157+
/// **Tags**: `libdatadog_version`
158+
pub(crate) const TRANSPORT_SENT_BYTES: &str = "datadog.tracer.exporter.transport.sent.bytes";
159+
160+
/// Size in bytes of HTTP payloads dropped due to errors.
161+
///
162+
/// **Type**: Distribution
163+
/// **When Emitted**:
164+
/// - HTTP error responses (excluding 404 Not Found and 415 Unsupported Media Type)
165+
/// - Network/connection errors
166+
/// - Request timeout errors
167+
///
168+
/// **Tags**: `libdatadog_version`
169+
///
170+
/// **Note**: 404 and 415 status codes are excluded as they represent endpoint/format issues rather
171+
/// than dropped payloads
172+
pub(crate) const TRANSPORT_DROPPED_BYTES: &str = "datadog.tracer.exporter.transport.dropped.bytes";
173+
174+
/// Number of HTTP requests made to the agent.
175+
///
176+
/// **Type**: Distribution
177+
/// **When Emitted**: Always emitted after each send operation, counting all HTTP attempts including
178+
/// retries
///
/// **Tags**: `libdatadog_version`
179+
///
180+
/// **Note**: Value represents total request attempts (1 for success without retries, >1 when
181+
/// retries occur)
182+
pub(crate) const TRANSPORT_REQUESTS: &str = "datadog.tracer.exporter.transport.requests";
12183

13184
/// A single health metric to emit, pairing a static metric name with an integer value.
///
/// The variant selects which DogStatsD action the emitter builds for the metric.
#[derive(Debug)]
14185
pub(crate) enum HealthMetric {
15186
/// Count metric: `(metric name, value)`.
Count(&'static str, i64),
187+
/// Distribution metric: `(metric name, value)`; the value is cast to `f64` when emitted.
Distribution(&'static str, i64),
16188
}

data-pipeline/src/trace_exporter/metrics.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,19 @@ impl<'a> MetricsEmitter<'a> {
4141
);
4242
flusher.send(vec![DogStatsDAction::Count(name, c, tags.into_iter())])
4343
}
44+
HealthMetric::Distribution(name, value) => {
45+
debug!(
46+
metric_name = name,
47+
value = value,
48+
has_custom_tags = has_custom_tags,
49+
"Emitting distribution metric to dogstatsd"
50+
);
51+
flusher.send(vec![DogStatsDAction::Distribution(
52+
name,
53+
value as f64,
54+
tags.into_iter(),
55+
)])
56+
}
4457
}
4558
} else {
4659
debug!(
@@ -78,5 +91,6 @@ mod tests {
7891
HealthMetric::Count("test.metric", 5),
7992
Some(vec![&tag!("custom", "tag")]),
8093
);
94+
emitter.emit(HealthMetric::Distribution("test.distribution", 1024), None);
8195
}
8296
}

0 commit comments

Comments
 (0)