Skip to content

Commit 1261138

Browse files
authored
feat(telemetry): track tracer and telemetry usage (#209)
Introduce additional telemetry metrics to gain deeper insights into our tracer usage.
1 parent 83958bc commit 1261138

File tree

8 files changed

+349
-107
lines changed

8 files changed

+349
-107
lines changed

src/datadog/datadog_agent.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,24 @@ void DatadogAgent::flush() {
219219
return;
220220
}
221221

222+
// Ideally:
223+
/*auto [encode_result, duration] = measure([&trace_chunks] {*/
224+
/* std::string body;*/
225+
/* msgpack_encode(body, trace_chunks);*/
226+
/*});*/
227+
222228
std::string body;
229+
230+
// Time only the msgpack encoding of `trace_chunks` into `body`, using the
// monotonic clock.
auto beg = std::chrono::steady_clock::now();
223231
auto encode_result = msgpack_encode(body, trace_chunks);
232+
auto end = std::chrono::steady_clock::now();
233+
234+
// NOTE(review): the elapsed time is reported in microseconds, but the metric
// behind `trace_chunk_serialization_duration` is named
// "trace_chunk_serialization.ms" -- confirm which unit is expected.
// NOTE(review): both datapoints are recorded before `encode_result` is checked
// for an error, so failed encodes are included -- confirm this is intended.
telemetry::distribution::add(
235+
metrics::tracer::trace_chunk_serialization_duration,
236+
std::chrono::duration_cast<std::chrono::microseconds>(end - beg).count());
237+
telemetry::distribution::add(metrics::tracer::trace_chunk_serialized_bytes,
238+
static_cast<uint64_t>(body.size()));
239+
224240
if (auto* error = encode_result.if_error()) {
225241
logger_->log_error(*error);
226242
return;
@@ -311,11 +327,17 @@ void DatadogAgent::flush() {
311327
};
312328

313329
telemetry::counter::increment(metrics::tracer::api::requests);
330+
telemetry::distribution::add(metrics::tracer::api::bytes_sent,
331+
static_cast<uint64_t>(body.size()));
332+
314333
auto post_result =
315334
http_client_->post(traces_endpoint_, std::move(set_request_headers),
316335
std::move(body), std::move(on_response),
317336
std::move(on_error), clock_().tick + request_timeout_);
318337
if (auto* error = post_result.if_error()) {
338+
// NOTE(@dmehala): `technical` would be a better error type than `network`.
339+
telemetry::counter::increment(metrics::tracer::api::errors,
340+
{"type:network"});
319341
logger_->log_error(
320342
error->with_prefix("Unexpected error submitting traces: "));
321343
}

src/datadog/telemetry/telemetry_impl.cpp

Lines changed: 98 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,40 @@ using namespace datadog::tracing;
1717
using namespace std::chrono_literals;
1818

1919
namespace datadog::telemetry {
20+
namespace internal_metrics {
21+
22+
/// The number of logs created with a given log level. Useful for calculating
23+
/// impact for other features (automatic sending of logs). Levels should be one
24+
/// of `debug`, `info`, `warn`, `error`, `critical`.
25+
const telemetry::Counter logs_created{"logs_created", "general", true};
26+
27+
/// The number of requests sent to the api endpoint in the agent that errored,
28+
/// tagged by the error type (e.g. `type:timeout`, `type:network`,
29+
/// `type:status_code`) and Endpoint (`endpoint:agent`, `endpoint:agentless`).
30+
const telemetry::Counter errors{"telemetry_api.errors", "telemetry", true};
31+
32+
/// The number of requests sent to a telemetry endpoint, regardless of success,
33+
/// tagged by the endpoint (`endpoint:agent`, `endpoint:agentless`).
34+
const telemetry::Counter requests{"telemetry_api.requests", "telemetry", true};
35+
36+
/// The number of responses received from the endpoint, tagged with status code
37+
/// (`status_code:200`, `status_code:404`) and endpoint (`endpoint:agent`,
38+
/// `endpoint:agentless`).
39+
const telemetry::Counter responses{"telemetry_api.responses", "telemetry",
40+
true};
41+
42+
/// The size of the payload sent to the telemetry endpoint in bytes, tagged by the
43+
/// endpoint (`endpoint:agent`, `endpoint:agentless`).
44+
const telemetry::Distribution bytes_sent{"telemetry_api.bytes", "telemetry",
45+
true};
46+
47+
/// The time it takes to send the payload to the endpoint in ms, tagged by
48+
/// the endpoint (`endpoint:agent`, `endpoint:agentless`).
49+
const telemetry::Distribution request_duration{"telemetry_api.ms", "telemetry",
50+
true};
51+
52+
} // namespace internal_metrics
53+
2054
namespace {
2155

2256
HTTPClient::URL make_telemetry_endpoint(HTTPClient::URL url) {
@@ -174,26 +208,8 @@ Telemetry::Telemetry(FinalizedConfiguration config,
174208
host_info_(get_host_info()) {
175209
// Callback for successful telemetry HTTP requests, to examine HTTP
176210
// status.
177-
telemetry_on_response_ = [logger = logger_](
178-
int response_status,
179-
const DictReader& /*response_headers*/,
180-
std::string response_body) {
181-
if (response_status < 200 || response_status >= 300) {
182-
logger->log_error([&](auto& stream) {
183-
stream << "Unexpected telemetry response status " << response_status
184-
<< " with body (if any, starts on next line):\n"
185-
<< response_body;
186-
});
187-
}
188-
};
189-
190-
// Callback for unsuccessful telemetry HTTP requests.
191-
telemetry_on_error_ = [logger = logger_](Error error) {
192-
logger->log_error(error.with_prefix(
193-
"Error occurred during HTTP request for telemetry: "));
194-
};
195-
196211
send_telemetry("app-started", app_started());
212+
http_client_->drain(clock_().tick + 2s);
197213
schedule_tasks();
198214
}
199215

@@ -216,20 +232,23 @@ Telemetry::~Telemetry() {
216232
// The app-closing message is bundled with a message containing the
217233
// final metric values.
218234
send_telemetry("app-closing", app_closing());
219-
http_client_->drain(clock_().tick + 1s);
235+
http_client_->drain(clock_().tick + 2s);
220236
}
221237
}
222238

223239
Telemetry::Telemetry(Telemetry&& rhs)
224240
: config_(std::move(rhs.config_)),
225241
logger_(std::move(rhs.logger_)),
226-
telemetry_on_response_(std::move(rhs.telemetry_on_response_)),
227-
telemetry_on_error_(std::move(rhs.telemetry_on_error_)),
228242
telemetry_endpoint_(std::move(rhs.telemetry_endpoint_)),
229243
tracer_signature_(std::move(rhs.tracer_signature_)),
230244
http_client_(rhs.http_client_),
231245
clock_(std::move(rhs.clock_)),
232246
scheduler_(std::move(rhs.scheduler_)),
247+
counters_(std::move(rhs.counters_)),
248+
counters_snapshot_(std::move(rhs.counters_snapshot_)),
249+
rates_(std::move(rhs.rates_)),
250+
rates_snapshot_(std::move(rhs.rates_snapshot_)),
251+
distributions_(std::move(rhs.distributions_)),
233252
seq_id_(rhs.seq_id_),
234253
config_seq_ids_(rhs.config_seq_ids_),
235254
host_info_(rhs.host_info_) {
@@ -242,13 +261,17 @@ Telemetry& Telemetry::operator=(Telemetry&& rhs) {
242261
cancel_tasks(rhs.tasks_);
243262
std::swap(config_, rhs.config_);
244263
std::swap(logger_, rhs.logger_);
245-
std::swap(telemetry_on_response_, rhs.telemetry_on_response_);
246-
std::swap(telemetry_on_error_, rhs.telemetry_on_error_);
247264
std::swap(telemetry_endpoint_, rhs.telemetry_endpoint_);
248265
// Exchange `http_client_` exactly once: swapping the same pair a second time
// would undo the exchange and leave this object holding its old client.
std::swap(http_client_, rhs.http_client_);
249266
std::swap(tracer_signature_, rhs.tracer_signature_);
251268
std::swap(clock_, rhs.clock_);
269+
std::swap(scheduler_, rhs.scheduler_);
270+
std::swap(counters_, rhs.counters_);
271+
std::swap(counters_snapshot_, rhs.counters_snapshot_);
272+
std::swap(rates_, rhs.rates_);
273+
std::swap(rates_snapshot_, rhs.rates_snapshot_);
274+
std::swap(distributions_, rhs.distributions_);
252275
std::swap(seq_id_, rhs.seq_id_);
253276
std::swap(config_seq_ids_, rhs.config_seq_ids_);
254277
std::swap(host_info_, rhs.host_info_);
@@ -259,16 +282,19 @@ Telemetry& Telemetry::operator=(Telemetry&& rhs) {
259282

260283
/// Record an error-level log entry.
///
/// Does nothing when `config_.report_logs` is false. Otherwise increments the
/// `logs_created` counter with the `level:error` tag and forwards `message` to
/// `log()` with `LogLevel::ERROR`.
void Telemetry::log_error(std::string message) {
261284
if (!config_.report_logs) return;
285+
increment_counter(internal_metrics::logs_created, {"level:error"});
262286
log(std::move(message), LogLevel::ERROR);
263287
}
264288

265289
void Telemetry::log_error(std::string message, std::string stacktrace) {
266290
if (!config_.report_logs) return;
291+
increment_counter(internal_metrics::logs_created, {"level:error"});
267292
log(std::move(message), LogLevel::ERROR, stacktrace);
268293
}
269294

270295
/// Record a warning-level log entry.
///
/// Does nothing when `config_.report_logs` is false. Otherwise increments the
/// `logs_created` counter and forwards `message` to `log()` with
/// `LogLevel::WARNING`.
void Telemetry::log_warning(std::string message) {
271296
if (!config_.report_logs) return;
297+
// The levels documented for `logs_created` are `debug`, `info`, `warn`,
// `error` and `critical`, so the tag is `level:warn`, not `level:warning`.
increment_counter(internal_metrics::logs_created, {"level:warn"});
272298
log(std::move(message), LogLevel::WARNING);
273299
}
274300

@@ -293,10 +319,55 @@ void Telemetry::send_telemetry(StringView request_type, std::string payload) {
293319
}
294320
};
295321

296-
auto post_result = http_client_->post(
297-
telemetry_endpoint_, set_telemetry_headers, std::move(payload),
298-
telemetry_on_response_, telemetry_on_error_, clock_().tick + 5s);
322+
auto telemetry_on_response = [this, logger = logger_](
323+
int response_status,
324+
const DictReader& /*response_headers*/,
325+
std::string response_body) {
326+
if (response_status >= 500) {
327+
increment_counter(internal_metrics::responses,
328+
{"status_code:5xx", "endpoint:agent"});
329+
} else if (response_status >= 400) {
330+
increment_counter(internal_metrics::responses,
331+
{"status_code:4xx", "endpoint:agent"});
332+
} else if (response_status >= 300) {
333+
increment_counter(internal_metrics::responses,
334+
{"status_code:3xx", "endpoint:agent"});
335+
} else if (response_status >= 200) {
336+
increment_counter(internal_metrics::responses,
337+
{"status_code:2xx", "endpoint:agent"});
338+
} else if (response_status >= 100) {
339+
increment_counter(internal_metrics::responses,
340+
{"status_code:1xx", "endpoint:agent"});
341+
}
342+
343+
if (response_status < 200 || response_status >= 300) {
344+
logger->log_error([&](auto& stream) {
345+
stream << "Unexpected telemetry response status " << response_status
346+
<< " with body (if any, starts on next line):\n"
347+
<< response_body;
348+
});
349+
}
350+
};
351+
352+
// Callback for unsuccessful telemetry HTTP requests.
353+
auto telemetry_on_error = [this, logger = logger_](Error error) {
354+
increment_counter(internal_metrics::errors,
355+
{"type:network", "endpoint:agent"});
356+
logger->log_error(error.with_prefix(
357+
"Error occurred during HTTP request for telemetry: "));
358+
};
359+
360+
increment_counter(internal_metrics::requests, {"endpoint:agent"});
361+
add_datapoint(internal_metrics::bytes_sent, {"endpoint:agent"},
362+
payload.size());
363+
364+
auto post_result =
365+
http_client_->post(telemetry_endpoint_, set_telemetry_headers,
366+
std::move(payload), std::move(telemetry_on_response),
367+
std::move(telemetry_on_error), clock_().tick + 5s);
299368
if (auto* error = post_result.if_error()) {
369+
increment_counter(internal_metrics::errors,
370+
{"type:network", "endpoint:agent"});
300371
logger_->log_error(
301372
error->with_prefix("Unexpected error submitting telemetry event: "));
302373
}

src/datadog/telemetry/telemetry_impl.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ class Telemetry final {
3434
/// Shared pointer to the user logger instance.
3535
std::shared_ptr<tracing::Logger> logger_;
3636
std::vector<tracing::EventScheduler::Cancel> tasks_;
37-
tracing::HTTPClient::ResponseHandler telemetry_on_response_;
38-
tracing::HTTPClient::ErrorHandler telemetry_on_error_;
3937
tracing::HTTPClient::URL telemetry_endpoint_;
4038
tracing::TracerSignature tracer_signature_;
4139
std::shared_ptr<tracing::HTTPClient> http_client_;

src/datadog/telemetry_metrics.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,36 @@ namespace datadog::tracing::metrics {
44

55
namespace tracer {
66
const telemetry::Counter spans_created = {"spans_created", "tracers", true};
7+
const telemetry::Counter spans_dropped = {"spans_dropped", "tracers", true};
78
const telemetry::Counter spans_finished = {"spans_finished", "tracers", true};
89

910
const telemetry::Counter trace_segments_created = {"trace_segments_created",
1011
"tracers", true};
1112

1213
const telemetry::Counter trace_segments_closed = {"trace_segments_closed",
1314
"tracers", true};
15+
16+
const telemetry::Distribution trace_chunk_size = {"trace_chunk_size", "tracers",
17+
true};
18+
19+
const telemetry::Distribution trace_chunk_serialized_bytes = {
20+
"trace_chunk_serialization.bytes", "tracers", true};
21+
22+
const telemetry::Distribution trace_chunk_serialization_duration = {
23+
"trace_chunk_serialization.ms", "tracers", true};
24+
25+
const telemetry::Counter trace_chunks_enqueued = {"trace_chunks_enqueued",
26+
"tracers", true};
27+
28+
const telemetry::Counter trace_chunks_enqueued_for_serialization = {
29+
"trace_chunks_enqueued_for_serialization", "tracers", true};
30+
31+
const telemetry::Counter trace_chunks_dropped = {"trace_chunks_dropped",
32+
"tracers", true};
33+
34+
const telemetry::Counter trace_chunks_sent = {"trace_chunks_sent", "tracers",
35+
true};
36+
1437
const telemetry::Counter context_header_truncated = {
1538
"context_header.truncated",
1639
"tracers",
@@ -20,8 +43,21 @@ const telemetry::Counter context_header_truncated = {
2043
namespace api {
2144
const telemetry::Counter requests = {"trace_api.requests", "tracers", true};
2245
const telemetry::Counter responses = {"trace_api.responses", "tracers", true};
46+
const telemetry::Distribution bytes_sent = {"trace_api.bytes", "tracers", true};
47+
const telemetry::Distribution request_duration = {"trace_api.ms", "tracers",
48+
true};
2349
const telemetry::Counter errors = {"trace_api.errors", "tracers", true};
2450
} // namespace api
51+
52+
namespace trace_context {
53+
const telemetry::Counter injected = {"context_header_style.injected", "tracers",
54+
true};
55+
const telemetry::Counter extracted = {"context_header_style.extracted",
56+
"tracers", true};
57+
const telemetry::Counter truncated = {"context_header.truncated", "tracers",
58+
true};
59+
} // namespace trace_context
60+
2561
} // namespace tracer
2662

2763
} // namespace datadog::tracing::metrics

0 commit comments

Comments
 (0)