Skip to content

Commit 751bb0d

Browse files
thzteh-cmc
andauthored
re_perf_telemetry crate: allow metric scraping (#11051)
- **Allow scraping of metrics (http endpoint).** Previously metrics could only be pushed to an OTLP endpoint. This change adds the ability to start a dedicated HTTP server to expose a /metrics endpoint for Prometheus-style scraping. This change does not enable a metrics endpoint in the rerun viewer, but it allows doing so by calling `::start_metrics_server()`. --------- Co-authored-by: Clement Rey <[email protected]>
1 parent 27ea500 commit 751bb0d

File tree

8 files changed

+927
-23
lines changed

8 files changed

+927
-23
lines changed

Cargo.lock

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,10 +1018,13 @@ checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5"
10181018
dependencies = [
10191019
"axum-core",
10201020
"bytes",
1021+
"form_urlencoded",
10211022
"futures-util",
10221023
"http",
10231024
"http-body",
10241025
"http-body-util",
1026+
"hyper",
1027+
"hyper-util",
10251028
"itoa",
10261029
"matchit",
10271030
"memchr",
@@ -1030,10 +1033,15 @@ dependencies = [
10301033
"pin-project-lite",
10311034
"rustversion",
10321035
"serde",
1036+
"serde_json",
1037+
"serde_path_to_error",
1038+
"serde_urlencoded",
10331039
"sync_wrapper",
1040+
"tokio",
10341041
"tower",
10351042
"tower-layer",
10361043
"tower-service",
1044+
"tracing",
10371045
]
10381046

10391047
[[package]]
@@ -1053,6 +1061,7 @@ dependencies = [
10531061
"sync_wrapper",
10541062
"tower-layer",
10551063
"tower-service",
1064+
"tracing",
10561065
]
10571066

10581067
[[package]]
@@ -3007,6 +3016,12 @@ version = "0.1.2"
30073016
source = "registry+https://github.com/rust-lang/crates.io-index"
30083017
checksum = "d8b14ccef22fc6f5a8f4d7d768562a182c04ce9a3b3157b91390b52ddfdf1a76"
30093018

3019+
[[package]]
3020+
name = "dtoa"
3021+
version = "1.0.10"
3022+
source = "registry+https://github.com/rust-lang/crates.io-index"
3023+
checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04"
3024+
30103025
[[package]]
30113026
name = "ecolor"
30123027
version = "0.32.1"
@@ -6747,6 +6762,29 @@ dependencies = [
67476762
"syn 2.0.104",
67486763
]
67496764

6765+
[[package]]
6766+
name = "prometheus-client"
6767+
version = "0.24.0"
6768+
source = "registry+https://github.com/rust-lang/crates.io-index"
6769+
checksum = "e4500adecd7af8e0e9f4dbce15cfee07ce913fbf6ad605cc468b83f2d531ee94"
6770+
dependencies = [
6771+
"dtoa",
6772+
"itoa",
6773+
"parking_lot",
6774+
"prometheus-client-derive-encode",
6775+
]
6776+
6777+
[[package]]
6778+
name = "prometheus-client-derive-encode"
6779+
version = "0.5.0"
6780+
source = "registry+https://github.com/rust-lang/crates.io-index"
6781+
checksum = "9adf1691c04c0a5ff46ff8f262b58beb07b0dbb61f96f9f54f6cbd82106ed87f"
6782+
dependencies = [
6783+
"proc-macro2",
6784+
"quote",
6785+
"syn 2.0.104",
6786+
]
6787+
67506788
[[package]]
67516789
name = "prost"
67526790
version = "0.13.5"
@@ -7964,6 +8002,7 @@ version = "0.25.0-alpha.1+dev"
79648002
dependencies = [
79658003
"ahash",
79668004
"anyhow",
8005+
"axum",
79678006
"base64 0.22.1",
79688007
"clap",
79698008
"http",
@@ -7972,8 +8011,10 @@ dependencies = [
79728011
"opentelemetry-otlp",
79738012
"opentelemetry_sdk",
79748013
"parking_lot",
8014+
"prometheus-client",
79758015
"serde",
79768016
"serde_json",
8017+
"tokio",
79778018
"tonic",
79788019
"tower",
79798020
"tower-http",
@@ -9831,6 +9872,16 @@ dependencies = [
98319872
"serde",
98329873
]
98339874

9875+
[[package]]
9876+
name = "serde_path_to_error"
9877+
version = "0.1.17"
9878+
source = "registry+https://github.com/rust-lang/crates.io-index"
9879+
checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a"
9880+
dependencies = [
9881+
"itoa",
9882+
"serde",
9883+
]
9884+
98349885
[[package]]
98359886
name = "serde_repr"
98369887
version = "0.1.20"

crates/utils/re_perf_telemetry/Cargo.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,10 @@ opentelemetry-appender-tracing = { workspace = true, features = [
5656
"experimental_use_tracing_span_context",
5757
] }
5858
opentelemetry-otlp = { workspace = true, features = ["grpc-tonic"] }
59-
opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] }
59+
opentelemetry_sdk = { workspace = true, features = [
60+
"rt-tokio",
61+
"experimental_metrics_custom_reader",
62+
] }
6063
parking_lot.workspace = true
6164
serde.workspace = true
6265
serde_json.workspace = true
@@ -70,6 +73,9 @@ tracing-subscriber = { workspace = true, features = [
7073
"env-filter",
7174
"json",
7275
] }
76+
axum = "0.8.4"
77+
prometheus-client = "0.24"
78+
tokio = { workspace = true }
7379

7480
# External (optional)
7581
tracing-tracy = { workspace = true, optional = true }

crates/utils/re_perf_telemetry/src/args.rs

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -214,19 +214,27 @@ pub struct TelemetryArgs {
214214

215215
/// The HTTP OTLP endpoint to send the metrics to.
216216
///
217-
/// It's fine for the target endpoint to be down.
218-
///
219217
/// Part of the `OpenTelemetry` spec.
220-
#[clap(
221-
long,
222-
env = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT",
223-
default_value = "http://localhost:9090/api/v1/otlp/v1/metrics"
224-
)]
218+
#[clap(long, env = "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", default_value = "")]
225219
pub metric_endpoint: String,
226220

227221
/// The interval in milliseconds at which metrics are pushed to the collector.
228222
///
229223
/// Part of the `OpenTelemetry` spec.
230224
#[clap(long, env = "OTEL_METRIC_EXPORT_INTERVAL", default_value = "10000")]
231225
pub metric_interval: String,
226+
227+
/// Listening address for dedicated HTTP /metrics endpoint for scraping.
228+
///
229+
/// Setting this has no immediate effect. The actual listener has to be
230+
/// started by calling `Telemetry::start_metrics_listener()`.
231+
///
232+
/// Metrics are the same as those being pushed to the OTLP endpoint.
233+
///
234+
/// Format: ":9091", "0.0.0.0:9091", or "127.0.0.1:9091"
235+
/// Empty value means the listener is disabled.
236+
///
237+
/// This has no effect if `TELEMETRY_ENABLED` or `OTEL_SDK_ENABLED` is false.
238+
#[clap(long, env = "METRICS_LISTEN_ADDRESS", default_value = "")]
239+
pub metrics_listen_address: String,
232240
}

crates/utils/re_perf_telemetry/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939
4040
mod args;
4141
mod grpc;
42+
mod metrics_server;
43+
mod prometheus;
44+
mod shared_reader;
4245
mod telemetry;
4346
mod utils;
4447

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
//! HTTP server for metrics collection and exposition
2+
3+
use parking_lot::Mutex;
4+
use std::net::SocketAddr;
5+
use std::sync::Arc;
6+
7+
use axum::{
8+
Router,
9+
extract::State,
10+
http::{StatusCode, header},
11+
response::IntoResponse,
12+
routing::get,
13+
};
14+
use opentelemetry_sdk::metrics::{ManualReader, data::ResourceMetrics, reader::MetricReader as _};
15+
use tokio::net::TcpListener;
16+
use tracing::error;
17+
18+
use crate::prometheus::{MetricContainer, convert_to_prometheus, encode_registry};
19+
20+
/// Start a metrics server that binds synchronously and serves asynchronously.
21+
/// Returns the bound socket address after successful binding.
22+
/// The server continues running in the spawned task.
23+
pub(crate) async fn start_metrics_server(
24+
address: &str,
25+
reader: Arc<ManualReader>,
26+
) -> anyhow::Result<SocketAddr> {
27+
let addr: SocketAddr = address.parse().map_err(|err| {
28+
anyhow::anyhow!(
29+
"Failed to parse metrics listen address '{}': {}",
30+
address,
31+
err
32+
)
33+
})?;
34+
35+
let app = Router::new()
36+
.route("/metrics", get(manual_metrics_handler))
37+
.with_state(reader);
38+
39+
// Bind synchronously to catch binding errors immediately
40+
let listener = TcpListener::bind(addr)
41+
.await
42+
.map_err(|err| anyhow::anyhow!("Failed to bind to {}: {}", addr, err))?;
43+
44+
let bound_addr = listener
45+
.local_addr()
46+
.map_err(|err| anyhow::anyhow!("Failed to get local address: {}", err))?;
47+
48+
// Spawn the server task to run asynchronously
49+
tokio::spawn(async move {
50+
if let Err(err) = axum::serve(listener, app).await {
51+
error!("Metrics server error: {}", err);
52+
}
53+
});
54+
55+
Ok(bound_addr)
56+
}
57+
58+
/// Handler for the ManualReader-based /metrics endpoint
59+
/// This collects metrics on-demand from `OpenTelemetry's` `ManualReader`
60+
async fn manual_metrics_handler(State(reader): State<Arc<ManualReader>>) -> impl IntoResponse {
61+
// This handler is picking up data from telemetry SDK's ManualReader,
62+
// this is a temporary solution to expose metrics in different ways
63+
// (pull and push).
64+
// This is to be replaced in the future with a less complex solution,
65+
// using only a single approach.
66+
let mut resource_metrics = ResourceMetrics::default();
67+
68+
// Collect metrics from ManualReader
69+
match reader.collect(&mut resource_metrics) {
70+
Ok(_) => {
71+
let metrics = Arc::new(Mutex::new(MetricContainer::new()));
72+
73+
// Convert ResourceMetrics to Prometheus metrics and get the registry
74+
let registry = convert_to_prometheus(&resource_metrics, &metrics);
75+
76+
// Encode metrics to Prometheus text format
77+
match encode_registry(&registry) {
78+
Ok(buffer) => (
79+
StatusCode::OK,
80+
[(header::CONTENT_TYPE, "text/plain; version=0.0.4")],
81+
buffer,
82+
),
83+
Err(err) => {
84+
error!("Failed to encode metrics: {}", err);
85+
(
86+
StatusCode::INTERNAL_SERVER_ERROR,
87+
[(header::CONTENT_TYPE, "text/plain")],
88+
format!("Failed to encode metrics: {err}"),
89+
)
90+
}
91+
}
92+
}
93+
Err(err) => {
94+
error!("Failed to collect metrics from ManualReader: {}", err);
95+
(
96+
StatusCode::INTERNAL_SERVER_ERROR,
97+
[(header::CONTENT_TYPE, "text/plain")],
98+
format!("Failed to collect metrics: {err}"),
99+
)
100+
}
101+
}
102+
}

0 commit comments

Comments
 (0)