Skip to content

Commit 334aa94

Browse files
committed
Merge branch 'igornovg/control-plane-metrics' into 'master'
BOUN-774: control-plane: atomic node metrics

Make Prometheus export updated metrics only after a check run has finished. This is accomplished by:

* Adding a finish callback to `CheckPersistRunner` that gets executed when the run is done
* Extracting the relevant metrics in the callback and storing them in a separate Mutex-guarded variable
* Merging all other metrics with the check-related ones in the metrics exporter and outputting them

Not perfect, but it does the job.

See merge request dfinity-lab/public/ic!12480
2 parents c829fc3 + d591431 commit 334aa94

File tree

4 files changed

+61
-13
lines changed

4 files changed

+61
-13
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rs/boundary_node/control_plane/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ DEPENDENCIES = [
1111
"//rs/types/types",
1212
"@crate_index//:anyhow",
1313
"@crate_index//:async-scoped",
14+
"@crate_index//:arc-swap",
1415
"@crate_index//:axum",
1516
"@crate_index//:bytes",
1617
"@crate_index//:candid",

rs/boundary_node/control_plane/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ edition = "2021"
55

66
[dependencies]
77
anyhow = "1.0.58"
8+
arc-swap = "1.6.0"
89
async-scoped = { version = "0.7", features = ["use-tokio"] }
910
async-trait = "0.1.56"
1011
axum = "0.6.1"

rs/boundary_node/control_plane/src/main.rs

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use std::{
77
};
88

99
use anyhow::{anyhow, Context, Error};
10+
use arc_swap::ArcSwap;
1011
use async_trait::async_trait;
1112
use axum::{
1213
body::Body,
@@ -29,7 +30,9 @@ use opentelemetry::{
2930
KeyValue,
3031
};
3132
use opentelemetry_prometheus::{ExporterBuilder, PrometheusExporter};
33+
use prometheus::proto::MetricFamily;
3234
use prometheus::{Encoder as PrometheusEncoder, TextEncoder};
35+
3336
use regex::Regex;
3437
use tokio::{sync::Semaphore, task};
3538
use tracing::info;
@@ -59,6 +62,7 @@ use crate::{
5962
};
6063

6164
const SERVICE_NAME: &str = "control-plane";
65+
const CHECKER_METRIC_PREFIX: &str = "control_plane_check_";
6266

6367
const SECOND: Duration = Duration::from_secs(1);
6468
const MINUTE: Duration = Duration::from_secs(60);
@@ -151,18 +155,20 @@ async fn main() -> Result<(), Error> {
151155
)
152156
.expect("failed to set global subscriber");
153157

154-
let exporter = ExporterBuilder::new(
155-
controllers::basic(
156-
processors::factory(
157-
selectors::simple::histogram([]),
158-
aggregation::cumulative_temporality_selector(),
158+
let exporter = Arc::new(
159+
ExporterBuilder::new(
160+
controllers::basic(
161+
processors::factory(
162+
selectors::simple::histogram([]),
163+
aggregation::cumulative_temporality_selector(),
164+
)
165+
.with_memory(true),
159166
)
160-
.with_memory(true),
167+
.with_resource(Resource::new(vec![KeyValue::new("service", SERVICE_NAME)]))
168+
.build(),
161169
)
162-
.with_resource(Resource::new(vec![KeyValue::new("service", SERVICE_NAME)]))
163-
.build(),
164-
)
165-
.init();
170+
.init(),
171+
);
166172

167173
// Metrics
168174
let meter = global::meter(SERVICE_NAME);
@@ -211,6 +217,9 @@ async fn main() -> Result<(), Error> {
211217
let snapshot_runner = WithThrottle(snapshot_runner, ThrottleParams::new(1 * MINUTE));
212218
let mut snapshot_runner = snapshot_runner;
213219

220+
let checker_metrics: Vec<MetricFamily> = Vec::new();
221+
let checker_metrics = Arc::new(ArcSwap::from_pointee(checker_metrics));
222+
214223
let checker = Checker::new(http_client);
215224
let checker = CheckWithMetrics(
216225
checker,
@@ -299,8 +308,9 @@ async fn main() -> Result<(), Error> {
299308
let metrics_router = Router::new()
300309
.route("/metrics", get(metrics_handler))
301310
.with_state(MetricsHandlerArgs {
302-
exporter,
311+
exporter: Arc::clone(&exporter),
303312
active_replicas,
313+
checker_metrics: Arc::clone(&checker_metrics),
304314
});
305315

306316
info!(
@@ -319,8 +329,12 @@ async fn main() -> Result<(), Error> {
319329
}
320330
}),
321331
task::spawn(async move {
332+
let exporter = Arc::clone(&exporter);
333+
let checker_metrics = Arc::clone(&checker_metrics);
334+
322335
loop {
323336
let _ = check_persist_runner.run().await;
337+
update_checker_metrics(&exporter, &checker_metrics);
324338
}
325339
}),
326340
task::spawn(
@@ -336,18 +350,49 @@ async fn main() -> Result<(), Error> {
336350

337351
#[derive(Clone)]
338352
struct MetricsHandlerArgs<A> {
339-
exporter: PrometheusExporter,
353+
exporter: Arc<PrometheusExporter>,
340354
active_replicas: A,
355+
checker_metrics: Arc<ArcSwap<Vec<MetricFamily>>>,
356+
}
357+
358+
// Gathers metrics relevant to node checking and stores them in the ArcSwap
359+
fn update_checker_metrics(
360+
exporter: &Arc<PrometheusExporter>,
361+
checker_metrics: &Arc<ArcSwap<Vec<MetricFamily>>>,
362+
) {
363+
// Gather node checker metrics
364+
let metric_families = Arc::new(
365+
exporter
366+
.registry()
367+
.gather()
368+
.into_iter()
369+
.filter(|x| x.get_name().starts_with(CHECKER_METRIC_PREFIX))
370+
.collect::<Vec<_>>(),
371+
);
372+
373+
checker_metrics.store(metric_families);
341374
}
342375

343376
async fn metrics_handler<A: ActiveChecker>(
344377
State(MetricsHandlerArgs {
345378
exporter,
346379
active_replicas,
380+
checker_metrics,
347381
}): State<MetricsHandlerArgs<A>>,
348382
_: Request<Body>,
349383
) -> Response<Body> {
350-
let metric_families = exporter.registry().gather();
384+
// Read out all metrics that are not related to node checking
385+
let mut metric_families = exporter
386+
.registry()
387+
.gather()
388+
.into_iter()
389+
.filter(|x| !x.get_name().starts_with(CHECKER_METRIC_PREFIX))
390+
.collect::<Vec<_>>();
391+
392+
// Concatenate node checking metrics with all others & sort the result to be consistent with gather() output
393+
let mut _checker_metrics = { Vec::clone(&checker_metrics.load()) };
394+
metric_families.append(&mut _checker_metrics);
395+
metric_families.sort_by(|a, b| a.get_name().cmp(b.get_name()));
351396

352397
let encoder = TextEncoder::new();
353398

0 commit comments

Comments
 (0)