Skip to content

Commit 0a1b9e4

Browse files
breuhan and spoutn1k authored
Add metrics for operation time statistics (#103)
* Add metrics for operation time statistics
* Remove period
---------
Co-authored-by: Jean-Baptiste Skutnik <[email protected]>
1 parent bbd606b commit 0a1b9e4

18 files changed

+474
-95
lines changed

lustrefs-exporter/src/lib.rs

Lines changed: 17 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,20 @@ pub mod tests {
140140
use prometheus_parse::{Sample, Scrape};
141141
use serial_test::serial;
142142
use std::{
143-
collections::{BTreeSet, HashSet},
143+
collections::HashSet,
144144
path::{Path, PathBuf},
145145
};
146146

147+
// These metrics are ignored for the comparison with the previous implementation
148+
// since they are new and not present in the previous implementation.
149+
const IGNORED_METRICS: &[&str] = &[
150+
"target_info",
151+
"lustre_health_healthy",
152+
"lustre_stats_time_total",
153+
"lustre_stats_time_max",
154+
"lustre_stats_time_min",
155+
];
156+
147157
#[test]
148158
fn test_error_into_response() {
149159
let error = Error::NoCap("test_param", "test_content".to_string());
@@ -312,123 +322,35 @@ pub mod tests {
312322
fn compare_snapshots_to_existing_otel_snapshots() -> Result<(), Box<dyn std::error::Error>> {
313323
insta::glob!("otel_snapshots/", "*.otelsnap", |path| {
314324
let snap_name = path.file_name().unwrap();
315-
316325
let snap_file = path
317326
.parent()
318327
.unwrap()
319328
.parent()
320329
.unwrap()
321330
.join("snapshots")
322331
.join(snap_name.to_string_lossy().replace(".otelsnap", ".snap"));
332+
let otel_metrics = read_metrics_from_snapshot(path);
333+
let metrics = read_metrics_from_snapshot(&snap_file);
323334

324-
let otel_snapshot_contents = std::fs::read_to_string(path).unwrap();
325-
326-
let snapshot_contents = std::fs::read_to_string(&snap_file).unwrap();
327-
328-
let otel_snapshot = normalize_snapshot_for_otel_comparison(&otel_snapshot_contents);
329-
330-
let snapshot = normalize_snapshot_for_otel_comparison(&snapshot_contents);
331-
332-
let only_in_otel_snapshots = otel_snapshot
333-
.difference(&snapshot)
334-
.cloned()
335-
.collect::<Vec<_>>();
336-
337-
let only_in_snapshots = snapshot
338-
.difference(&otel_snapshot)
339-
.cloned()
340-
.collect::<Vec<_>>();
341-
342-
let snapshots_equal =
343-
if only_in_otel_snapshots.is_empty() && only_in_snapshots.is_empty() {
344-
true
345-
} else {
346-
if !only_in_otel_snapshots.is_empty() {
347-
eprintln!("Metrics only in {}:", path.display());
348-
349-
for metric in only_in_otel_snapshots {
350-
eprintln!("{metric:?}");
351-
}
352-
}
353-
354-
if !only_in_snapshots.is_empty() {
355-
eprintln!("Metrics only in {}:", snap_file.display());
356-
357-
for metric in only_in_snapshots {
358-
eprintln!("{metric:?}");
359-
}
360-
}
361-
362-
false
363-
};
364-
365-
assert!(snapshots_equal, "Snapshots are not equal.");
335+
compare_metrics(&otel_metrics, &metrics);
366336
});
367337

368338
Ok(())
369339
}
370340

371-
fn normalize_snapshot_for_otel_comparison(x: &str) -> BTreeSet<String> {
372-
x.lines()
373-
.filter(|x| {
374-
!x.contains("target_info")
375-
&& !x.contains("# EOF")
376-
&& !x.starts_with("source:")
377-
&& !x.starts_with("expression")
378-
&& !x.starts_with("\"")
379-
&& !x.starts_with("---")
380-
&& !x.is_empty()
381-
})
382-
.map(String::from)
383-
.map(|x| {
384-
if !x.starts_with("# HELP") {
385-
return x;
386-
}
387-
388-
x.strip_suffix(".").unwrap_or(&x).to_string()
389-
})
390-
.map(|x| {
391-
if x.starts_with('#') {
392-
return x.to_string();
393-
}
394-
395-
let Some((metric_name, rest)) = x.split_once("{") else {
396-
return x.to_string();
397-
};
398-
399-
let Some((labels, value)) = rest.split_once("}") else {
400-
return x.to_string();
401-
};
402-
403-
let labels = labels
404-
.trim()
405-
.split(",")
406-
.filter(|x| !x.contains("otel_scope_name"))
407-
.collect::<BTreeSet<_>>();
408-
409-
format!(
410-
"{}{{{}}} {}",
411-
metric_name.trim(),
412-
labels.into_iter().collect::<Vec<_>>().join(","),
413-
value.trim()
414-
)
415-
})
416-
.collect::<BTreeSet<_>>()
417-
}
418-
419341
pub(super) fn compare_metrics(metrics1: &Scrape, metrics2: &Scrape) {
420342
// Skip OTEL specific metric and updated metrics.
421343
let set1: HashSet<_> = metrics1
422344
.samples
423345
.iter()
424-
.filter(|s| s.metric != "target_info")
346+
.filter(|s| !IGNORED_METRICS.contains(&s.metric.as_str()))
425347
.map(normalize_sample)
426348
.collect();
427349

428350
let set2: HashSet<_> = metrics2
429351
.samples
430352
.iter()
431-
.filter(|s| s.metric != "target_info")
353+
.filter(|s| !IGNORED_METRICS.contains(&s.metric.as_str()))
432354
.map(normalize_sample)
433355
.collect();
434356

@@ -565,7 +487,7 @@ pub mod tests {
565487
let mut sorted_docs: Vec<_> = docs
566488
.iter()
567489
.filter_map(|(k, v)| {
568-
if k != "target_info" && k != "lustre_health_healthy" {
490+
if !IGNORED_METRICS.contains(&k.as_str()) {
569491
Some((k.clone(), v.strip_suffix(".").unwrap_or(v).to_string()))
570492
} else {
571493
None

lustrefs-exporter/src/snapshots/lustrefs_exporter__routes__tests__jobstats_with_stderr_output.snap

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1219,4 +1219,34 @@ lustre_stats_total{component="mdt",operation="open",target="ai400x2-MDT0000"} 51
12191219
lustre_stats_total{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 516
12201220
lustre_stats_total{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 124801
12211221
lustre_stats_total{component="mdt",operation="sync",target="ai400x2-MDT0000"} 512
1222+
# HELP lustre_stats_time_min Minimum time taken for an operation in microseconds.
1223+
# TYPE lustre_stats_time_min gauge
1224+
lustre_stats_time_min{component="mdt",operation="close",target="ai400x2-MDT0000"} 6
1225+
lustre_stats_time_min{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 1
1226+
lustre_stats_time_min{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 6
1227+
lustre_stats_time_min{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 59
1228+
lustre_stats_time_min{component="mdt",operation="open",target="ai400x2-MDT0000"} 64
1229+
lustre_stats_time_min{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 26
1230+
lustre_stats_time_min{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 0
1231+
lustre_stats_time_min{component="mdt",operation="sync",target="ai400x2-MDT0000"} 4
1232+
# HELP lustre_stats_time_max Maximum time taken for an operation in microseconds.
1233+
# TYPE lustre_stats_time_max gauge
1234+
lustre_stats_time_max{component="mdt",operation="close",target="ai400x2-MDT0000"} 4052
1235+
lustre_stats_time_max{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 2989
1236+
lustre_stats_time_max{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 1091
1237+
lustre_stats_time_max{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 39101
1238+
lustre_stats_time_max{component="mdt",operation="open",target="ai400x2-MDT0000"} 39146
1239+
lustre_stats_time_max{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 121
1240+
lustre_stats_time_max{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 95
1241+
lustre_stats_time_max{component="mdt",operation="sync",target="ai400x2-MDT0000"} 50
1242+
# HELP lustre_stats_time_total Total time taken for an operation in microseconds.
1243+
# TYPE lustre_stats_time_total gauge
1244+
lustre_stats_time_total{component="mdt",operation="close",target="ai400x2-MDT0000"} 2630805
1245+
lustre_stats_time_total{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 989676
1246+
lustre_stats_time_total{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 687561
1247+
lustre_stats_time_total{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 767427
1248+
lustre_stats_time_total{component="mdt",operation="open",target="ai400x2-MDT0000"} 772784
1249+
lustre_stats_time_total{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 22544
1250+
lustre_stats_time_total{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 1292933
1251+
lustre_stats_time_total{component="mdt",operation="sync",target="ai400x2-MDT0000"} 7767
12221252
# EOF

lustrefs-exporter/src/snapshots/lustrefs_exporter__routes__tests__metrics_endpoint_is_idempotent.snap

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1219,4 +1219,34 @@ lustre_stats_total{component="mdt",operation="open",target="ai400x2-MDT0000"} 51
12191219
lustre_stats_total{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 516
12201220
lustre_stats_total{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 124801
12211221
lustre_stats_total{component="mdt",operation="sync",target="ai400x2-MDT0000"} 512
1222+
# HELP lustre_stats_time_min Minimum time taken for an operation in microseconds.
1223+
# TYPE lustre_stats_time_min gauge
1224+
lustre_stats_time_min{component="mdt",operation="close",target="ai400x2-MDT0000"} 6
1225+
lustre_stats_time_min{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 1
1226+
lustre_stats_time_min{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 6
1227+
lustre_stats_time_min{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 59
1228+
lustre_stats_time_min{component="mdt",operation="open",target="ai400x2-MDT0000"} 64
1229+
lustre_stats_time_min{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 26
1230+
lustre_stats_time_min{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 0
1231+
lustre_stats_time_min{component="mdt",operation="sync",target="ai400x2-MDT0000"} 4
1232+
# HELP lustre_stats_time_max Maximum time taken for an operation in microseconds.
1233+
# TYPE lustre_stats_time_max gauge
1234+
lustre_stats_time_max{component="mdt",operation="close",target="ai400x2-MDT0000"} 4052
1235+
lustre_stats_time_max{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 2989
1236+
lustre_stats_time_max{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 1091
1237+
lustre_stats_time_max{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 39101
1238+
lustre_stats_time_max{component="mdt",operation="open",target="ai400x2-MDT0000"} 39146
1239+
lustre_stats_time_max{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 121
1240+
lustre_stats_time_max{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 95
1241+
lustre_stats_time_max{component="mdt",operation="sync",target="ai400x2-MDT0000"} 50
1242+
# HELP lustre_stats_time_total Total time taken for an operation in microseconds.
1243+
# TYPE lustre_stats_time_total gauge
1244+
lustre_stats_time_total{component="mdt",operation="close",target="ai400x2-MDT0000"} 2630805
1245+
lustre_stats_time_total{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 989676
1246+
lustre_stats_time_total{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 687561
1247+
lustre_stats_time_total{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 767427
1248+
lustre_stats_time_total{component="mdt",operation="open",target="ai400x2-MDT0000"} 772784
1249+
lustre_stats_time_total{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 22544
1250+
lustre_stats_time_total{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 1292933
1251+
lustre_stats_time_total{component="mdt",operation="sync",target="ai400x2-MDT0000"} 7767
12221252
# EOF

lustrefs-exporter/src/snapshots/lustrefs_exporter__tests__host_stats_non_healthy_otel.snap

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -611,4 +611,43 @@ lustre_stats_total{component="mdt",operation="setattr",target="ai400x2-MDT0000"}
611611
lustre_stats_total{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 91893
612612
lustre_stats_total{component="mdt",operation="sync",target="ai400x2-MDT0000"} 224
613613
lustre_stats_total{component="mdt",operation="unlink",target="ai400x2-MDT0000"} 3
614+
# HELP lustre_stats_time_min Minimum time taken for an operation in microseconds.
615+
# TYPE lustre_stats_time_min gauge
616+
lustre_stats_time_min{component="mdt",operation="close",target="ai400x2-MDT0000"} 7
617+
lustre_stats_time_min{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 0
618+
lustre_stats_time_min{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 6
619+
lustre_stats_time_min{component="mdt",operation="mkdir",target="ai400x2-MDT0000"} 162
620+
lustre_stats_time_min{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 47
621+
lustre_stats_time_min{component="mdt",operation="open",target="ai400x2-MDT0000"} 24
622+
lustre_stats_time_min{component="mdt",operation="rmdir",target="ai400x2-MDT0000"} 58
623+
lustre_stats_time_min{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 18
624+
lustre_stats_time_min{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 0
625+
lustre_stats_time_min{component="mdt",operation="sync",target="ai400x2-MDT0000"} 2
626+
lustre_stats_time_min{component="mdt",operation="unlink",target="ai400x2-MDT0000"} 412
627+
# HELP lustre_stats_time_max Maximum time taken for an operation in microseconds.
628+
# TYPE lustre_stats_time_max gauge
629+
lustre_stats_time_max{component="mdt",operation="close",target="ai400x2-MDT0000"} 255
630+
lustre_stats_time_max{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 1740
631+
lustre_stats_time_max{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 47
632+
lustre_stats_time_max{component="mdt",operation="mkdir",target="ai400x2-MDT0000"} 2911
633+
lustre_stats_time_max{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 1081
634+
lustre_stats_time_max{component="mdt",operation="open",target="ai400x2-MDT0000"} 1091
635+
lustre_stats_time_max{component="mdt",operation="rmdir",target="ai400x2-MDT0000"} 115
636+
lustre_stats_time_max{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 241
637+
lustre_stats_time_max{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 65
638+
lustre_stats_time_max{component="mdt",operation="sync",target="ai400x2-MDT0000"} 24
639+
lustre_stats_time_max{component="mdt",operation="unlink",target="ai400x2-MDT0000"} 4498
640+
# HELP lustre_stats_time_total Total time taken for an operation in microseconds.
641+
# TYPE lustre_stats_time_total gauge
642+
lustre_stats_time_total{component="mdt",operation="close",target="ai400x2-MDT0000"} 191804
643+
lustre_stats_time_total{component="mdt",operation="getattr",target="ai400x2-MDT0000"} 44670
644+
lustre_stats_time_total{component="mdt",operation="getxattr",target="ai400x2-MDT0000"} 50689
645+
lustre_stats_time_total{component="mdt",operation="mkdir",target="ai400x2-MDT0000"} 5639
646+
lustre_stats_time_total{component="mdt",operation="mknod",target="ai400x2-MDT0000"} 19926
647+
lustre_stats_time_total{component="mdt",operation="open",target="ai400x2-MDT0000"} 22203
648+
lustre_stats_time_total{component="mdt",operation="rmdir",target="ai400x2-MDT0000"} 302
649+
lustre_stats_time_total{component="mdt",operation="setattr",target="ai400x2-MDT0000"} 8279
650+
lustre_stats_time_total{component="mdt",operation="statfs",target="ai400x2-MDT0000"} 634431
651+
lustre_stats_time_total{component="mdt",operation="sync",target="ai400x2-MDT0000"} 1433
652+
lustre_stats_time_total{component="mdt",operation="unlink",target="ai400x2-MDT0000"} 5408
614653
# EOF

lustrefs-exporter/src/snapshots/lustrefs_exporter__tests__lnetctl_stats_mds_otel.snap

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,48 @@ lustre_stats_total{component="mdt",operation="setattr",target="fs-MDT0000"} 3
432432
lustre_stats_total{component="mdt",operation="statfs",target="fs-MDT0000"} 4
433433
lustre_stats_total{component="mdt",operation="write",target="fs-MDT0000"} 1
434434
lustre_stats_total{component="mdt",operation="write_bytes",target="fs-MDT0000"} 1
435+
# HELP lustre_stats_time_min Minimum time taken for an operation in microseconds.
436+
# TYPE lustre_stats_time_min gauge
437+
lustre_stats_time_min{component="mdt",operation="close",target="fs-MDT0000"} 17
438+
lustre_stats_time_min{component="mdt",operation="getattr",target="fs-MDT0000"} 10
439+
lustre_stats_time_min{component="mdt",operation="getxattr",target="fs-MDT0000"} 15
440+
lustre_stats_time_min{component="mdt",operation="mknod",target="fs-MDT0000"} 223
441+
lustre_stats_time_min{component="mdt",operation="open",target="fs-MDT0000"} 52
442+
lustre_stats_time_min{component="mdt",operation="punch",target="fs-MDT0000"} 28
443+
lustre_stats_time_min{component="mdt",operation="read",target="fs-MDT0000"} 4577
444+
lustre_stats_time_min{component="mdt",operation="read_bytes",target="fs-MDT0000"} 4096
445+
lustre_stats_time_min{component="mdt",operation="setattr",target="fs-MDT0000"} 100
446+
lustre_stats_time_min{component="mdt",operation="statfs",target="fs-MDT0000"} 0
447+
lustre_stats_time_min{component="mdt",operation="write",target="fs-MDT0000"} 363
448+
lustre_stats_time_min{component="mdt",operation="write_bytes",target="fs-MDT0000"} 409600
449+
# HELP lustre_stats_time_max Maximum time taken for an operation in microseconds.
450+
# TYPE lustre_stats_time_max gauge
451+
lustre_stats_time_max{component="mdt",operation="close",target="fs-MDT0000"} 103
452+
lustre_stats_time_max{component="mdt",operation="getattr",target="fs-MDT0000"} 6928
453+
lustre_stats_time_max{component="mdt",operation="getxattr",target="fs-MDT0000"} 15
454+
lustre_stats_time_max{component="mdt",operation="mknod",target="fs-MDT0000"} 5357
455+
lustre_stats_time_max{component="mdt",operation="open",target="fs-MDT0000"} 16976
456+
lustre_stats_time_max{component="mdt",operation="punch",target="fs-MDT0000"} 28
457+
lustre_stats_time_max{component="mdt",operation="read",target="fs-MDT0000"} 10837
458+
lustre_stats_time_max{component="mdt",operation="read_bytes",target="fs-MDT0000"} 405504
459+
lustre_stats_time_max{component="mdt",operation="setattr",target="fs-MDT0000"} 319
460+
lustre_stats_time_max{component="mdt",operation="statfs",target="fs-MDT0000"} 33
461+
lustre_stats_time_max{component="mdt",operation="write",target="fs-MDT0000"} 363
462+
lustre_stats_time_max{component="mdt",operation="write_bytes",target="fs-MDT0000"} 409600
463+
# HELP lustre_stats_time_total Total time taken for an operation in microseconds.
464+
# TYPE lustre_stats_time_total gauge
465+
lustre_stats_time_total{component="mdt",operation="close",target="fs-MDT0000"} 352
466+
lustre_stats_time_total{component="mdt",operation="getattr",target="fs-MDT0000"} 7053
467+
lustre_stats_time_total{component="mdt",operation="getxattr",target="fs-MDT0000"} 15
468+
lustre_stats_time_total{component="mdt",operation="mknod",target="fs-MDT0000"} 5580
469+
lustre_stats_time_total{component="mdt",operation="open",target="fs-MDT0000"} 23055
470+
lustre_stats_time_total{component="mdt",operation="punch",target="fs-MDT0000"} 28
471+
lustre_stats_time_total{component="mdt",operation="read",target="fs-MDT0000"} 15414
472+
lustre_stats_time_total{component="mdt",operation="read_bytes",target="fs-MDT0000"} 409600
473+
lustre_stats_time_total{component="mdt",operation="setattr",target="fs-MDT0000"} 605
474+
lustre_stats_time_total{component="mdt",operation="statfs",target="fs-MDT0000"} 63
475+
lustre_stats_time_total{component="mdt",operation="write",target="fs-MDT0000"} 363
476+
lustre_stats_time_total{component="mdt",operation="write_bytes",target="fs-MDT0000"} 409600
435477
# HELP lustre_client_export_stats Number of operations the target has performed per export.
436478
# TYPE lustre_client_export_stats counter
437479
lustre_client_export_stats{component="mdt",name="close",nid="0@lo",target="fs-MDT0000",units="usecs"} 6

0 commit comments

Comments
 (0)