Skip to content

Commit 876ae02

Browse files
authored
metrics: Expose process_cpu_seconds_total as a float (#754)
Prometheus handles all values as `f64`, but we only expose values as whole integers. This means that the `process_cpu_seconds_total` metric only exposes whole-second values, even though Linux exposes process time in 10ms increments. This change modifies the `Counter` metric type to carry an additional marker type that provides a strategy for converting the stored `u64` value to `f64` for export. This strategy is employed so that we can continue to use `AtomicU64` to back counters and only use floats at export time. By default, the unit type is used to convert counters as before, but an alternate `MillisAsSeconds` strategy is used to expose fractional seconds from a millisecond counter. This necessitates changing the histogram buckets to floats as well. While this change doesn't modify the bucket values, it sets up future changes to latency metrics.
1 parent 1e9a001 commit 876ae02

File tree

7 files changed

+240
-179
lines changed

7 files changed

+240
-179
lines changed

linkerd/app/core/src/telemetry/process.rs

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,14 @@ impl FmtMetrics for Report {
5454
#[cfg(target_os = "linux")]
5555
mod system {
5656
use libc::{self, pid_t};
57-
use linkerd2_metrics::{metrics, Counter, FmtMetrics, Gauge};
57+
use linkerd2_metrics::{metrics, Counter, FmtMetrics, Gauge, MillisAsSeconds};
5858
use procinfo::pid;
5959
use std::fmt;
6060
use std::{fs, io};
6161
use tracing::{error, warn};
6262

6363
metrics! {
64-
process_cpu_seconds_total: Counter {
64+
process_cpu_seconds_total: Counter<MillisAsSeconds> {
6565
"Total user and system CPU time spent in seconds."
6666
},
6767
process_open_fds: Gauge { "Number of open file descriptors." },
@@ -77,16 +77,28 @@ mod system {
7777
#[derive(Clone, Debug)]
7878
pub(super) struct System {
7979
page_size: u64,
80-
clock_ticks_per_sec: u64,
80+
ms_per_tick: u64,
8181
}
8282

8383
impl System {
8484
pub fn new() -> io::Result<Self> {
8585
let page_size = Self::sysconf(libc::_SC_PAGESIZE, "page size")?;
86+
87+
// On Linux, CLK_TCK is ~always `100`, so pure integer division
88+
// works. This is probably not suitable if we encounter other
89+
// values.
8690
let clock_ticks_per_sec = Self::sysconf(libc::_SC_CLK_TCK, "clock ticks per second")?;
91+
let ms_per_tick = 1_000 / clock_ticks_per_sec;
92+
if clock_ticks_per_sec != 100 {
93+
warn!(
94+
clock_ticks_per_sec,
95+
ms_per_tick, "Unexpected value; process_cpu_seconds_total may be inaccurate."
96+
);
97+
}
98+
8799
Ok(Self {
88100
page_size,
89-
clock_ticks_per_sec,
101+
ms_per_tick,
90102
})
91103
}
92104

@@ -130,9 +142,16 @@ mod system {
130142
};
131143

132144
let clock_ticks = stat.utime as u64 + stat.stime as u64;
133-
let cpu = Counter::from(clock_ticks / self.clock_ticks_per_sec);
145+
let cpu_ms = clock_ticks * self.ms_per_tick;
134146
process_cpu_seconds_total.fmt_help(f)?;
135-
process_cpu_seconds_total.fmt_metric(f, &cpu)?;
147+
process_cpu_seconds_total.fmt_metric(f, &Counter::from(cpu_ms))?;
148+
149+
process_virtual_memory_bytes.fmt_help(f)?;
150+
process_virtual_memory_bytes.fmt_metric(f, &Gauge::from(stat.vsize as u64))?;
151+
152+
process_resident_memory_bytes.fmt_help(f)?;
153+
process_resident_memory_bytes
154+
.fmt_metric(f, &Gauge::from(stat.rss as u64 * self.page_size))?;
136155

137156
match Self::open_fds(stat.pid) {
138157
Ok(open_fds) => {
@@ -141,7 +160,6 @@ mod system {
141160
}
142161
Err(err) => {
143162
warn!("could not determine process_open_fds: {}", err);
144-
return Ok(());
145163
}
146164
}
147165

@@ -153,17 +171,10 @@ mod system {
153171
}
154172
Err(err) => {
155173
warn!("could not determine process_max_fds: {}", err);
156-
return Ok(());
157174
}
158175
}
159176

160-
process_virtual_memory_bytes.fmt_help(f)?;
161-
let vsz = Gauge::from(stat.vsize as u64);
162-
process_virtual_memory_bytes.fmt_metric(f, &vsz)?;
163-
164-
process_resident_memory_bytes.fmt_help(f)?;
165-
let rss = Gauge::from(stat.rss as u64 * self.page_size);
166-
process_resident_memory_bytes.fmt_metric(f, &rss)
177+
Ok(())
167178
}
168179
}
169180
}

linkerd/metrics/src/counter.rs

Lines changed: 71 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
use super::prom::{FmtLabels, FmtMetric, MAX_PRECISE_VALUE};
1+
use super::{
2+
prom::{FmtLabels, FmtMetric},
3+
Factor,
4+
};
25
use std::fmt::{self, Display};
36
use std::sync::atomic::{AtomicU64, Ordering};
47

@@ -14,41 +17,58 @@ use std::sync::atomic::{AtomicU64, Ordering};
1417
/// [`rate()`]: https://prometheus.io/docs/prometheus/latest/querying/functions/#rate()
1518
/// [`irate()`]: https://prometheus.io/docs/prometheus/latest/querying/functions/#irate()
1619
/// [`resets()`]: https://prometheus.io/docs/prometheus/latest/querying/functions/#resets
17-
#[derive(Debug, Default)]
18-
pub struct Counter(AtomicU64);
20+
#[derive(Debug)]
21+
pub struct Counter<F = ()>(AtomicU64, std::marker::PhantomData<F>);
1922

2023
// ===== impl Counter =====
2124

22-
impl Counter {
25+
impl<F> Default for Counter<F> {
26+
fn default() -> Self {
27+
Self(AtomicU64::default(), std::marker::PhantomData)
28+
}
29+
}
30+
31+
impl<F> Counter<F> {
32+
pub fn new() -> Self {
33+
Self::default()
34+
}
35+
2336
pub fn incr(&self) {
2437
self.add(1)
2538
}
2639

2740
pub fn add(&self, n: u64) {
2841
self.0.fetch_add(n, Ordering::Release);
2942
}
43+
}
3044

45+
impl<F: Factor> Counter<F> {
3146
/// Return current counter value, wrapped to be safe for use with Prometheus.
32-
pub fn value(&self) -> u64 {
33-
self.0
34-
.load(Ordering::Acquire)
35-
.wrapping_rem(MAX_PRECISE_VALUE + 1)
47+
pub fn value(&self) -> f64 {
48+
let n = self.0.load(Ordering::Acquire);
49+
F::factor(n)
3650
}
3751
}
3852

39-
impl Into<u64> for Counter {
40-
fn into(self) -> u64 {
53+
impl<F: Factor> Into<f64> for &Counter<F> {
54+
fn into(self) -> f64 {
4155
self.value()
4256
}
4357
}
4458

45-
impl From<u64> for Counter {
59+
impl<F> Into<u64> for &Counter<F> {
60+
fn into(self) -> u64 {
61+
self.0.load(Ordering::Acquire)
62+
}
63+
}
64+
65+
impl<F> From<u64> for Counter<F> {
4666
fn from(value: u64) -> Self {
47-
Counter(value.into())
67+
Counter(value.into(), std::marker::PhantomData)
4868
}
4969
}
5070

51-
impl FmtMetric for Counter {
71+
impl<F: Factor> FmtMetric for Counter<F> {
5272
const KIND: &'static str = "counter";
5373

5474
fn fmt_metric<N: Display>(&self, f: &mut fmt::Formatter<'_>, name: N) -> fmt::Result {
@@ -74,34 +94,50 @@ impl FmtMetric for Counter {
7494
#[cfg(test)]
7595
mod tests {
7696
use super::*;
97+
use crate::{MillisAsSeconds, MAX_PRECISE_UINT64};
7798

7899
#[test]
79100
fn count_simple() {
80-
let cnt = Counter::from(0);
81-
assert_eq!(cnt.value(), 0);
82-
cnt.incr();
83-
assert_eq!(cnt.value(), 1);
84-
cnt.add(41);
85-
assert_eq!(cnt.value(), 42);
86-
cnt.add(0);
87-
assert_eq!(cnt.value(), 42);
101+
let c = Counter::<()>::default();
102+
assert_eq!(c.value(), 0.0);
103+
c.incr();
104+
assert_eq!(c.value(), 1.0);
105+
c.add(41);
106+
assert_eq!(c.value(), 42.0);
107+
c.add(0);
108+
assert_eq!(c.value(), 42.0);
88109
}
89110

90111
#[test]
91112
fn count_wrapping() {
92-
let cnt = Counter::from(MAX_PRECISE_VALUE - 1);
93-
assert_eq!(cnt.value(), MAX_PRECISE_VALUE - 1);
94-
cnt.incr();
95-
assert_eq!(cnt.value(), MAX_PRECISE_VALUE);
96-
cnt.incr();
97-
assert_eq!(cnt.value(), 0);
98-
cnt.incr();
99-
assert_eq!(cnt.value(), 1);
100-
101-
let max = Counter::from(MAX_PRECISE_VALUE);
102-
assert_eq!(max.value(), MAX_PRECISE_VALUE);
103-
104-
let over = Counter::from(MAX_PRECISE_VALUE + 1);
105-
assert_eq!(over.value(), 0);
113+
let c = Counter::<()>::from(MAX_PRECISE_UINT64 - 1);
114+
assert_eq!(c.value(), (MAX_PRECISE_UINT64 - 1) as f64);
115+
c.incr();
116+
assert_eq!(c.value(), MAX_PRECISE_UINT64 as f64);
117+
c.incr();
118+
assert_eq!(c.value(), 0.0);
119+
c.incr();
120+
assert_eq!(c.value(), 1.0);
121+
122+
let max = Counter::<()>::from(MAX_PRECISE_UINT64);
123+
assert_eq!(max.value(), MAX_PRECISE_UINT64 as f64);
124+
}
125+
126+
#[test]
127+
fn millis_as_seconds() {
128+
let c = Counter::<MillisAsSeconds>::from(1);
129+
assert_eq!(c.value(), 0.001);
130+
131+
let c = Counter::<MillisAsSeconds>::from((MAX_PRECISE_UINT64 - 1) * 1000);
132+
assert_eq!(c.value(), (MAX_PRECISE_UINT64 - 1) as f64);
133+
c.add(1000);
134+
assert_eq!(c.value(), MAX_PRECISE_UINT64 as f64);
135+
c.add(1000);
136+
assert_eq!(c.value(), 0.0);
137+
c.add(1000);
138+
assert_eq!(c.value(), 1.0);
139+
140+
let max = Counter::<MillisAsSeconds>::from(MAX_PRECISE_UINT64 * 1000);
141+
assert_eq!(max.value(), MAX_PRECISE_UINT64 as f64);
106142
}
107143
}

linkerd/metrics/src/gauge.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use super::prom::{FmtLabels, FmtMetric, MAX_PRECISE_VALUE};
1+
use super::prom::{FmtLabels, FmtMetric};
22
use std::fmt::{self, Display};
33
use std::sync::atomic::{AtomicU64, Ordering};
44

@@ -20,7 +20,7 @@ impl Gauge {
2020
pub fn value(&self) -> u64 {
2121
self.0
2222
.load(Ordering::Acquire)
23-
.wrapping_rem(MAX_PRECISE_VALUE + 1)
23+
.wrapping_rem(crate::MAX_PRECISE_UINT64 + 1)
2424
}
2525
}
2626

0 commit comments

Comments
 (0)