Skip to content

Commit 5a6c908

Browse files
committed
Add per-zone cpu metrics.
Add a new oximeter instrument for tracking per-zone cpu statistics with kstat, and use it in sled-agent. We add metrics for cpu_nsec_{user,sys,waitrq}, for a total cardinality of triple the number of internal zones.
1 parent 81ba7e2 commit 5a6c908

File tree

5 files changed

+481
-11
lines changed

5 files changed

+481
-11
lines changed

oximeter/instruments/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ uuid = { workspace = true, optional = true }
2727
omicron-workspace-hack.workspace = true
2828

2929
[features]
30-
default = ["http-instruments", "cpu", "datalink"]
30+
default = ["http-instruments", "cpu", "datalink", "zone"]
3131
http-instruments = [
3232
"dep:chrono",
3333
"dep:dropshot",
@@ -55,6 +55,7 @@ kstat = [
5555
]
5656
cpu = ["kstat"]
5757
datalink = ["kstat"]
58+
zone = ["kstat"]
5859

5960
[dev-dependencies]
6061
rand.workspace = true

oximeter/instruments/src/kstat/mod.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ use std::time::Duration;
9191
pub mod cpu;
9292
#[cfg(any(feature = "datalink", test))]
9393
pub mod link;
94+
#[cfg(any(feature = "zone", test))]
95+
pub mod zone;
9496
mod sampler;
9597

9698
pub use sampler::CollectionDetails;
@@ -236,6 +238,7 @@ pub trait ConvertNamedData {
236238
fn as_u32(&self) -> Result<u32, Error>;
237239
fn as_i64(&self) -> Result<i64, Error>;
238240
fn as_u64(&self) -> Result<u64, Error>;
241+
fn as_str(&self) -> Result<&str, Error>;
239242
}
240243

241244
impl ConvertNamedData for NamedData<'_> {
@@ -282,6 +285,17 @@ impl ConvertNamedData for NamedData<'_> {
282285
})
283286
}
284287
}
288+
289+
fn as_str(&self) -> Result<&str, Error> {
290+
if let NamedData::String(x) = self {
291+
Ok(*x)
292+
} else {
293+
Err(Error::UnexpectedDataType {
294+
expected: NamedType::String,
295+
found: self.data_type(),
296+
})
297+
}
298+
}
285299
}
286300

287301
/// Return a high-resolution monotonic timestamp, in nanoseconds since an
Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Report CPU metrics about zones on the host system
6+
7+
use crate::kstat::ConvertNamedData;
8+
use crate::kstat::Error;
9+
use crate::kstat::KstatList;
10+
use crate::kstat::KstatTarget;
11+
use crate::kstat::hrtime_to_utc;
12+
use kstat_rs::Data;
13+
use kstat_rs::Kstat;
14+
use kstat_rs::Named;
15+
use oximeter::FieldType;
16+
use oximeter::FieldValue;
17+
use oximeter::Sample;
18+
use oximeter::Target;
19+
use oximeter::types::Cumulative;
20+
use uuid::Uuid;
21+
22+
oximeter::use_timeseries!("zone.toml");
23+
pub use self::zone::Zone as ZoneTarget;
24+
25+
/// CPU metrics for all zones on a sled.
26+
#[derive(Clone, Debug)]
27+
pub struct Zone {
28+
/// The target for this sled's CPUs.
29+
pub target: ZoneTarget,
30+
/// Flag indicating whether the sled is synced with NTP.
31+
pub time_synced: bool,
32+
}
33+
34+
impl Zone {
35+
/// Create a new `Zone` with the given target and synchronization flag.
36+
pub fn new(target: ZoneTarget, time_synced: bool) -> Self {
37+
Self { target, time_synced }
38+
}
39+
40+
/// Return the sled ID.
41+
pub fn sled_id(&self) -> Uuid {
42+
self.target.sled_id
43+
}
44+
}
45+
46+
impl KstatTarget for Zone {
47+
fn interested(&self, kstat: &Kstat<'_>) -> bool {
48+
self.time_synced && kstat.ks_module == "zones"
49+
}
50+
51+
fn to_samples(
52+
&self,
53+
kstats: KstatList<'_, '_>,
54+
) -> Result<Vec<Sample>, Error> {
55+
let mut samples = Vec::new();
56+
57+
for (creation_time, kstat, data) in kstats.iter() {
58+
let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?;
59+
60+
let Data::Named(named) = data else {
61+
return Err(Error::ExpectedNamedKstat);
62+
};
63+
64+
let mut zone_name: Option<&str> = None;
65+
let mut nsec_user: Option<u64> = None;
66+
let mut nsec_sys: Option<u64> = None;
67+
let mut nsec_waitrq: Option<u64> = None;
68+
69+
for named_data in named.iter() {
70+
let Named { name, value } = named_data;
71+
match *name {
72+
"zonename" => zone_name = Some(value.as_str()?),
73+
"nsec_user" => nsec_user = Some(value.as_u64()?),
74+
"nsec_sys" => nsec_sys = Some(value.as_u64()?),
75+
"nsec_waitrq" => nsec_waitrq = Some(value.as_u64()?),
76+
_ => {}
77+
}
78+
}
79+
80+
let zone_name = zone_name.ok_or(Error::NoSuchKstat)?.to_string();
81+
82+
let user_metric = zone::CpuNsecUser {
83+
zone_name: zone_name.clone().into(),
84+
datum: Cumulative::with_start_time(
85+
*creation_time,
86+
nsec_user.ok_or(Error::NoSuchKstat)?,
87+
),
88+
};
89+
let user_sample = Sample::new_with_timestamp(
90+
snapshot_time,
91+
&self.target,
92+
&user_metric,
93+
)
94+
.map_err(Error::Sample)?;
95+
samples.push(user_sample);
96+
97+
let sys_metric = zone::CpuNsecSys {
98+
zone_name: zone_name.clone().into(),
99+
datum: Cumulative::with_start_time(
100+
*creation_time,
101+
nsec_sys.ok_or(Error::NoSuchKstat)?,
102+
),
103+
};
104+
let sys_sample = Sample::new_with_timestamp(
105+
snapshot_time,
106+
&self.target,
107+
&sys_metric,
108+
)
109+
.map_err(Error::Sample)?;
110+
samples.push(sys_sample);
111+
112+
let waitrq_metric = zone::CpuNsecWaitrq {
113+
zone_name: zone_name.clone().into(),
114+
datum: Cumulative::with_start_time(
115+
*creation_time,
116+
nsec_waitrq.ok_or(Error::NoSuchKstat)?,
117+
),
118+
};
119+
let waitrq_sample = Sample::new_with_timestamp(
120+
snapshot_time,
121+
&self.target,
122+
&waitrq_metric,
123+
)
124+
.map_err(Error::Sample)?;
125+
samples.push(waitrq_sample);
126+
}
127+
128+
Ok(samples)
129+
}
130+
}
131+
132+
// NOTE: Delegate to the inner target type for this implementation.
133+
impl Target for Zone {
134+
fn name(&self) -> &'static str {
135+
self.target.name()
136+
}
137+
138+
fn field_names(&self) -> &'static [&'static str] {
139+
self.target.field_names()
140+
}
141+
142+
fn field_types(&self) -> Vec<FieldType> {
143+
self.target.field_types()
144+
}
145+
146+
fn field_values(&self) -> Vec<FieldValue> {
147+
self.target.field_values()
148+
}
149+
}
150+
151+
#[cfg(all(test, target_os = "illumos"))]
152+
mod tests {
153+
use super::*;
154+
use crate::kstat::CollectionDetails;
155+
use crate::kstat::KstatSampler;
156+
use crate::kstat::TargetStatus;
157+
use kstat_rs::Ctl;
158+
use oximeter::Producer;
159+
use slog::Drain;
160+
use slog::Logger;
161+
use std::time::Duration;
162+
use tokio::time::Instant;
163+
use uuid::Uuid;
164+
use uuid::uuid;
165+
166+
/// The metric names we expect to produce for each zone.
167+
const ZONE_METRICS: &[&str] =
168+
&["cpu_nsec_user", "cpu_nsec_sys", "cpu_nsec_waitrq"];
169+
170+
fn test_logger() -> Logger {
171+
let dec =
172+
slog_term::PlainSyncDecorator::new(slog_term::TestStdoutWriter);
173+
let drain = slog_term::FullFormat::new(dec).build().fuse();
174+
Logger::root(drain, slog::o!("component" => "fake-cleanup-task"))
175+
}
176+
177+
const RACK_ID: Uuid = uuid!("de784702-cafb-41a9-b3e5-93af189def29");
178+
const SLED_ID: Uuid = uuid!("88240343-5262-45f4-86f1-3c82fe383f2a");
179+
const SLED_MODEL: &str = "fake-gimlet";
180+
const SLED_REVISION: u32 = 1;
181+
const SLED_SERIAL: &str = "fake-serial";
182+
183+
fn test_target() -> ZoneTarget {
184+
ZoneTarget {
185+
rack_id: RACK_ID,
186+
sled_id: SLED_ID,
187+
sled_model: SLED_MODEL.into(),
188+
sled_revision: SLED_REVISION,
189+
sled_serial: SLED_SERIAL.into(),
190+
}
191+
}
192+
193+
#[test]
194+
fn test_kstat_interested() {
195+
let mut zone = Zone::new(test_target(), false);
196+
197+
let ctl = Ctl::new().unwrap();
198+
let ctl = ctl.update().unwrap();
199+
200+
// There should be at least the global zone kstat.
201+
let kstat = ctl
202+
.filter(Some("zones"), None, None)
203+
.next()
204+
.expect("should have at least one zones kstat");
205+
206+
// Not interested when not time synced.
207+
assert!(!zone.interested(&kstat));
208+
209+
// Interested when time synced.
210+
zone.time_synced = true;
211+
assert!(zone.interested(&kstat));
212+
213+
// Not interested in non-zone kstats.
214+
if let Some(cpu_kstat) =
215+
ctl.filter(Some("cpu"), Some(0), Some("sys")).next()
216+
{
217+
assert!(!zone.interested(&cpu_kstat));
218+
}
219+
}
220+
221+
#[test]
222+
fn test_zone_cpu_samples() {
223+
let zone = Zone::new(test_target(), true);
224+
let ctl = Ctl::new().unwrap();
225+
let ctl = ctl.update().unwrap();
226+
227+
// Collect kstats for the first zone we find.
228+
let mut kstat = ctl
229+
.filter(Some("zones"), None, None)
230+
.next()
231+
.expect("should have at least one zones kstat");
232+
let creation_time = hrtime_to_utc(kstat.ks_crtime).unwrap();
233+
let data = ctl.read(&mut kstat).unwrap();
234+
let samples =
235+
zone.to_samples(&[(creation_time, kstat, data)]).unwrap();
236+
237+
// We should get exactly 3 samples (one per metric) for one zone.
238+
assert_eq!(samples.len(), 3);
239+
240+
// Verify we got one sample for each expected metric.
241+
let mut metric_names: Vec<_> =
242+
samples.iter().map(|s| s.timeseries_name.to_string()).collect();
243+
metric_names.sort();
244+
245+
let mut expected: Vec<_> = ZONE_METRICS
246+
.iter()
247+
.map(|m| format!("zone:{m}"))
248+
.collect();
249+
expected.sort();
250+
assert_eq!(metric_names, expected);
251+
252+
// All samples should have the same zone_name field.
253+
let zone_names: Vec<_> = samples
254+
.iter()
255+
.filter_map(|s| {
256+
s.sorted_metric_fields().get("zone_name").and_then(|f| {
257+
match &f.value {
258+
oximeter::FieldValue::String(s) => {
259+
Some(s.as_ref().to_string())
260+
}
261+
_ => None,
262+
}
263+
})
264+
})
265+
.collect();
266+
assert_eq!(zone_names.len(), 3);
267+
assert!(
268+
zone_names.windows(2).all(|w| w[0] == w[1]),
269+
"all samples should have the same zone_name"
270+
);
271+
}
272+
273+
#[tokio::test]
274+
async fn test_kstat_sampler() {
275+
let mut sampler = KstatSampler::new(&test_logger()).unwrap();
276+
let zone = Zone::new(test_target(), true);
277+
let details = CollectionDetails::never(Duration::from_secs(1));
278+
let id = sampler.add_target(zone, details).await.unwrap();
279+
let samples: Vec<_> = sampler.produce().unwrap().collect();
280+
assert!(samples.is_empty());
281+
282+
// Pause time, and advance until we're notified of new samples.
283+
tokio::time::pause();
284+
const MAX_DURATION: Duration = Duration::from_secs(3);
285+
const STEP_DURATION: Duration = Duration::from_secs(1);
286+
let now = Instant::now();
287+
let expected_counts = loop {
288+
tokio::time::advance(STEP_DURATION).await;
289+
if now.elapsed() > MAX_DURATION {
290+
panic!("Waited too long for samples");
291+
}
292+
if let Some(counts) = sampler.sample_counts() {
293+
break counts;
294+
}
295+
};
296+
let samples: Vec<_> = sampler.produce().unwrap().collect();
297+
println!("{samples:#?}");
298+
assert_eq!(samples.len(), expected_counts.total);
299+
assert_eq!(expected_counts.overflow, 0);
300+
301+
// Test status and remove behavior.
302+
tokio::time::resume();
303+
assert!(matches!(
304+
sampler.target_status(id).await.unwrap(),
305+
TargetStatus::Ok { .. },
306+
));
307+
sampler.remove_target(id).await.unwrap();
308+
assert!(sampler.target_status(id).await.is_err());
309+
}
310+
}

0 commit comments

Comments
 (0)