|
| 1 | +// This Source Code Form is subject to the terms of the Mozilla Public |
| 2 | +// License, v. 2.0. If a copy of the MPL was not distributed with this |
| 3 | +// file, You can obtain one at https://mozilla.org/MPL/2.0/. |
| 4 | + |
| 5 | +//! Report CPU metrics about zones on the host system |
| 6 | +
|
| 7 | +use crate::kstat::ConvertNamedData; |
| 8 | +use crate::kstat::Error; |
| 9 | +use crate::kstat::KstatList; |
| 10 | +use crate::kstat::KstatTarget; |
| 11 | +use crate::kstat::hrtime_to_utc; |
| 12 | +use kstat_rs::Data; |
| 13 | +use kstat_rs::Kstat; |
| 14 | +use kstat_rs::Named; |
| 15 | +use oximeter::FieldType; |
| 16 | +use oximeter::FieldValue; |
| 17 | +use oximeter::Sample; |
| 18 | +use oximeter::Target; |
| 19 | +use oximeter::types::Cumulative; |
| 20 | +use uuid::Uuid; |
| 21 | + |
| 22 | +oximeter::use_timeseries!("zone.toml"); |
| 23 | +pub use self::zone::Zone as ZoneTarget; |
| 24 | + |
| 25 | +/// CPU metrics for all zones on a sled. |
| 26 | +#[derive(Clone, Debug)] |
| 27 | +pub struct Zone { |
| 28 | + /// The target for this sled's CPUs. |
| 29 | + pub target: ZoneTarget, |
| 30 | + /// Flag indicating whether the sled is synced with NTP. |
| 31 | + pub time_synced: bool, |
| 32 | +} |
| 33 | + |
| 34 | +impl Zone { |
| 35 | + /// Create a new `Zone` with the given target and synchronization flag. |
| 36 | + pub fn new(target: ZoneTarget, time_synced: bool) -> Self { |
| 37 | + Self { target, time_synced } |
| 38 | + } |
| 39 | + |
| 40 | + /// Return the sled ID. |
| 41 | + pub fn sled_id(&self) -> Uuid { |
| 42 | + self.target.sled_id |
| 43 | + } |
| 44 | +} |
| 45 | + |
| 46 | +impl KstatTarget for Zone { |
| 47 | + fn interested(&self, kstat: &Kstat<'_>) -> bool { |
| 48 | + self.time_synced && kstat.ks_module == "zones" |
| 49 | + } |
| 50 | + |
| 51 | + fn to_samples( |
| 52 | + &self, |
| 53 | + kstats: KstatList<'_, '_>, |
| 54 | + ) -> Result<Vec<Sample>, Error> { |
| 55 | + let mut samples = Vec::new(); |
| 56 | + |
| 57 | + for (creation_time, kstat, data) in kstats.iter() { |
| 58 | + let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?; |
| 59 | + |
| 60 | + let Data::Named(named) = data else { |
| 61 | + return Err(Error::ExpectedNamedKstat); |
| 62 | + }; |
| 63 | + |
| 64 | + let mut zone_name: Option<&str> = None; |
| 65 | + let mut nsec_user: Option<u64> = None; |
| 66 | + let mut nsec_sys: Option<u64> = None; |
| 67 | + let mut nsec_waitrq: Option<u64> = None; |
| 68 | + |
| 69 | + for named_data in named.iter() { |
| 70 | + let Named { name, value } = named_data; |
| 71 | + match *name { |
| 72 | + "zonename" => zone_name = Some(value.as_str()?), |
| 73 | + "nsec_user" => nsec_user = Some(value.as_u64()?), |
| 74 | + "nsec_sys" => nsec_sys = Some(value.as_u64()?), |
| 75 | + "nsec_waitrq" => nsec_waitrq = Some(value.as_u64()?), |
| 76 | + _ => {} |
| 77 | + } |
| 78 | + } |
| 79 | + |
| 80 | + let zone_name = zone_name.ok_or(Error::NoSuchKstat)?.to_string(); |
| 81 | + |
| 82 | + let user_metric = zone::CpuNsecUser { |
| 83 | + zone_name: zone_name.clone().into(), |
| 84 | + datum: Cumulative::with_start_time( |
| 85 | + *creation_time, |
| 86 | + nsec_user.ok_or(Error::NoSuchKstat)?, |
| 87 | + ), |
| 88 | + }; |
| 89 | + let user_sample = Sample::new_with_timestamp( |
| 90 | + snapshot_time, |
| 91 | + &self.target, |
| 92 | + &user_metric, |
| 93 | + ) |
| 94 | + .map_err(Error::Sample)?; |
| 95 | + samples.push(user_sample); |
| 96 | + |
| 97 | + let sys_metric = zone::CpuNsecSys { |
| 98 | + zone_name: zone_name.clone().into(), |
| 99 | + datum: Cumulative::with_start_time( |
| 100 | + *creation_time, |
| 101 | + nsec_sys.ok_or(Error::NoSuchKstat)?, |
| 102 | + ), |
| 103 | + }; |
| 104 | + let sys_sample = Sample::new_with_timestamp( |
| 105 | + snapshot_time, |
| 106 | + &self.target, |
| 107 | + &sys_metric, |
| 108 | + ) |
| 109 | + .map_err(Error::Sample)?; |
| 110 | + samples.push(sys_sample); |
| 111 | + |
| 112 | + let waitrq_metric = zone::CpuNsecWaitrq { |
| 113 | + zone_name: zone_name.clone().into(), |
| 114 | + datum: Cumulative::with_start_time( |
| 115 | + *creation_time, |
| 116 | + nsec_waitrq.ok_or(Error::NoSuchKstat)?, |
| 117 | + ), |
| 118 | + }; |
| 119 | + let waitrq_sample = Sample::new_with_timestamp( |
| 120 | + snapshot_time, |
| 121 | + &self.target, |
| 122 | + &waitrq_metric, |
| 123 | + ) |
| 124 | + .map_err(Error::Sample)?; |
| 125 | + samples.push(waitrq_sample); |
| 126 | + } |
| 127 | + |
| 128 | + Ok(samples) |
| 129 | + } |
| 130 | +} |
| 131 | + |
| 132 | +// NOTE: Delegate to the inner target type for this implementation. |
| 133 | +impl Target for Zone { |
| 134 | + fn name(&self) -> &'static str { |
| 135 | + self.target.name() |
| 136 | + } |
| 137 | + |
| 138 | + fn field_names(&self) -> &'static [&'static str] { |
| 139 | + self.target.field_names() |
| 140 | + } |
| 141 | + |
| 142 | + fn field_types(&self) -> Vec<FieldType> { |
| 143 | + self.target.field_types() |
| 144 | + } |
| 145 | + |
| 146 | + fn field_values(&self) -> Vec<FieldValue> { |
| 147 | + self.target.field_values() |
| 148 | + } |
| 149 | +} |
| 150 | + |
| 151 | +#[cfg(all(test, target_os = "illumos"))] |
| 152 | +mod tests { |
| 153 | + use super::*; |
| 154 | + use crate::kstat::CollectionDetails; |
| 155 | + use crate::kstat::KstatSampler; |
| 156 | + use crate::kstat::TargetStatus; |
| 157 | + use kstat_rs::Ctl; |
| 158 | + use oximeter::Producer; |
| 159 | + use slog::Drain; |
| 160 | + use slog::Logger; |
| 161 | + use std::time::Duration; |
| 162 | + use tokio::time::Instant; |
| 163 | + use uuid::Uuid; |
| 164 | + use uuid::uuid; |
| 165 | + |
| 166 | + /// The metric names we expect to produce for each zone. |
| 167 | + const ZONE_METRICS: &[&str] = |
| 168 | + &["cpu_nsec_user", "cpu_nsec_sys", "cpu_nsec_waitrq"]; |
| 169 | + |
| 170 | + fn test_logger() -> Logger { |
| 171 | + let dec = |
| 172 | + slog_term::PlainSyncDecorator::new(slog_term::TestStdoutWriter); |
| 173 | + let drain = slog_term::FullFormat::new(dec).build().fuse(); |
| 174 | + Logger::root(drain, slog::o!("component" => "fake-cleanup-task")) |
| 175 | + } |
| 176 | + |
| 177 | + const RACK_ID: Uuid = uuid!("de784702-cafb-41a9-b3e5-93af189def29"); |
| 178 | + const SLED_ID: Uuid = uuid!("88240343-5262-45f4-86f1-3c82fe383f2a"); |
| 179 | + const SLED_MODEL: &str = "fake-gimlet"; |
| 180 | + const SLED_REVISION: u32 = 1; |
| 181 | + const SLED_SERIAL: &str = "fake-serial"; |
| 182 | + |
| 183 | + fn test_target() -> ZoneTarget { |
| 184 | + ZoneTarget { |
| 185 | + rack_id: RACK_ID, |
| 186 | + sled_id: SLED_ID, |
| 187 | + sled_model: SLED_MODEL.into(), |
| 188 | + sled_revision: SLED_REVISION, |
| 189 | + sled_serial: SLED_SERIAL.into(), |
| 190 | + } |
| 191 | + } |
| 192 | + |
| 193 | + #[test] |
| 194 | + fn test_kstat_interested() { |
| 195 | + let mut zone = Zone::new(test_target(), false); |
| 196 | + |
| 197 | + let ctl = Ctl::new().unwrap(); |
| 198 | + let ctl = ctl.update().unwrap(); |
| 199 | + |
| 200 | + // There should be at least the global zone kstat. |
| 201 | + let kstat = ctl |
| 202 | + .filter(Some("zones"), None, None) |
| 203 | + .next() |
| 204 | + .expect("should have at least one zones kstat"); |
| 205 | + |
| 206 | + // Not interested when not time synced. |
| 207 | + assert!(!zone.interested(&kstat)); |
| 208 | + |
| 209 | + // Interested when time synced. |
| 210 | + zone.time_synced = true; |
| 211 | + assert!(zone.interested(&kstat)); |
| 212 | + |
| 213 | + // Not interested in non-zone kstats. |
| 214 | + if let Some(cpu_kstat) = |
| 215 | + ctl.filter(Some("cpu"), Some(0), Some("sys")).next() |
| 216 | + { |
| 217 | + assert!(!zone.interested(&cpu_kstat)); |
| 218 | + } |
| 219 | + } |
| 220 | + |
| 221 | + #[test] |
| 222 | + fn test_zone_cpu_samples() { |
| 223 | + let zone = Zone::new(test_target(), true); |
| 224 | + let ctl = Ctl::new().unwrap(); |
| 225 | + let ctl = ctl.update().unwrap(); |
| 226 | + |
| 227 | + // Collect kstats for the first zone we find. |
| 228 | + let mut kstat = ctl |
| 229 | + .filter(Some("zones"), None, None) |
| 230 | + .next() |
| 231 | + .expect("should have at least one zones kstat"); |
| 232 | + let creation_time = hrtime_to_utc(kstat.ks_crtime).unwrap(); |
| 233 | + let data = ctl.read(&mut kstat).unwrap(); |
| 234 | + let samples = |
| 235 | + zone.to_samples(&[(creation_time, kstat, data)]).unwrap(); |
| 236 | + |
| 237 | + // We should get exactly 3 samples (one per metric) for one zone. |
| 238 | + assert_eq!(samples.len(), 3); |
| 239 | + |
| 240 | + // Verify we got one sample for each expected metric. |
| 241 | + let mut metric_names: Vec<_> = |
| 242 | + samples.iter().map(|s| s.timeseries_name.to_string()).collect(); |
| 243 | + metric_names.sort(); |
| 244 | + |
| 245 | + let mut expected: Vec<_> = ZONE_METRICS |
| 246 | + .iter() |
| 247 | + .map(|m| format!("zone:{m}")) |
| 248 | + .collect(); |
| 249 | + expected.sort(); |
| 250 | + assert_eq!(metric_names, expected); |
| 251 | + |
| 252 | + // All samples should have the same zone_name field. |
| 253 | + let zone_names: Vec<_> = samples |
| 254 | + .iter() |
| 255 | + .filter_map(|s| { |
| 256 | + s.sorted_metric_fields().get("zone_name").and_then(|f| { |
| 257 | + match &f.value { |
| 258 | + oximeter::FieldValue::String(s) => { |
| 259 | + Some(s.as_ref().to_string()) |
| 260 | + } |
| 261 | + _ => None, |
| 262 | + } |
| 263 | + }) |
| 264 | + }) |
| 265 | + .collect(); |
| 266 | + assert_eq!(zone_names.len(), 3); |
| 267 | + assert!( |
| 268 | + zone_names.windows(2).all(|w| w[0] == w[1]), |
| 269 | + "all samples should have the same zone_name" |
| 270 | + ); |
| 271 | + } |
| 272 | + |
| 273 | + #[tokio::test] |
| 274 | + async fn test_kstat_sampler() { |
| 275 | + let mut sampler = KstatSampler::new(&test_logger()).unwrap(); |
| 276 | + let zone = Zone::new(test_target(), true); |
| 277 | + let details = CollectionDetails::never(Duration::from_secs(1)); |
| 278 | + let id = sampler.add_target(zone, details).await.unwrap(); |
| 279 | + let samples: Vec<_> = sampler.produce().unwrap().collect(); |
| 280 | + assert!(samples.is_empty()); |
| 281 | + |
| 282 | + // Pause time, and advance until we're notified of new samples. |
| 283 | + tokio::time::pause(); |
| 284 | + const MAX_DURATION: Duration = Duration::from_secs(3); |
| 285 | + const STEP_DURATION: Duration = Duration::from_secs(1); |
| 286 | + let now = Instant::now(); |
| 287 | + let expected_counts = loop { |
| 288 | + tokio::time::advance(STEP_DURATION).await; |
| 289 | + if now.elapsed() > MAX_DURATION { |
| 290 | + panic!("Waited too long for samples"); |
| 291 | + } |
| 292 | + if let Some(counts) = sampler.sample_counts() { |
| 293 | + break counts; |
| 294 | + } |
| 295 | + }; |
| 296 | + let samples: Vec<_> = sampler.produce().unwrap().collect(); |
| 297 | + println!("{samples:#?}"); |
| 298 | + assert_eq!(samples.len(), expected_counts.total); |
| 299 | + assert_eq!(expected_counts.overflow, 0); |
| 300 | + |
| 301 | + // Test status and remove behavior. |
| 302 | + tokio::time::resume(); |
| 303 | + assert!(matches!( |
| 304 | + sampler.target_status(id).await.unwrap(), |
| 305 | + TargetStatus::Ok { .. }, |
| 306 | + )); |
| 307 | + sampler.remove_target(id).await.unwrap(); |
| 308 | + assert!(sampler.target_status(id).await.is_err()); |
| 309 | + } |
| 310 | +} |
0 commit comments