Skip to content

Commit d1f4c1b

Browse files
iximeow and gjcolombo authored
make sleds report their CPU families to Nexus (#8725)
RFD 505 proposes that instances should be able to set a "minimum hardware platform" or "minimum CPU platform" that allows users to constrain an instance to run on sleds that have a specific set of CPU features available. This allows a user to opt a VM into advanced hardware features (e.g. AVX-512 support) by constraining it to run only on sleds that support those features.

For this to work, Nexus needs to understand which CPUs are present in which sleds. Have sled-agent query CPUID to get CPU vendor and family information and report this to Nexus as part of the sled hardware manifest.

----

This is largely code Greg had put together; now that the control plane bits that build on it are about to be PR'd, it's time to merge.

Co-authored-by: Greg Colombo <[email protected]>
1 parent 2a9af3b commit d1f4c1b

File tree

50 files changed

+522
-14
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+522
-14
lines changed

dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,6 +1119,7 @@ sled 2eb69596-f081-4e2d-9425-9994926e0832 (role = Gimlet, serial serial1)
11191119
found at: <REDACTED_TIMESTAMP> from fake sled agent
11201120
address: [fd00:1122:3344:102::1]:12345
11211121
usable hw threads: 10
1122+
CPU family: amd_milan
11221123
usable memory (GiB): 0
11231124
reservoir (GiB): 0
11241125
physical disks:
@@ -1233,6 +1234,7 @@ sled 32d8d836-4d8a-4e54-8fa9-f31d79c42646 (role = Gimlet, serial serial2)
12331234
found at: <REDACTED_TIMESTAMP> from fake sled agent
12341235
address: [fd00:1122:3344:103::1]:12345
12351236
usable hw threads: 10
1237+
CPU family: amd_milan
12361238
usable memory (GiB): 0
12371239
reservoir (GiB): 0
12381240
physical disks:
@@ -1347,6 +1349,7 @@ sled 89d02b1b-478c-401a-8e28-7a26f74fa41b (role = Gimlet, serial serial0)
13471349
found at: <REDACTED_TIMESTAMP> from fake sled agent
13481350
address: [fd00:1122:3344:101::1]:12345
13491351
usable hw threads: 10
1352+
CPU family: amd_milan
13501353
usable memory (GiB): 0
13511354
reservoir (GiB): 0
13521355
physical disks:

dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ sled 2b8f0cb3-0295-4b3c-bc58-4fe88b57112c (role = Gimlet, serial serial1)
124124
found at: <REDACTED_TIMESTAMP> from fake sled agent
125125
address: [fd00:1122:3344:102::1]:12345
126126
usable hw threads: 10
127+
CPU family: amd_milan
127128
usable memory (GiB): 0
128129
reservoir (GiB): 0
129130
physical disks:
@@ -235,6 +236,7 @@ sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0)
235236
found at: <REDACTED_TIMESTAMP> from fake sled agent
236237
address: [fd00:1122:3344:101::1]:12345
237238
usable hw threads: 10
239+
CPU family: amd_milan
238240
usable memory (GiB): 0
239241
reservoir (GiB): 0
240242
physical disks:
@@ -348,6 +350,7 @@ sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2)
348350
found at: <REDACTED_TIMESTAMP> from fake sled agent
349351
address: [fd00:1122:3344:103::1]:12345
350352
usable hw threads: 10
353+
CPU family: amd_milan
351354
usable memory (GiB): 0
352355
reservoir (GiB): 0
353356
physical disks:

nexus-sled-agent-shared/src/inventory.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,9 @@ use omicron_uuid_kinds::{SledUuid, ZpoolUuid};
4040
use schemars::schema::{Schema, SchemaObject};
4141
use schemars::{JsonSchema, SchemaGenerator};
4242
use serde::{Deserialize, Serialize};
43-
// Export this type for convenience -- this way, dependents don't have to
43+
// Export these types for convenience -- this way, dependents don't have to
4444
// depend on sled-hardware-types.
45-
pub use sled_hardware_types::Baseboard;
45+
pub use sled_hardware_types::{Baseboard, SledCpuFamily};
4646
use strum::EnumIter;
4747
use tufaceous_artifact::{ArtifactHash, KnownArtifactKind};
4848

@@ -121,6 +121,7 @@ pub struct Inventory {
121121
pub baseboard: Baseboard,
122122
pub usable_hardware_threads: u32,
123123
pub usable_physical_ram: ByteCount,
124+
pub cpu_family: SledCpuFamily,
124125
pub reservoir_size: ByteCount,
125126
pub disks: Vec<InventoryDisk>,
126127
pub zpools: Vec<InventoryZpool>,

nexus/db-model/src/inventory.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use crate::ArtifactHash;
88
use crate::Generation;
99
use crate::PhysicalDiskKind;
1010
use crate::omicron_zone_config::{self, OmicronZoneNic};
11+
use crate::sled_cpu_family::SledCpuFamily;
1112
use crate::typed_uuid::DbTypedUuid;
1213
use crate::{
1314
ByteCount, MacAddr, Name, ServiceKind, SqlU8, SqlU16, SqlU32,
@@ -910,6 +911,7 @@ pub struct InvSledAgent {
910911
pub sled_role: SledRole,
911912
pub usable_hardware_threads: SqlU32,
912913
pub usable_physical_ram: ByteCount,
914+
pub cpu_family: SledCpuFamily,
913915
pub reservoir_size: ByteCount,
914916
// Soft foreign key to an `InvOmicronSledConfig`
915917
pub ledgered_sled_config: Option<DbTypedUuid<OmicronSledConfigKind>>,
@@ -1325,6 +1327,7 @@ impl InvSledAgent {
13251327
usable_physical_ram: ByteCount::from(
13261328
sled_agent.usable_physical_ram,
13271329
),
1330+
cpu_family: sled_agent.cpu_family.into(),
13281331
reservoir_size: ByteCount::from(sled_agent.reservoir_size),
13291332
ledgered_sled_config: ledgered_sled_config.map(From::from),
13301333
reconciler_status,

nexus/db-model/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ mod silo_group;
104104
mod silo_user;
105105
mod silo_user_password_hash;
106106
mod sled;
107+
mod sled_cpu_family;
107108
mod sled_instance;
108109
mod sled_policy;
109110
mod sled_resource_vmm;
@@ -225,6 +226,7 @@ pub use silo_group::*;
225226
pub use silo_user::*;
226227
pub use silo_user_password_hash::*;
227228
pub use sled::*;
229+
pub use sled_cpu_family::*;
228230
pub use sled_instance::*;
229231
pub use sled_policy::to_db_sled_policy; // Do not expose DbSledPolicy
230232
pub use sled_resource_vmm::*;

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
1616
///
1717
/// This must be updated when you change the database schema. Refer to
1818
/// schema/crdb/README.adoc in the root of this repository for details.
19-
pub const SCHEMA_VERSION: Version = Version::new(179, 0, 0);
19+
pub const SCHEMA_VERSION: Version = Version::new(180, 0, 0);
2020

2121
/// List of all past database schema versions, in *reverse* order
2222
///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
2828
// | leaving the first copy as an example for the next person.
2929
// v
3030
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
31+
KnownVersion::new(180, "sled-cpu-family"),
3132
KnownVersion::new(179, "add-pending-mgs-updates-host-phase-1"),
3233
KnownVersion::new(178, "change-lldp-management-ip-to-inet"),
3334
KnownVersion::new(177, "add-host-ereport-part-number"),

nexus/db-model/src/sled.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use super::{ByteCount, Generation, SledState, SqlU16, SqlU32};
66
use crate::collection::DatastoreCollectionConfig;
77
use crate::ipv6;
88
use crate::sled::shared::Baseboard;
9+
use crate::sled_cpu_family::SledCpuFamily;
910
use crate::sled_policy::DbSledPolicy;
1011
use chrono::{DateTime, Utc};
1112
use db_macros::Asset;
@@ -40,6 +41,8 @@ pub struct SledSystemHardware {
4041

4142
// current VMM reservoir size
4243
pub reservoir_size: ByteCount,
44+
45+
pub cpu_family: SledCpuFamily,
4346
}
4447

4548
/// Database representation of a Sled.
@@ -84,6 +87,16 @@ pub struct Sled {
8487

8588
// ServiceAddress (Repo Depot API). Uses `ip`.
8689
pub repo_depot_port: SqlU16,
90+
91+
/// The family of this sled's CPU.
92+
///
93+
/// This is primarily useful for questions about instance CPU platform
94+
/// compatibility; it is too broad for topology-related sled selection
95+
/// and more precise than a more general report of microarchitecture. We
96+
/// likely should include much more about the sled's CPU alongside this for
97+
/// those broader questions and reporting (see
98+
/// <https://github.com/oxidecomputer/omicron/issues/8730> for examples).
99+
pub cpu_family: SledCpuFamily,
87100
}
88101

89102
impl Sled {
@@ -185,6 +198,7 @@ impl From<Sled> for params::SledAgentInfo {
185198
usable_physical_ram: sled.usable_physical_ram.into(),
186199
reservoir_size: sled.reservoir_size.into(),
187200
generation: sled.sled_agent_gen.into(),
201+
cpu_family: sled.cpu_family.into(),
188202
decommissioned,
189203
}
190204
}
@@ -229,6 +243,8 @@ pub struct SledUpdate {
229243
// ServiceAddress (Repo Depot API). Uses `ip`.
230244
pub repo_depot_port: SqlU16,
231245

246+
pub cpu_family: SledCpuFamily,
247+
232248
// Generation number - owned and incremented by sled-agent.
233249
pub sled_agent_gen: Generation,
234250
}
@@ -258,6 +274,7 @@ impl SledUpdate {
258274
ip: addr.ip().into(),
259275
port: addr.port().into(),
260276
repo_depot_port: repo_depot_port.into(),
277+
cpu_family: hardware.cpu_family,
261278
sled_agent_gen,
262279
}
263280
}
@@ -296,6 +313,7 @@ impl SledUpdate {
296313
repo_depot_port: self.repo_depot_port,
297314
last_used_address,
298315
sled_agent_gen: self.sled_agent_gen,
316+
cpu_family: self.cpu_family,
299317
}
300318
}
301319

nexus/db-model/src/sled_cpu_family.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
use super::impl_enum_type;
6+
use serde::{Deserialize, Serialize};
7+
8+
impl_enum_type!(
9+
SledCpuFamilyEnum:
10+
11+
#[derive(
12+
Copy,
13+
Clone,
14+
Debug,
15+
PartialEq,
16+
AsExpression,
17+
FromSqlRow,
18+
Serialize,
19+
Deserialize
20+
)]
21+
pub enum SledCpuFamily;
22+
23+
Unknown => b"unknown"
24+
AmdMilan => b"amd_milan"
25+
AmdTurin => b"amd_turin"
26+
AmdTurinDense => b"amd_turin_dense"
27+
);
28+
29+
impl From<nexus_sled_agent_shared::inventory::SledCpuFamily> for SledCpuFamily {
30+
fn from(value: nexus_sled_agent_shared::inventory::SledCpuFamily) -> Self {
31+
use nexus_sled_agent_shared::inventory::SledCpuFamily as InputFamily;
32+
match value {
33+
InputFamily::Unknown => Self::Unknown,
34+
InputFamily::AmdMilan => Self::AmdMilan,
35+
InputFamily::AmdTurin => Self::AmdTurin,
36+
InputFamily::AmdTurinDense => Self::AmdTurinDense,
37+
}
38+
}
39+
}
40+
41+
impl From<SledCpuFamily> for nexus_sled_agent_shared::inventory::SledCpuFamily {
42+
fn from(value: SledCpuFamily) -> Self {
43+
match value {
44+
SledCpuFamily::Unknown => Self::Unknown,
45+
SledCpuFamily::AmdMilan => Self::AmdMilan,
46+
SledCpuFamily::AmdTurin => Self::AmdTurin,
47+
SledCpuFamily::AmdTurinDense => Self::AmdTurinDense,
48+
}
49+
}
50+
}

nexus/db-queries/src/db/datastore/crucible_dataset.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ mod test {
294294
use crate::db::pub_test_utils::TestDatabase;
295295
use nexus_db_model::Generation;
296296
use nexus_db_model::SledBaseboard;
297+
use nexus_db_model::SledCpuFamily;
297298
use nexus_db_model::SledSystemHardware;
298299
use nexus_db_model::SledUpdate;
299300
use omicron_common::api::external::ByteCount;
@@ -323,6 +324,7 @@ mod test {
323324
usable_hardware_threads: 128,
324325
usable_physical_ram: (64 << 30).try_into().unwrap(),
325326
reservoir_size: (16 << 30).try_into().unwrap(),
327+
cpu_family: SledCpuFamily::AmdMilan,
326328
},
327329
Uuid::new_v4(),
328330
Generation::new(),

nexus/db-queries/src/db/datastore/inventory.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1444,6 +1444,8 @@ impl DataStore {
14441444
sled_agent.usable_physical_ram,
14451445
)
14461446
.into_sql::<diesel::sql_types::Int8>(),
1447+
nexus_db_model::SledCpuFamily::from(sled_agent.cpu_family)
1448+
.into_sql::<nexus_db_schema::enums::SledCpuFamilyEnum>(),
14471449
nexus_db_model::ByteCount::from(
14481450
sled_agent.reservoir_size,
14491451
)
@@ -1498,6 +1500,7 @@ impl DataStore {
14981500
sa_dsl::sled_role,
14991501
sa_dsl::usable_hardware_threads,
15001502
sa_dsl::usable_physical_ram,
1503+
sa_dsl::cpu_family,
15011504
sa_dsl::reservoir_size,
15021505
sa_dsl::ledgered_sled_config,
15031506
sa_dsl::reconciler_status_kind,
@@ -1529,6 +1532,7 @@ impl DataStore {
15291532
_sled_role,
15301533
_usable_hardware_threads,
15311534
_usable_physical_ram,
1535+
_cpu_family,
15321536
_reservoir_size,
15331537
_ledgered_sled_config,
15341538
_reconciler_status_kind,
@@ -3958,6 +3962,7 @@ impl DataStore {
39583962
sled_role: s.sled_role.into(),
39593963
usable_hardware_threads: u32::from(s.usable_hardware_threads),
39603964
usable_physical_ram: s.usable_physical_ram.into(),
3965+
cpu_family: s.cpu_family.into(),
39613966
reservoir_size: s.reservoir_size.into(),
39623967
// For disks, zpools, and datasets, the map for a sled ID is
39633968
// only populated if there is at least one disk/zpool/dataset

0 commit comments

Comments
 (0)