Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions bottlerocket-settings-models/modeled-types/src/kubernetes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ const IMAGE_GC_THRESHOLD_MIN: i32 = 0;
// Define the bounds for the `time-slicing.replicas` field
const TIME_SLICING_REPLICAS_MIN: i32 = 2;
const TIME_SLICING_REPLICAS_MAX: i32 = i32::MAX;
// Define the bounds for the `mps.replicas` field; both are used as the const parameters of
// the `BoundedI32` that backs `NvidiaMpsSettings::replicas`.
const MPS_REPLICAS_MIN: i32 = 2;
// 48 is the current max supported on Volta and later cards
const MPS_REPLICAS_MAX: i32 = 48;

/// KubernetesName represents a string that contains a valid Kubernetes resource name. It stores
/// the original string and makes it accessible through standard traits.
Expand Down Expand Up @@ -1559,6 +1563,7 @@ pub struct NvidiaDevicePluginSettings {
device_list_strategy: NvidiaDeviceListStrategy,
device_sharing_strategy: NvidiaDeviceSharingStrategy,
time_slicing: NvidiaTimeSlicingSettings,
mps: NvidiaMpsSettings,
device_partitioning_strategy: NvidiaDevicePartitioningStrategy,
mig: NvidiaMigSettings,
}
Expand Down Expand Up @@ -1612,6 +1617,7 @@ impl IntoIterator for NvidiaDeviceListStrategy {
pub enum NvidiaDeviceSharingStrategy {
None,
TimeSlicing,
Mps,
}

#[model(impl_default = true)]
Expand All @@ -1621,6 +1627,13 @@ pub struct NvidiaTimeSlicingSettings {
fail_requests_greater_than_one: bool,
}

/// NvidiaMpsSettings contains the settings for NVIDIA Multi-Process Service (MPS) GPU sharing.
///
/// The unit tests for this type read each field through `Option` accessors, so every field is
/// optional once the `model` attribute has been applied; an empty `mps` object is accepted.
#[model(impl_default = true)]
pub struct NvidiaMpsSettings {
    // Replica count for MPS sharing. Values outside `MPS_REPLICAS_MIN..=MPS_REPLICAS_MAX` are
    // rejected at deserialization time (the tests accept 2 and 48, and reject 1 and 49).
    replicas: BoundedI32<MPS_REPLICAS_MIN, MPS_REPLICAS_MAX>,
    // NOTE(review): presumably mirrors the `rename-by-default` flag of the time-slicing
    // settings; its effect is not visible from this file — confirm against the device plugin.
    rename_by_default: bool,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum NvidiaDevicePartitioningStrategy {
Expand Down Expand Up @@ -1762,6 +1775,7 @@ mod test_nvidia_device_plugins {
),),
device_sharing_strategy: None,
time_slicing: None,
mps: None,
device_partitioning_strategy: None,
mig: None
}
Expand All @@ -1785,6 +1799,7 @@ mod test_nvidia_device_plugins {
),),
device_sharing_strategy: Some(NvidiaDeviceSharingStrategy::TimeSlicing),
time_slicing: None,
mps: None,
device_partitioning_strategy: None,
mig: None
}
Expand Down Expand Up @@ -1816,6 +1831,7 @@ mod test_nvidia_device_plugins {
),),
device_sharing_strategy: None,
time_slicing: None,
mps: None,
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
mig: None
}
Expand All @@ -1840,6 +1856,7 @@ mod test_nvidia_device_plugins {
),),
device_sharing_strategy: None,
time_slicing: None,
mps: None,
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
mig: Some(NvidiaMigSettings {
profile: Some(HashMap::from([(
Expand Down Expand Up @@ -1943,4 +1960,40 @@ mod test_nvidia_device_plugins {
assert_eq!(results_5, test_json_5);
assert_eq!(results_6, test_json_6);
}

#[test]
fn test_sharing_strategy_accepts_mps() {
    // The kebab-case `mps` spelling must map onto the `Mps` sharing-strategy variant.
    let parsed =
        serde_json::from_str::<NvidiaDevicePluginSettings>(r#"{"device-sharing-strategy":"mps"}"#)
            .unwrap();
    let expected = Some(NvidiaDeviceSharingStrategy::Mps);
    assert_eq!(parsed.device_sharing_strategy, expected);
}

#[test]
fn test_mps_replicas_valid_range() {
    // Lower bound, an interior value, and the upper bound must all parse and report the
    // same number that was supplied.
    for expected in [2, 24, 48] {
        let payload = format!(r#"{{"mps":{{"replicas":{}}}}}"#, expected);
        let parsed = serde_json::from_str::<NvidiaDevicePluginSettings>(&payload).unwrap();
        let mps = parsed.mps.as_ref().unwrap();
        let bounded = mps.replicas.as_ref().unwrap();
        assert_eq!(bounded.get(), expected);
    }
}

#[test]
fn test_mps_replicas_rejects_out_of_range() {
    // Values just below the minimum, a negative value, and one past the maximum must all
    // fail to deserialize.
    let rejected = [0, 1, -1, 49];
    for candidate in rejected {
        let payload = format!(r#"{{"mps":{{"replicas":{}}}}}"#, candidate);
        let outcome = serde_json::from_str::<NvidiaDevicePluginSettings>(&payload);
        assert!(outcome.is_err(), "replicas={} should be rejected", candidate);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ impl SettingsModel for KubeletDevicePluginsV1 {
}

// Accepts every update unconditionally: both arguments are ignored and `Ok(())` is returned,
// so any value that has already deserialized as this model is considered settable.
fn set(_current_value: Option<Self>, _target: Self) -> Result<()> {
    // Set anything that can be parsed as KubeletDevicePluginsV1.
    Ok(())
}

Expand All @@ -45,7 +45,8 @@ mod test {
use bottlerocket_modeled_types::{
MigProfile, NvidiaDeviceIdStrategy, NvidiaDeviceListStrategy,
NvidiaDeviceListStrategyValues, NvidiaDevicePartitioningStrategy,
NvidiaDeviceSharingStrategy, NvidiaGpuModel, NvidiaMigSettings, NvidiaTimeSlicingSettings,
NvidiaDeviceSharingStrategy, NvidiaGpuModel, NvidiaMigSettings, NvidiaMpsSettings,
NvidiaTimeSlicingSettings,
};
use bounded_integer::BoundedI32;
use std::collections::HashMap;
Expand All @@ -59,9 +60,24 @@ mod test {
);
}

#[test]
fn test_serde_kubelet_device_plugins_with_mps() {
    // An `nvidia` block that selects the `mps` sharing strategy and supplies `mps.replicas`.
    let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"mps","mps":{"replicas":4},"device-partitioning-strategy":"none"}}"#;

    let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap();
    let nvidia = device_plugins.nvidia.as_ref().unwrap();

    // The strategy string must resolve to the `Mps` variant.
    assert_eq!(
        nvidia.device_sharing_strategy,
        Some(NvidiaDeviceSharingStrategy::Mps)
    );

    // The input also carries `mps.replicas`; verify it was parsed rather than dropped, so a
    // regression that ignores the `mps` section cannot pass this test unnoticed.
    let replicas = nvidia
        .mps
        .as_ref()
        .unwrap()
        .replicas
        .as_ref()
        .unwrap()
        .get();
    assert_eq!(replicas, 4);
}

#[test]
fn test_serde_kubelet_device_plugins_vec() {
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":["volume-mounts","envvar"],"device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":["volume-mounts","envvar"],"device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"mps":{},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;

let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap();
assert_eq!(
Expand All @@ -80,6 +96,7 @@ mod test {
rename_by_default: Some(true),
fail_requests_greater_than_one: Some(true),
}),
mps: Some(NvidiaMpsSettings::default()),
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
mig: Some(NvidiaMigSettings {
profile: Some(HashMap::from([(
Expand All @@ -90,14 +107,13 @@ mod test {
})
}
);

let results = serde_json::to_string(&device_plugins).unwrap();
assert_eq!(results, test_json);
let serialized = serde_json::to_string(&device_plugins).unwrap();
assert_eq!(serialized, test_json);
}

#[test]
fn test_serde_kubelet_device_plugins_scalar() {
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"mps":{},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;

let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap();
assert_eq!(
Expand All @@ -115,6 +131,7 @@ mod test {
rename_by_default: Some(true),
fail_requests_greater_than_one: Some(true),
}),
mps: Some(NvidiaMpsSettings::default()),
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
mig: Some(NvidiaMigSettings {
profile: Some(HashMap::from([(
Expand All @@ -125,8 +142,7 @@ mod test {
})
}
);

let results = serde_json::to_string(&device_plugins).unwrap();
assert_eq!(results, test_json);
let serialized = serde_json::to_string(&device_plugins).unwrap();
assert_eq!(serialized, test_json);
}
}