Skip to content

Commit 4954450

Browse files
authored
Merge pull request #107 from yeazelm/add_mps
feat: add MPS GPU sharing settings model
2 parents 479b7bf + 741fcf2 commit 4954450

File tree

2 files changed

+79
-10
lines changed
  • bottlerocket-settings-models

2 files changed

+79
-10
lines changed

bottlerocket-settings-models/modeled-types/src/kubernetes.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ const IMAGE_GC_THRESHOLD_MIN: i32 = 0;
2424
// Define the bounds for the `time-slicing.replicas` field
2525
const TIME_SLICING_REPLICAS_MIN: i32 = 2;
2626
const TIME_SLICING_REPLICAS_MAX: i32 = i32::MAX;
27+
// Define the bounds for the `mps.replicas` field
28+
const MPS_REPLICAS_MIN: i32 = 2;
29+
// 48 is the current maximum number of concurrent MPS clients per GPU supported on Volta and later cards
30+
const MPS_REPLICAS_MAX: i32 = 48;
2731

2832
/// KubernetesName represents a string that contains a valid Kubernetes resource name. It stores
2933
/// the original string and makes it accessible through standard traits.
@@ -1559,6 +1563,7 @@ pub struct NvidiaDevicePluginSettings {
15591563
device_list_strategy: NvidiaDeviceListStrategy,
15601564
device_sharing_strategy: NvidiaDeviceSharingStrategy,
15611565
time_slicing: NvidiaTimeSlicingSettings,
1566+
mps: NvidiaMpsSettings,
15621567
device_partitioning_strategy: NvidiaDevicePartitioningStrategy,
15631568
mig: NvidiaMigSettings,
15641569
}
@@ -1612,6 +1617,7 @@ impl IntoIterator for NvidiaDeviceListStrategy {
16121617
pub enum NvidiaDeviceSharingStrategy {
16131618
None,
16141619
TimeSlicing,
1620+
Mps,
16151621
}
16161622

16171623
#[model(impl_default = true)]
@@ -1621,6 +1627,13 @@ pub struct NvidiaTimeSlicingSettings {
16211627
fail_requests_greater_than_one: bool,
16221628
}
16231629

1630+
/// NvidiaMpsSettings contains the settings for NVIDIA Multi-Process Service (MPS) GPU sharing.
1631+
#[model(impl_default = true)]
1632+
pub struct NvidiaMpsSettings {
1633+
replicas: BoundedI32<MPS_REPLICAS_MIN, MPS_REPLICAS_MAX>,
1634+
rename_by_default: bool,
1635+
}
1636+
16241637
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
16251638
#[serde(rename_all = "lowercase")]
16261639
pub enum NvidiaDevicePartitioningStrategy {
@@ -1768,6 +1781,7 @@ mod test_nvidia_device_plugins {
17681781
),),
17691782
device_sharing_strategy: None,
17701783
time_slicing: None,
1784+
mps: None,
17711785
device_partitioning_strategy: None,
17721786
mig: None
17731787
}
@@ -1791,6 +1805,7 @@ mod test_nvidia_device_plugins {
17911805
),),
17921806
device_sharing_strategy: Some(NvidiaDeviceSharingStrategy::TimeSlicing),
17931807
time_slicing: None,
1808+
mps: None,
17941809
device_partitioning_strategy: None,
17951810
mig: None
17961811
}
@@ -1822,6 +1837,7 @@ mod test_nvidia_device_plugins {
18221837
),),
18231838
device_sharing_strategy: None,
18241839
time_slicing: None,
1840+
mps: None,
18251841
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
18261842
mig: None
18271843
}
@@ -1846,6 +1862,7 @@ mod test_nvidia_device_plugins {
18461862
),),
18471863
device_sharing_strategy: None,
18481864
time_slicing: None,
1865+
mps: None,
18491866
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
18501867
mig: Some(NvidiaMigSettings {
18511868
profile: Some(HashMap::from([(
@@ -1949,4 +1966,40 @@ mod test_nvidia_device_plugins {
19491966
assert_eq!(results_5, test_json_5);
19501967
assert_eq!(results_6, test_json_6);
19511968
}
1969+
1970+
#[test]
1971+
fn test_sharing_strategy_accepts_mps() {
1972+
let json = r#"{"device-sharing-strategy":"mps"}"#;
1973+
let settings: NvidiaDevicePluginSettings = serde_json::from_str(json).unwrap();
1974+
assert_eq!(
1975+
settings.device_sharing_strategy,
1976+
Some(NvidiaDeviceSharingStrategy::Mps)
1977+
);
1978+
}
1979+
1980+
#[test]
1981+
fn test_mps_replicas_valid_range() {
1982+
for replicas in [2, 24, 48] {
1983+
let json = format!(r#"{{"mps":{{"replicas":{}}}}}"#, replicas);
1984+
let settings: NvidiaDevicePluginSettings = serde_json::from_str(&json).unwrap();
1985+
let actual = settings
1986+
.mps
1987+
.as_ref()
1988+
.unwrap()
1989+
.replicas
1990+
.as_ref()
1991+
.unwrap()
1992+
.get();
1993+
assert_eq!(actual, replicas);
1994+
}
1995+
}
1996+
1997+
#[test]
1998+
fn test_mps_replicas_rejects_out_of_range() {
1999+
for replicas in [0, 1, -1, 49] {
2000+
let json = format!(r#"{{"mps":{{"replicas":{}}}}}"#, replicas);
2001+
let result: Result<NvidiaDevicePluginSettings, _> = serde_json::from_str(&json);
2002+
assert!(result.is_err(), "replicas={} should be rejected", replicas);
2003+
}
2004+
}
19522005
}

bottlerocket-settings-models/settings-extensions/kubelet-device-plugins/src/lib.rs

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ impl SettingsModel for KubeletDevicePluginsV1 {
2020
}
2121

2222
fn set(_current_value: Option<Self>, _target: Self) -> Result<()> {
23-
// Set anything that can be parsed as ECSSettingsV1.
23+
// Set anything that can be parsed as KubeletDevicePluginsV1.
2424
Ok(())
2525
}
2626

@@ -45,7 +45,8 @@ mod test {
4545
use bottlerocket_modeled_types::{
4646
MigProfile, NvidiaDeviceIdStrategy, NvidiaDeviceListStrategy,
4747
NvidiaDeviceListStrategyValues, NvidiaDevicePartitioningStrategy,
48-
NvidiaDeviceSharingStrategy, NvidiaGpuModel, NvidiaMigSettings, NvidiaTimeSlicingSettings,
48+
NvidiaDeviceSharingStrategy, NvidiaGpuModel, NvidiaMigSettings, NvidiaMpsSettings,
49+
NvidiaTimeSlicingSettings,
4950
};
5051
use bounded_integer::BoundedI32;
5152
use std::collections::HashMap;
@@ -59,9 +60,24 @@ mod test {
5960
);
6061
}
6162

63+
#[test]
64+
fn test_serde_kubelet_device_plugins_with_mps() {
65+
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"mps","mps":{"replicas":4},"device-partitioning-strategy":"none"}}"#;
66+
67+
let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap();
68+
assert_eq!(
69+
device_plugins
70+
.nvidia
71+
.as_ref()
72+
.unwrap()
73+
.device_sharing_strategy,
74+
Some(NvidiaDeviceSharingStrategy::Mps)
75+
);
76+
}
77+
6278
#[test]
6379
fn test_serde_kubelet_device_plugins_vec() {
64-
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":["volume-mounts","envvar"],"device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;
80+
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":["volume-mounts","envvar"],"device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"mps":{},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;
6581

6682
let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap();
6783
assert_eq!(
@@ -80,6 +96,7 @@ mod test {
8096
rename_by_default: Some(true),
8197
fail_requests_greater_than_one: Some(true),
8298
}),
99+
mps: Some(NvidiaMpsSettings::default()),
83100
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
84101
mig: Some(NvidiaMigSettings {
85102
profile: Some(HashMap::from([(
@@ -90,14 +107,13 @@ mod test {
90107
})
91108
}
92109
);
93-
94-
let results = serde_json::to_string(&device_plugins).unwrap();
95-
assert_eq!(results, test_json);
110+
let serialized = serde_json::to_string(&device_plugins).unwrap();
111+
assert_eq!(serialized, test_json);
96112
}
97113

98114
#[test]
99115
fn test_serde_kubelet_device_plugins_scalar() {
100-
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;
116+
let test_json = r#"{"nvidia":{"pass-device-specs":true,"device-id-strategy":"index","device-list-strategy":"volume-mounts","device-sharing-strategy":"time-slicing","time-slicing":{"replicas":2,"rename-by-default":true,"fail-requests-greater-than-one":true},"mps":{},"device-partitioning-strategy":"mig","mig":{"profile":{"a100.40gb":"1g.5gb"}}}}"#;
101117

102118
let device_plugins: KubeletDevicePluginsV1 = serde_json::from_str(test_json).unwrap();
103119
assert_eq!(
@@ -115,6 +131,7 @@ mod test {
115131
rename_by_default: Some(true),
116132
fail_requests_greater_than_one: Some(true),
117133
}),
134+
mps: Some(NvidiaMpsSettings::default()),
118135
device_partitioning_strategy: Some(NvidiaDevicePartitioningStrategy::MIG),
119136
mig: Some(NvidiaMigSettings {
120137
profile: Some(HashMap::from([(
@@ -125,8 +142,7 @@ mod test {
125142
})
126143
}
127144
);
128-
129-
let results = serde_json::to_string(&device_plugins).unwrap();
130-
assert_eq!(results, test_json);
145+
let serialized = serde_json::to_string(&device_plugins).unwrap();
146+
assert_eq!(serialized, test_json);
131147
}
132148
}

0 commit comments

Comments
 (0)