55 "k8s.io/apimachinery/pkg/api/meta"
66 "k8s.io/apimachinery/pkg/api/resource"
77 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
8+ "nebius.ai/slurm-operator/internal/consts"
89
910 mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1"
1011 prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
@@ -22,12 +23,17 @@ type SlurmClusterSpec struct {
2223 // +kubebuilder:validation:Optional
2324 // +kubebuilder:default="gpu"
2425 ClusterType string `json:"clusterType,omitempty"`
25-
26- // Pause defines whether to gracefully stop the cluster.
27- // Setting it to false after cluster has been paused starts the cluster back
26+ // Maintenance defines the maintenance mode for the cluster.
27+ // It can have the following values:
28+ // - none: No maintenance is performed. The cluster operates normally.
29+ // - downscale: Scales down all components to 0.
29+ // - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the populateJail Kubernetes Job.
31+ // - skipPopulateJail: Skips the execution of the populateJail job during maintenance.
2832 //
2933 // +kubebuilder:validation:Optional
30- Pause bool `json:"pause,omitempty"` // TODO cluster pausing/resuming
34+ // +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;skipPopulateJail
35+ // +kubebuilder:default="none"
36+ Maintenance * consts.MaintenanceMode `json:"maintenance,omitempty"`
3137
3238 // NCCLSettings
3339 // +kubebuilder:validation:Optional
@@ -77,8 +83,12 @@ type SlurmClusterSpec struct {
7783 // SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
7884 //
7985 // +kubebuilder:validation:Optional
80- // +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "Verbose ", maxJobCount: 10000, minJobAge: 86400}
86+ // +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
8187 SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
88+ // UseDefaultAppArmorProfile enables generating and setting the default AppArmor profile for the Slurm worker and login nodes. The Security Profiles Operator must be installed.
89+ //
90+ // +kubebuilder:default=false
91+ UseDefaultAppArmorProfile bool `json:"useDefaultAppArmorProfile,omitempty"`
8292}
8393
8494// SlurmConfig represents the Slurm configuration in slurm.conf
@@ -107,8 +117,8 @@ type SlurmConfig struct {
107117 // Additional parameters for the task plugin
108118 //
109119 // +kubebuilder:validation:Optional
110- // +kubebuilder:default="Verbose "
111- // +kubebuilder:validation:Pattern="^(( None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$"
120+ // +kubebuilder:default=""
121+ // +kubebuilder:validation:Pattern="^(|(( None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+) $"
112122 TaskPluginParam * string `json:"taskPluginParam,omitempty"`
113123 // Keep N last jobs in controller memory
114124 //
@@ -140,7 +150,7 @@ type NCCLSettings struct {
140150
141151 // TopologyType defines the type of NCCL GPU topology
142152 //
143- // +kubebuilder:validation:Enum="H100 GPU cluster"; auto;custom
153+ // +kubebuilder:validation:Enum=auto;custom
144154 // +kubebuilder:validation:Optional
145155 // +kubebuilder:default="auto"
146156 TopologyType string `json:"topologyType,omitempty"`
@@ -227,7 +237,7 @@ type NCCLBenchmark struct {
227237 // FailedJobsHistoryLimit defines the number of failed finished jobs to retain
228238 //
229239 // +kubebuilder:validation:Optional
230- // +kubebuilder:default=3
240+ // +kubebuilder:default=16
231241 FailedJobsHistoryLimit int32 `json:"failedJobsHistoryLimit,omitempty"`
232242
233243 // Image defines the nccl container image
@@ -585,6 +595,9 @@ type AccountingSlurmConf struct {
585595 // +kubebuilder:default=0
586596 PriorityWeightFairshare * int16 `json:"priorityWeightFairshare,omitempty"`
587597 // +kubebuilder:validation:Optional
598+ // +kubebuilder:default=0
599+ PriorityWeightQOS * int16 `json:"priorityWeightQOS,omitempty"`
600+ // +kubebuilder:validation:Optional
588601 PriorityWeightTRES * string `json:"priorityWeightTRES,omitempty"`
589602}
590603
@@ -640,6 +653,11 @@ type SlurmNodeWorker struct {
640653 // +kubebuilder:validation:Optional
641654 SupervisordConfigMapRefName string `json:"supervisordConfigMapRefName,omitempty"`
642655
656+ // SSHDConfigMapRefName is the name of the config for SSHD, which runs in the slurmd container
657+ //
658+ // +kubebuilder:validation:Optional
659+ SSHDConfigMapRefName string `json:"sshdConfigMapRefName,omitempty"`
660+
643661 // Volumes represents the volume configurations for the worker node
644662 //
645663 // +kubebuilder:validation:Required
@@ -713,6 +731,11 @@ type SlurmNodeLogin struct {
713731 // +kubebuilder:validation:Optional
714732 SshdServiceAnnotations map [string ]string `json:"sshdServiceAnnotations,omitempty"`
715733
734+ // SSHDConfigMapRefName is the name of the config for SSHD, which runs in the login container
735+ //
736+ // +kubebuilder:validation:Optional
737+ SSHDConfigMapRefName string `json:"sshdConfigMapRefName,omitempty"`
738+
716739 // SshRootPublicKeys represents the list of public authorized_keys for SSH connection to Slurm login nodes
717740 //
718741 // +kubebuilder:validation:Required
@@ -871,11 +894,30 @@ type NodeVolumeJailSubMount struct {
871894 // +kubebuilder:validation:Required
872895 MountPath string `json:"mountPath"`
873896
897+ // SubPath points to a specific entry inside the volume.
898+ // Corresponds to the subPath field in the K8s volumeMount structure.
899+ // See official docs for details: https://kubernetes.io/docs/concepts/storage/volumes/#using-subpath
900+ //
901+ // +kubebuilder:validation:Optional
902+ // +kubebuilder:default=""
903+ SubPath string `json:"subPath"`
904+
905+ // ReadOnly defines whether the mount point should be read-only
906+ //
907+ // +kubebuilder:validation:Optional
908+ // +kubebuilder:default=false
909+ ReadOnly bool `json:"readOnly"`
910+
874911 // VolumeSourceName defines the name of the volume source for the sub-mount.
875912 // Must correspond to the name of one of [VolumeSource]
876913 //
877- // +kubebuilder:validation:Required
878- VolumeSourceName string `json:"volumeSourceName"`
914+ // +kubebuilder:validation:Optional
915+ VolumeSourceName * string `json:"volumeSourceName"`
916+
917+ // VolumeClaimTemplateSpec defines the [corev1.PersistentVolumeClaim] template specification
918+ //
919+ // +kubebuilder:validation:Optional
920+ VolumeClaimTemplateSpec * corev1.PersistentVolumeClaimSpec `json:"volumeClaimTemplateSpec,omitempty"`
879921}
880922
881923type Telemetry struct {
0 commit comments