@@ -28,10 +28,11 @@ type SlurmClusterSpec struct {
2828 // - none: No maintenance is performed. The cluster operates normally.
2929 // - downscale: Scales down all components to 0.
3030 // - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the kubernetes Kind Jobs populateJail.
31+ // - downscaleAndOverwritePopulateJail: Scales down all components to 0 and overwrites populateJail (same as overwrite=true).
3132 // - skipPopulateJail: Skips the execution of the populateJail job during maintenance.
3233 //
3334 // +kubebuilder:validation:Optional
34- // +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;skipPopulateJail
35+ // +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;downscaleAndOverwritePopulateJail;skipPopulateJail
3536 // +kubebuilder:default="none"
3637 Maintenance * consts.MaintenanceMode `json:"maintenance,omitempty"`
3738
@@ -83,8 +84,15 @@ type SlurmClusterSpec struct {
8384 // SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
8485 //
8586 // +kubebuilder:validation:Optional
86- // +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
87+ // +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", epilog: "", prolog: "", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
8788 SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
89+
90+ // MPIConfig represents the PMIx configuration in mpi.conf. Not all options are supported.
91+ //
92+ // +kubebuilder:validation:Optional
93+ // +kubebuilder:default={pmixEnv: "OMPI_MCA_btl_tcp_if_include=eth0"}
94+ MPIConfig MPIConfig `json:"mpiConfig,omitempty"`
95+
8896 // Generate and set default AppArmor profile for the Slurm worker and login nodes. The Security Profiles Operator must be installed.
8997 //
9098 // +kubebuilder:default=false
@@ -114,6 +122,16 @@ type SlurmConfig struct {
114122 // +kubebuilder:default="Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs"
115123 // +kubebuilder:validation:Pattern="^((Accrue|Agent|AuditRPCs|Backfill|BackfillMap|BurstBuffer|Cgroup|ConMgr|CPU_Bind|CpuFrequency|Data|DBD_Agent|Dependency|Elasticsearch|Energy|Federation|FrontEnd|Gres|Hetjob|Gang|GLOB_SILENCE|JobAccountGather|JobComp|JobContainer|License|Network|NetworkRaw|NodeFeatures|NO_CONF_HASH|Power|Priority|Profile|Protocol|Reservation|Route|Script|SelectType|Steps|Switch|TLS|TraceJobs|Triggers)(,)?)+$"
116124 DebugFlags * string `json:"debugFlags,omitempty"`
125+ // Defines the file to run as the epilog when a job ends. By default, no epilog is set.
126+ //
127+ // +kubebuilder:validation:Optional
128+ // +kubebuilder:default=""
129+ Epilog * string `json:"epilog,omitempty"`
130+ // Defines the file to run as the prolog when a job starts. By default, no prolog is set.
131+ //
132+ // +kubebuilder:validation:Optional
133+ // +kubebuilder:default=""
134+ Prolog * string `json:"prolog,omitempty"`
117135 // Additional parameters for the task plugin
118136 //
119137 // +kubebuilder:validation:Optional
@@ -132,6 +150,16 @@ type SlurmConfig struct {
132150 MinJobAge * int32 `json:"minJobAge,omitempty"`
133151}
134152
// MPIConfig represents the PMIx configuration in mpi.conf. Not all options are supported.
type MPIConfig struct {
	// PMIxEnv is a semicolon-separated list of environment variables to be set in job environments to be used by PMIx.
	// Defaults to "OMPI_MCA_btl_tcp_if_include=eth0" to prevent the "lo" and "docker" interfaces from being selected by OpenMPI.
	//
	// +kubebuilder:validation:Optional
	// +kubebuilder:default="OMPI_MCA_btl_tcp_if_include=eth0"
	PMIxEnv string `json:"pmixEnv,omitempty"`
}
162+
135163type PartitionConfiguration struct {
136164 // ConfigType
137165 // +kubebuilder:validation:Enum=default;custom
@@ -308,7 +336,8 @@ type NCCLArguments struct {
308336 // +kubebuilder:default="0"
309337 ThresholdMoreThan string `json:"thresholdMoreThan,omitempty"`
310338
311- // UseInfiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test
339+ // UseInfiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test.
340+ // According to NVIDIA these env vars should be used only for debugging.
312341 // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
313342 //
314343 // +kubebuilder:validation:Optional
@@ -570,6 +599,8 @@ type SlurmdbdConfig struct {
570599
571600type AccountingSlurmConf struct {
572601 // +kubebuilder:validation:Optional
602+ // +kubebuilder:validation:Pattern="^((Billing|CPU|Mem|VMem|Node|Energy|Pages|FS/Disk|FS/Lustre|Gres/gpu|Gres/gpu:tesla|Gres/gpu:volta)(,)?)+$"
603+ // +kubebuilder:default="Billing,CPU,Mem,Node,VMem"
573604 AccountingStorageTRES * string `json:"accountingStorageTRES,omitempty"`
574605 // +kubebuilder:validation:Optional
575606 AccountingStoreFlags * string `json:"accountingStoreFlags,omitempty"`
@@ -581,6 +612,7 @@ type AccountingSlurmConf struct {
581612 AcctGatherProfileType * string `json:"acctGatherProfileType,omitempty"`
582613 // +kubebuilder:validation:Optional
583614 // +kubebuilder:validation:Enum="jobacct_gather/linux";"jobacct_gather/cgroup";"jobacct_gather/none"
615+ // +kubebuilder:default="jobacct_gather/cgroup"
584616 JobAcctGatherType * string `json:"jobAcctGatherType,omitempty"`
585617 // +kubebuilder:validation:Optional
586618 // +kubebuilder:default=30
@@ -681,6 +713,68 @@ type SlurmNodeWorker struct {
681713 //
682714 // +kubebuilder:validation:Optional
683715 SlurmNodeExtra string `json:"slurmNodeExtra,omitempty"`
716+
717+ // PriorityClass defines the priority class for the Slurm worker node
718+ //
719+ // +kubebuilder:validation:Optional
720+ PriorityClass string `json:"priorityClass,omitempty"`
721+ // It's an alpha feature and will be moved to a separate CRD in the future
722+ // Rebooter defines the configuration for the Slurm worker node rebooter
723+ //
724+ // +kubebuilder:validation:Optional
725+ Rebooter Rebooter `json:"rebooter"`
726+ }
727+
728+ // Rebooter defines the configuration for the Slurm worker node rebooter
729+ type Rebooter struct {
730+ // enabled defines whether the rebooter is enabled
731+ //
732+ // +kubebuilder:validation:Optional
733+ // +kubebuilder:default=false
734+ Enabled bool `json:"enabled"`
735+
736+ // Image defines the rebooter container image
737+ //
738+ // +kubebuilder:validation:Optional
739+ Image string `json:"image"`
740+
741+ // imagePullPolicy defines the image pull policy
742+ //
743+ // +kubebuilder:validation:Enum=Always;Never;IfNotPresent
744+ // +kubebuilder:validation:Optional
745+ // +kubebuilder:default="IfNotPresent"
746+ ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"`
747+
748+ // Resources defines the [corev1.ResourceRequirements] for the container
749+ //
750+ // +kubebuilder:validation:Optional
751+ Resources corev1.ResourceList `json:"resources,omitempty"`
752+
753+ // evictionMethod defines the method of eviction for the Slurm worker node
754+ // Must be one of [drain, evict]. Now only evict is supported
755+ //
756+ // +kubebuilder:validation:Optional
757+ // +kubebuilder:validation:Enum="evict"
758+ // +kubebuilder:default="evict"
759+ EvictionMethod string `json:"evictionMethod,omitempty"`
760+
761+ // logLevel defines the log level for the rebooter
762+ //
763+ // +kubebuilder:validation:Optional
764+ // +kubebuilder:default="info"
765+ // +kubebuilder:validation:Enum="debug";"info";"warn";"error"
766+ LogLevel string `json:"logLevel,omitempty"`
767+
768+ // Namespace defines the namespace where the rebooter will be deployed
769+ // By default, the same namespace as the soperator
770+ //
771+ // +kubebuilder:validation:Optional
772+ Namespace string `json:"namespace,omitempty"`
773+
774+ // serviceAccountName defines the service account name for the rebooter
775+ //
776+ // +kubebuilder:validation:Optional
777+ ServiceAccountName string `json:"serviceAccountName,omitempty"`
684778}
685779
686780// SlurmNodeWorkerVolumes defines the volumes for the Slurm worker node
@@ -1019,6 +1113,7 @@ const (
10191113 ConditionClusterWorkersAvailable = "WorkersAvailable"
10201114 ConditionClusterLoginAvailable = "LoginAvailable"
10211115 ConditionClusterAccountingAvailable = "AccountingAvailable"
1116+ ConditionClusterPopulateJailMode = "PopulateJailMode"
10221117
10231118 PhaseClusterReconciling = "Reconciling"
10241119 PhaseClusterNotAvailable = "Not available"
0 commit comments