@@ -86,6 +86,13 @@ type SlurmClusterSpec struct {
8686 // +kubebuilder:validation:Optional
8787 // +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", epilog: "", prolog: "", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
8888 SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
89+
90+ // MPIConfig represents the PMIx configuration in mpi.conf. Not all options are supported.
91+ //
92+ // +kubebuilder:validation:Optional
93+ // +kubebuilder:default={pmixEnv: "OMPI_MCA_btl_tcp_if_include=eth0"}
94+ MPIConfig MPIConfig `json:"mpiConfig,omitempty"`
95+
8996 // Generate and set default AppArmor profile for the Slurm worker and login nodes. The Security Profiles Operator must be installed.
9097 //
9198 // +kubebuilder:default=false
@@ -143,6 +150,16 @@ type SlurmConfig struct {
143150 MinJobAge * int32 `json:"minJobAge,omitempty"`
144151}
145152
// MPIConfig holds the PMIx-related options of mpi.conf that can be set through the cluster spec.
// Not all mpi.conf options are supported.
type MPIConfig struct {
	// Semicolon separated list of environment variables to be set in job environments to be used by PMIx.
	// Defaults to "OMPI_MCA_btl_tcp_if_include=eth0" to prevent OpenMPI from selecting the "lo" and "docker" interfaces.
	//
	// +kubebuilder:validation:Optional
	// +kubebuilder:default="OMPI_MCA_btl_tcp_if_include=eth0"
	PMIxEnv string `json:"pmixEnv,omitempty"`
}
162+
146163type PartitionConfiguration struct {
147164 // ConfigType
148165 // +kubebuilder:validation:Enum=default;custom
@@ -319,7 +336,8 @@ type NCCLArguments struct {
319336 // +kubebuilder:default="0"
320337 ThresholdMoreThan string `json:"thresholdMoreThan,omitempty"`
321338
322- // UseInfiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test
339+ // UseInfiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test.
340+ // According to NVIDIA these env vars should be used only for debugging.
323341 // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
324342 //
325343 // +kubebuilder:validation:Optional
0 commit comments