Skip to content

Commit 1b65493

Browse files
authored
Merge pull request #389 from nebius/many-small-changes-aggregated/1
Pre-create enroot credentials, exclude enroot bind-mounts from motd, make Docker mount /dev/infiniband, store /tmp on disk and add tmpfs /mnt/memory
2 parents e1adc1b + 16feb1a commit 1b65493

File tree

41 files changed

+274
-119
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+274
-119
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.17.0
1+
1.18.0

api/v1/slurmcluster_types.go

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,13 @@ type SlurmClusterSpec struct {
8686
// +kubebuilder:validation:Optional
8787
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", epilog: "", prolog: "", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
8888
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
89+
90+
// MPIConfig represents the PMIx configuration in mpi.conf. Not all options are supported.
91+
//
92+
// +kubebuilder:validation:Optional
93+
// +kubebuilder:default={pmixEnv: "OMPI_MCA_btl_tcp_if_include=eth0"}
94+
MPIConfig MPIConfig `json:"mpiConfig,omitempty"`
95+
8996
// Generate and set default AppArmor profile for the Slurm worker and login nodes. The Security Profiles Operator must be installed.
9097
//
9198
// +kubebuilder:default=false
@@ -143,6 +150,16 @@ type SlurmConfig struct {
143150
MinJobAge *int32 `json:"minJobAge,omitempty"`
144151
}
145152

153+
type MPIConfig struct {
154+
// Semicolon-separated list of environment variables to be set in job environments to be used by PMIx.
155+
// Defaults to "OMPI_MCA_btl_tcp_if_include=eth0" to prevent the "lo" and "docker" interfaces from being selected by OpenMPI.
156+
//
157+
// +kubebuilder:validation:Optional
158+
// +kubebuilder:default="OMPI_MCA_btl_tcp_if_include=eth0"
159+
// +kubebuilder:validation:Optional
160+
PMIxEnv string `json:"pmixEnv,omitempty"`
161+
}
162+
146163
type PartitionConfiguration struct {
147164
// ConfigType
148165
// +kubebuilder:validation:Enum=default;custom
@@ -319,7 +336,8 @@ type NCCLArguments struct {
319336
// +kubebuilder:default="0"
320337
ThresholdMoreThan string `json:"thresholdMoreThan,omitempty"`
321338

322-
// UseInfiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test
339+
// UseInfiniband defines whether the NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables are used for the test.
340+
// According to NVIDIA these env vars should be used only for debugging.
323341
// https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
324342
//
325343
// +kubebuilder:validation:Optional

api/v1/zz_generated.deepcopy.go

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/slurm.nebius.ai_slurmclusters.yaml

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,6 +1082,19 @@ spec:
10821082
- downscaleAndOverwritePopulateJail
10831083
- skipPopulateJail
10841084
type: string
1085+
mpiConfig:
1086+
default:
1087+
pmixEnv: OMPI_MCA_btl_tcp_if_include=eth0
1088+
description: MPIConfig represents the PMIx configuration in mpi.conf.
1089+
Not all options are supported.
1090+
properties:
1091+
pmixEnv:
1092+
default: OMPI_MCA_btl_tcp_if_include=eth0
1093+
description: |-
1094+
Semicolon-separated list of environment variables to be set in job environments to be used by PMIx.
1095+
Defaults to "OMPI_MCA_btl_tcp_if_include=eth0" to prevent the "lo" and "docker" interfaces from being selected by OpenMPI.
1096+
type: string
1097+
type: object
10851098
ncclSettings:
10861099
description: NCCLSettings
10871100
properties:
@@ -1205,7 +1218,8 @@ spec:
12051218
useInfiniband:
12061219
default: true
12071220
description: |-
1208-
UseInfiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test
1221+
UseInfiniband defines whether the NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables are used for the test.
1222+
According to NVIDIA these env vars should be used only for debugging.
12091223
https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
12101224
type: boolean
12111225
type: object
@@ -1487,9 +1501,9 @@ spec:
14871501
defCpuPerGPU: 16
14881502
defMemPerNode: 1228800
14891503
epilog: ""
1490-
prolog: ""
14911504
maxJobCount: 10000
14921505
minJobAge: 86400
1506+
prolog: ""
14931507
taskPluginParam: ""
14941508
description: SlurmConfig represents the Slurm configuration in slurm.conf.
14951509
Not all options are supported.
@@ -1519,11 +1533,8 @@ spec:
15191533
type: integer
15201534
epilog:
15211535
default: ""
1522-
description: The Epilog script runs after a job completes
1523-
type: string
1524-
prolog:
1525-
default: ""
1526-
description: The Prolog script runs before a job starts on the compute node
1536+
description: Defines specific file to run the epilog when job
1537+
ends. Default value is no epilog
15271538
type: string
15281539
maxJobCount:
15291540
default: 10000
@@ -1536,6 +1547,11 @@ spec:
15361547
time
15371548
format: int32
15381549
type: integer
1550+
prolog:
1551+
default: ""
1552+
description: Defines specific file to run the prolog when job
1553+
starts. Default value is no prolog
1554+
type: string
15391555
taskPluginParam:
15401556
default: ""
15411557
description: Additional parameters for the task plugin

config/crd/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# It should be run by config/default
44
resources:
55
- bases/slurm.nebius.ai_slurmclusters.yaml
6-
- bases/slurm.nebius.ai_nodeconfigurators.yaml
6+
#- bases/slurm.nebius.ai_nodeconfigurators.yaml
77

88
#+kubebuilder:scaffold:crdkustomizeresource
99

config/manager/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ resources:
33
images:
44
- name: controller
55
newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator
6-
newTag: 1.17.0
6+
newTag: 1.18.0

config/manager/manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ spec:
8787
value: "false"
8888
- name: SLURM_OPERATOR_WATCH_NAMESPACES
8989
value: "*"
90-
image: controller:1.17.0
90+
image: controller:1.18.0
9191
imagePullPolicy: Always
9292
name: manager
9393
securityContext:

helm/slurm-cluster-storage/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ apiVersion: v2
22
name: helm-slurm-cluster-storage
33
description: A Helm chart for Kubernetes
44
type: application
5-
version: "1.17.0"
6-
appVersion: "1.17.0"
5+
version: "1.18.0"
6+
appVersion: "1.18.0"

helm/slurm-cluster-storage/templates/jail-pvc.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ kind: PersistentVolumeClaim
33
metadata:
44
namespace: {{ .Release.Namespace }}
55
name: {{ include "slurm-cluster-storage.volume.jail.pvc" . }}
6+
annotations:
7+
k8up.io/backup: 'true'
68
spec:
79
storageClassName: {{ include "slurm-cluster-storage.volume.jail.storageClass" . }}
810
resources:

helm/slurm-cluster/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@ apiVersion: v2
22
name: helm-slurm-cluster
33
description: A Helm chart for Kubernetes
44
type: application
5-
version: "1.17.0"
6-
appVersion: "1.17.0"
5+
version: "1.18.0"
6+
appVersion: "1.18.0"
77
kubeVersion: ">=1.29.0-0"

0 commit comments

Comments
 (0)