Skip to content

Commit 9b33f35

Browse files
authored
Merge pull request #308 from nebius/dev
Soperator release 1.17.0
2 parents 3d3bfb0 + 988a816 commit 9b33f35

File tree

103 files changed

+3203
-1206
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+3203
-1206
lines changed

CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# Global code owners (applies to the whole repo)
2-
* @dstaroff @asteny @rdjjke @Uburro
2+
* @dstaroff @asteny @rdjjke @Uburro @itechdima

Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM golang:1.23@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 AS operator_builder
1+
FROM golang:1.23@sha256:7ea4c9dcb2b97ff8ee80a67db3d44f98c8ffa0d191399197007d8459c1453041 AS operator_builder
22

33
ARG GO_LDFLAGS=""
44
ARG BUILD_TIME
@@ -16,7 +16,7 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
1616
go build -o slurm_operator ./cmd/
1717

1818
#######################################################################################################################
19-
FROM alpine:latest@sha256:21dc6063fd678b478f57c0e13f47560d0ea4eeba26dfc947b2a4f81f686b9f45 AS slurm-operator
19+
FROM alpine:latest@sha256:b97e2a89d0b9e4011bb88c02ddf01c544b8c781acf1f4d559e7c8f12f1047ac3 AS slurm-operator
2020

2121
COPY --from=operator_builder /operator/slurm_operator /usr/bin/
2222

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.16.1
1+
1.17.0

api/v1/slurmcluster_types.go

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"k8s.io/apimachinery/pkg/api/meta"
66
"k8s.io/apimachinery/pkg/api/resource"
77
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
8+
"nebius.ai/slurm-operator/internal/consts"
89

910
mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1"
1011
prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
@@ -22,12 +23,17 @@ type SlurmClusterSpec struct {
2223
// +kubebuilder:validation:Optional
2324
// +kubebuilder:default="gpu"
2425
ClusterType string `json:"clusterType,omitempty"`
25-
26-
// Pause defines whether to gracefully stop the cluster.
27-
// Setting it to false after cluster has been paused starts the cluster back
26+
// Maintenance defines the maintenance mode of the cluster.
27+
// It can have the following values:
28+
// - none: No maintenance is performed. The cluster operates normally.
29+
// - downscale: Scales down all components to 0.
30+
// - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the populateJail Kubernetes Job.
31+
// - skipPopulateJail: Skips the execution of the populateJail job during maintenance.
2832
//
2933
// +kubebuilder:validation:Optional
30-
Pause bool `json:"pause,omitempty"` // TODO cluster pausing/resuming
34+
// +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;skipPopulateJail
35+
// +kubebuilder:default="none"
36+
Maintenance *consts.MaintenanceMode `json:"maintenance,omitempty"`
3137

3238
// NCCLSettings
3339
// +kubebuilder:validation:Optional
@@ -77,8 +83,12 @@ type SlurmClusterSpec struct {
7783
// SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
7884
//
7985
// +kubebuilder:validation:Optional
80-
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "Verbose", maxJobCount: 10000, minJobAge: 86400}
86+
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
8187
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
88+
// Generate and set default AppArmor profile for the Slurm worker and login nodes. The Security Profiles Operator must be installed.
89+
//
90+
// +kubebuilder:default=false
91+
UseDefaultAppArmorProfile bool `json:"useDefaultAppArmorProfile,omitempty"`
8292
}
8393

8494
// SlurmConfig represents the Slurm configuration in slurm.conf
@@ -107,8 +117,8 @@ type SlurmConfig struct {
107117
// Additional parameters for the task plugin
108118
//
109119
// +kubebuilder:validation:Optional
110-
// +kubebuilder:default="Verbose"
111-
// +kubebuilder:validation:Pattern="^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$"
120+
// +kubebuilder:default=""
121+
// +kubebuilder:validation:Pattern="^(|((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+)$"
112122
TaskPluginParam *string `json:"taskPluginParam,omitempty"`
113123
// Keep N last jobs in controller memory
114124
//
@@ -140,7 +150,7 @@ type NCCLSettings struct {
140150

141151
// TopologyType defines the type of NCCL GPU topology
142152
//
143-
// +kubebuilder:validation:Enum="H100 GPU cluster";auto;custom
153+
// +kubebuilder:validation:Enum=auto;custom
144154
// +kubebuilder:validation:Optional
145155
// +kubebuilder:default="auto"
146156
TopologyType string `json:"topologyType,omitempty"`
@@ -227,7 +237,7 @@ type NCCLBenchmark struct {
227237
// FailedJobsHistoryLimit defines the number of failed finished jobs to retain
228238
//
229239
// +kubebuilder:validation:Optional
230-
// +kubebuilder:default=3
240+
// +kubebuilder:default=16
231241
FailedJobsHistoryLimit int32 `json:"failedJobsHistoryLimit,omitempty"`
232242

233243
// Image defines the nccl container image
@@ -585,6 +595,9 @@ type AccountingSlurmConf struct {
585595
// +kubebuilder:default=0
586596
PriorityWeightFairshare *int16 `json:"priorityWeightFairshare,omitempty"`
587597
// +kubebuilder:validation:Optional
598+
// +kubebuilder:default=0
599+
PriorityWeightQOS *int16 `json:"priorityWeightQOS,omitempty"`
600+
// +kubebuilder:validation:Optional
588601
PriorityWeightTRES *string `json:"priorityWeightTRES,omitempty"`
589602
}
590603

@@ -640,6 +653,11 @@ type SlurmNodeWorker struct {
640653
// +kubebuilder:validation:Optional
641654
SupervisordConfigMapRefName string `json:"supervisordConfigMapRefName,omitempty"`
642655

656+
// SSHDConfigMapRefName is the name of the config for SSHD, which runs in the slurmd container
657+
//
658+
// +kubebuilder:validation:Optional
659+
SSHDConfigMapRefName string `json:"sshdConfigMapRefName,omitempty"`
660+
643661
// Volumes represents the volume configurations for the worker node
644662
//
645663
// +kubebuilder:validation:Required
@@ -713,6 +731,11 @@ type SlurmNodeLogin struct {
713731
// +kubebuilder:validation:Optional
714732
SshdServiceAnnotations map[string]string `json:"sshdServiceAnnotations,omitempty"`
715733

734+
// SSHDConfigMapRefName is the name of the config for SSHD, which runs in the login container
735+
//
736+
// +kubebuilder:validation:Optional
737+
SSHDConfigMapRefName string `json:"sshdConfigMapRefName,omitempty"`
738+
716739
// SshRootPublicKeys represents the list of public authorized_keys for SSH connection to Slurm login nodes
717740
//
718741
// +kubebuilder:validation:Required
@@ -871,11 +894,30 @@ type NodeVolumeJailSubMount struct {
871894
// +kubebuilder:validation:Required
872895
MountPath string `json:"mountPath"`
873896

897+
// SubPath points to a specific entry inside the volume.
898+
// Corresponds to the subPath field in the K8s volumeMount structure.
899+
// See official docs for details: https://kubernetes.io/docs/concepts/storage/volumes/#using-subpath
900+
//
901+
// +kubebuilder:validation:Optional
902+
// +kubebuilder:default=""
903+
SubPath string `json:"subPath"`
904+
905+
// ReadOnly defines whether the mount point should be read-only
906+
//
907+
// +kubebuilder:validation:Optional
908+
// +kubebuilder:default=false
909+
ReadOnly bool `json:"readOnly"`
910+
874911
// VolumeSourceName defines the name of the volume source for the sub-mount.
875912
// Must correspond to the name of one of [VolumeSource]
876913
//
877-
// +kubebuilder:validation:Required
878-
VolumeSourceName string `json:"volumeSourceName"`
914+
// +kubebuilder:validation:Optional
915+
VolumeSourceName *string `json:"volumeSourceName"`
916+
917+
// VolumeClaimTemplateSpec defines the [corev1.PersistentVolumeClaim] template specification
918+
//
919+
// +kubebuilder:validation:Optional
920+
VolumeClaimTemplateSpec *corev1.PersistentVolumeClaimSpec `json:"volumeClaimTemplateSpec,omitempty"`
879921
}
880922

881923
type Telemetry struct {

api/v1/zz_generated.deepcopy.go

Lines changed: 27 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/main.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1"
4141
otelv1beta1 "github.com/open-telemetry/opentelemetry-operator/apis/v1beta1"
4242
prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
43+
apparmor "sigs.k8s.io/security-profiles-operator/api/apparmorprofile/v1alpha1"
4344

4445
slurmv1 "nebius.ai/slurm-operator/api/v1"
4546
"nebius.ai/slurm-operator/internal/check"
@@ -65,6 +66,9 @@ func init() {
6566
if check.IsMariaDbCRDInstalled() {
6667
utilruntime.Must(mariadbv1alpha1.AddToScheme(scheme))
6768
}
69+
if check.IsAppArmorCRDInstalled() {
70+
utilruntime.Must(apparmor.AddToScheme(scheme))
71+
}
6872

6973
utilruntime.Must(slurmv1.AddToScheme(scheme))
7074

0 commit comments

Comments
 (0)