Skip to content

Commit 7067e09

Browse files
authored
Merge pull request #1131 from nebius/dev
Release 1.21.1
2 parents de52738 + 860c59f commit 7067e09

File tree

67 files changed

+864
-299
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+864
-299
lines changed

.github/workflows/e2e_test.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ name: E2E test soperator
22

33
on:
44
schedule:
5-
- cron: '0 */2 * * *'
5+
# Every hour
6+
- cron: '0 */1 * * *'
67
workflow_dispatch:
78
inputs:
89
terraform_repo:
@@ -51,7 +52,7 @@ jobs:
5152

5253
steps:
5354
- name: Harden Runner
54-
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
55+
uses: step-security/harden-runner@6c439dc8bdf85cadbbce9ed30d1c7b959517bc49 # v2.12.2
5556
with:
5657
egress-policy: audit
5758

.github/workflows/github_release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ jobs:
9595
{
9696
"key": "docs",
9797
"title": "## 📔Docs",
98-
"labels": ["doc", "docs"]
98+
"labels": ["doc", "docs", "documentation"]
9999
},
100100
{
101101
"key": "other",

.github/workflows/gpubench_only.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717

1818
steps:
1919
- name: Harden Runner
20-
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
20+
uses: step-security/harden-runner@6c439dc8bdf85cadbbce9ed30d1c7b959517bc49 # v2.12.2
2121
with:
2222
egress-policy: audit
2323

.github/workflows/one_job.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ on:
55
branches:
66
- main
77
- dev
8+
tags:
9+
- 'build**' # Trigger on tags starting with "build"
810
paths-ignore:
911
- 'docs/**'
1012
- 'CODEOWNERS'
@@ -40,7 +42,7 @@ jobs:
4042

4143
steps:
4244
- name: Harden Runner
43-
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
45+
uses: step-security/harden-runner@6c439dc8bdf85cadbbce9ed30d1c7b959517bc49 # v2.12.2
4446
with:
4547
egress-policy: audit
4648

@@ -98,7 +100,7 @@ jobs:
98100
runner: ARM64
99101
steps:
100102
- name: Harden Runner
101-
uses: step-security/harden-runner@002fdce3c6a235733a90a27c80493a3241e56863 # v2.12.1
103+
uses: step-security/harden-runner@6c439dc8bdf85cadbbce9ed30d1c7b959517bc49 # v2.12.2
102104
with:
103105
egress-policy: audit
104106

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.21.0
1+
1.21.1

api/v1/slurmcluster_types.go

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ type SlurmClusterSpec struct {
8585
// SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
8686
//
8787
// +kubebuilder:validation:Optional
88-
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", epilog: "", prolog: "", taskPluginParam: "Autobind=Cores", maxJobCount: 10000, minJobAge: 86400}
88+
// +kubebuilder:default={defMemPerNode: 1048576, defCpuPerGPU: 4, completeWait: 5, epilog: "", prolog: "", taskPluginParam: "Autobind=Cores", maxJobCount: 10000, minJobAge: 86400, messageTimeout: 60}
8989
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
9090

9191
// CustomSlurmConfig represents the raw Slurm configuration from slurm.conf.
@@ -105,7 +105,7 @@ type SlurmClusterSpec struct {
105105
// PlugStackConfig represents the Plugin stack configurations in `plugstack.conf`.
106106
//
107107
// +kubebuilder:validation:Optional
108-
// +kubebuilder:default={ pyxis: { required: true, containerImageSave: "/var/cache/enroot-container-images/" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } }
108+
// +kubebuilder:default={ pyxis: { required: true, containerImageSave: "/var/cache/enroot-container-images/" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/%h/nccl_logs" } }
109109
PlugStackConfig PlugStackConfig `json:"plugStackConfig,omitempty"`
110110

111111
// SlurmTopologyConfigMapRefName is the name of the slurm topology config.
@@ -141,24 +141,18 @@ type SlurmConfig struct {
141141
// Default real memory size available per allocated node in mebibytes.
142142
//
143143
// +kubebuilder:validation:Optional
144-
// +kubebuilder:default=1228800
144+
// +kubebuilder:default=1048576
145145
DefMemPerNode *int32 `json:"defMemPerNode,omitempty"`
146146
// Default count of CPUs allocated per allocated GPU
147147
//
148148
// +kubebuilder:validation:Optional
149-
// +kubebuilder:default=16
149+
// +kubebuilder:default=4
150150
DefCpuPerGPU *int32 `json:"defCpuPerGPU,omitempty"`
151151
// The time to wait, in seconds, when any job is in the COMPLETING state before any additional jobs are scheduled.
152152
//
153153
// +kubebuilder:validation:Optional
154154
// +kubebuilder:default=5
155155
CompleteWait *int32 `json:"completeWait,omitempty"`
156-
// Defines specific subsystems which should provide more detailed event logging.
157-
//
158-
// +kubebuilder:validation:Optional
159-
// +kubebuilder:default="Priority,Script,SelectType,Steps"
160-
// +kubebuilder:validation:Pattern="^((Accrue|Agent|AuditRPCs|Backfill|BackfillMap|BurstBuffer|Cgroup|ConMgr|CPU_Bind|CpuFrequency|Data|DBD_Agent|Dependency|Elasticsearch|Energy|Federation|FrontEnd|Gres|Hetjob|Gang|GLOB_SILENCE|JobAccountGather|JobComp|JobContainer|License|Network|NetworkRaw|NodeFeatures|NO_CONF_HASH|Power|Priority|Profile|Protocol|Reservation|Route|Script|SelectType|Steps|Switch|TLS|TraceJobs|Triggers)(,)?)+$"
161-
DebugFlags *string `json:"debugFlags,omitempty"`
162156
// Defines specific file to run the epilog when job ends. Default value is no epilog
163157
//
164158
// +kubebuilder:validation:Optional
@@ -197,7 +191,7 @@ type SlurmConfig struct {
197191
TopologyPlugin string `json:"topologyPlugin,omitempty"`
198192
// TopologyParam is list of comma-separated options identifying network topology options.
199193
//
200-
// +kubebuilder:validation:Optional
194+
// +kubebuilder:default=topology/tree
201195
TopologyParam string `json:"topologyParam,omitempty"`
202196
}
203197

@@ -221,7 +215,7 @@ type PlugStackConfig struct {
221215
// NcclDebug represents the 'NCCL Debug' SPANK plugin configuration.
222216
//
223217
// +kubebuilder:validation:Optional
224-
// +kubebuilder:default={ required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" }
218+
// +kubebuilder:default={ required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/%h/nccl_logs" }
225219
NcclDebug PluginConfigNcclDebug `json:"ncclDebug,omitempty"`
226220

227221
// PluginConfigCustom represents a configuration of custom SPANK plugins.
@@ -277,7 +271,7 @@ type PluginConfigNcclDebug struct {
277271

278272
// OutputToFile defines whether to additionally redirect `NCCL_DEBUG` outputs to the output file.
279273
// Output filename will have the following format:
280-
// <WORKER_NAME>.<JOB_ID>.<STEP_ID>.out
274+
// <JOB_ID>.<STEP_ID>.out
281275
//
282276
// +kubebuilder:validation:Optional
283277
// +kubebuilder:default=true
@@ -298,7 +292,7 @@ type PluginConfigNcclDebug struct {
298292
// If the path does not exist, it will be created by the plugin.
299293
//
300294
// +kubebuilder:validation:Optional
301-
// +kubebuilder:default="/opt/soperator-outputs/nccl_logs"
295+
// +kubebuilder:default="/opt/soperator-outputs/%h/nccl_logs"
302296
OutputDirectory string `json:"outputDirectory,omitempty"`
303297
}
304298

@@ -720,10 +714,35 @@ type ExternalDB struct {
720714
//
721715
// +kubebuilder:validation:Optional
722716
User string `json:"user"`
723-
// SecretRef defines the reference to the secret with the password key for the external database
717+
// PasswordSecretKeyRef defines the reference to the secret with the password key for the external database.
718+
// Either this or tls.clientCertSecretRef must be provided as client credentials.
724719
//
725720
// +kubebuilder:validation:Optional
726721
PasswordSecretKeyRef PasswordSecretKeyRef `json:"passwordSecretKeyRef"`
722+
// TLS provides the configuration required to establish TLS connection with the external MariaDB.
723+
//
724+
// +kubebuilder:validation:Optional
725+
TLS ExternalDBTLSConfig `json:"tls,omitempty"`
726+
// StorageParameters defines the list of additional parameters to set in slurmdbd.conf's StorageParameters.
727+
// Some values here may be overridden by TLS configuration
728+
//
729+
// +kubebuilder:validation:Optional
730+
StorageParameters map[string]string `json:"storageParameters,omitempty"`
731+
}
732+
733+
type ExternalDBTLSConfig struct {
734+
// ServerCASecretRef defines the reference to a Secret containing the MariaDB server CA certificates.
735+
// The secret should contain a 'ca.crt' key.
736+
// If set, it overrides SSL_CA value in storageParameters
737+
//
738+
// +kubebuilder:validation:Optional
739+
ServerCASecretRef string `json:"serverCASecretRef,omitempty"`
740+
// ClientCertSecretRef defines the reference to a Kubernetes TLS Secret (with tls.crt and tls.key files).
741+
// Either this or passwordSecretKeyRef must be provided as client credentials.
742+
// If set, it overrides SSL_CERT and SSL_KEY values in storageParameters
743+
//
744+
// +kubebuilder:validation:Optional
745+
ClientCertSecretRef string `json:"clientCertSecretRef,omitempty"`
727746
}
728747

729748
type PasswordSecretKeyRef struct {

api/v1/zz_generated.deepcopy.go

Lines changed: 24 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/slurm.nebius.ai_slurmclusters.yaml

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,7 +1292,7 @@ spec:
12921292
ncclDebug:
12931293
enabled: false
12941294
logLevel: INFO
1295-
outputDirectory: /opt/soperator-outputs/nccl_logs
1295+
outputDirectory: /opt/soperator-outputs/%h/nccl_logs
12961296
outputToFile: true
12971297
outputToStdOut: false
12981298
required: false
@@ -1336,7 +1336,7 @@ spec:
13361336
default:
13371337
enabled: false
13381338
logLevel: INFO
1339-
outputDirectory: /opt/soperator-outputs/nccl_logs
1339+
outputDirectory: /opt/soperator-outputs/%h/nccl_logs
13401340
outputToFile: true
13411341
outputToStdOut: false
13421342
required: false
@@ -1357,7 +1357,7 @@ spec:
13571357
- TRACE
13581358
type: string
13591359
outputDirectory:
1360-
default: /opt/soperator-outputs/nccl_logs
1360+
default: /opt/soperator-outputs/%h/nccl_logs
13611361
description: |-
13621362
OutputDirectory defines a directory path where OutputToFile has to be created.
13631363

@@ -1371,7 +1371,7 @@ spec:
13711371
description: |-
13721372
OutputToFile defines whether to additionally redirect `NCCL_DEBUG` outputs to the output file.
13731373
Output filename will have the following format:
1374-
<WORKER_NAME>.<JOB_ID>.<STEP_ID>.out
1374+
<JOB_ID>.<STEP_ID>.out
13751375
type: boolean
13761376
outputToStdOut:
13771377
default: false
@@ -3183,11 +3183,11 @@ spec:
31833183
slurmConfig:
31843184
default:
31853185
completeWait: 5
3186-
debugFlags: Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs
3187-
defCpuPerGPU: 16
3188-
defMemPerNode: 1228800
3186+
defCpuPerGPU: 4
3187+
defMemPerNode: 1048576
31893188
epilog: ""
31903189
maxJobCount: 10000
3190+
messageTimeout: 60
31913191
minJobAge: 86400
31923192
prolog: ""
31933193
taskPluginParam: Autobind=Cores
@@ -3200,19 +3200,13 @@ spec:
32003200
the COMPLETING state before any additional jobs are scheduled.
32013201
format: int32
32023202
type: integer
3203-
debugFlags:
3204-
default: Priority,Script,SelectType,Steps
3205-
description: Defines specific subsystems which should provide
3206-
more detailed event logging.
3207-
pattern: ^((Accrue|Agent|AuditRPCs|Backfill|BackfillMap|BurstBuffer|Cgroup|ConMgr|CPU_Bind|CpuFrequency|Data|DBD_Agent|Dependency|Elasticsearch|Energy|Federation|FrontEnd|Gres|Hetjob|Gang|GLOB_SILENCE|JobAccountGather|JobComp|JobContainer|License|Network|NetworkRaw|NodeFeatures|NO_CONF_HASH|Power|Priority|Profile|Protocol|Reservation|Route|Script|SelectType|Steps|Switch|TLS|TraceJobs|Triggers)(,)?)+$
3208-
type: string
32093203
defCpuPerGPU:
3210-
default: 16
3204+
default: 4
32113205
description: Default count of CPUs allocated per allocated GPU
32123206
format: int32
32133207
type: integer
32143208
defMemPerNode:
3215-
default: 1228800
3209+
default: 1048576
32163210
description: Default real memory size available per allocated
32173211
node in mebibytes.
32183212
format: int32
@@ -3250,6 +3244,7 @@ spec:
32503244
description: Additional parameters for the task plugin
32513245
type: string
32523246
topologyParam:
3247+
default: topology/tree
32533248
description: TopologyParam is list of comma-separated options
32543249
identifying network topology options.
32553250
type: string
@@ -4723,8 +4718,9 @@ spec:
47234718
database
47244719
type: string
47254720
passwordSecretKeyRef:
4726-
description: SecretRef defines the reference to the secret
4727-
with the password key for the external database
4721+
description: |-
4722+
PasswordSecretKeyRef defines the reference to the secret with the password key for the external database.
4723+
Either this or tls.clientCertSecretRef must be provided as client credentials.
47284724
properties:
47294725
key:
47304726
description: Key defines the key of password in the
@@ -4741,6 +4737,30 @@ spec:
47414737
database
47424738
format: int32
47434739
type: integer
4740+
storageParameters:
4741+
additionalProperties:
4742+
type: string
4743+
description: |-
4744+
StorageParameters defines the list of additional parameters to set in slurmdbd.conf's StorageParameters.
4745+
Some values here may be overridden by TLS configuration
4746+
type: object
4747+
tls:
4748+
description: TLS provides the configuration required to
4749+
establish TLS connection with the external MariaDB.
4750+
properties:
4751+
clientCertSecretRef:
4752+
description: |-
4753+
ClientCertSecretRef defines the reference to a Kubernetes TLS Secret (with tls.crt and tls.key files).
4754+
Either this or passwordSecretKeyRef must be provided as client credentials.
4755+
If set, it overrides SSL_CERT and SSL_KEY values in storageParameters
4756+
type: string
4757+
serverCASecretRef:
4758+
description: |-
4759+
ServerCASecretRef defines the reference to a Secret containing the MariaDB server CA certificates.
4760+
The secret should contain a 'ca.crt' key.
4761+
If set, it overrides SSL_CA value in storageParameters
4762+
type: string
4763+
type: object
47444764
user:
47454765
description: Key defines the key of username and password
47464766
in the secret

config/manager/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ resources:
33
images:
44
- name: controller
55
newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator
6-
newTag: 1.21.0
6+
newTag: 1.21.1

config/manager/manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ spec:
8787
value: "false"
8888
- name: SLURM_OPERATOR_WATCH_NAMESPACES
8989
value: "*"
90-
image: controller:1.21.0
90+
image: controller:1.21.1
9191
imagePullPolicy: Always
9292
name: manager
9393
securityContext:

0 commit comments

Comments
 (0)