Skip to content

Commit 33222a4

Browse files
authored
Merge pull request #1171 from nebius/dev
Release soperator 1.21.5
2 parents 081d6ee + bfd8ff7 commit 33222a4

File tree

38 files changed

+255
-300
lines changed

38 files changed

+255
-300
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.21.4
1+
1.21.5

api/v1/slurmcluster_types.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ type SlurmClusterSpec struct {
105105
// PlugStackConfig represents the Plugin stack configurations in `plugstack.conf`.
106106
//
107107
// +kubebuilder:validation:Optional
108-
// +kubebuilder:default={ pyxis: { required: true, containerImageSave: "/var/cache/enroot-container-images/" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/%h/nccl_logs" } }
108+
// +kubebuilder:default={ pyxis: { required: true, containerImageSave: "/var/cache/enroot-container-images/" }, ncclDebug: { required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" } }
109109
PlugStackConfig PlugStackConfig `json:"plugStackConfig,omitempty"`
110110

111111
// SlurmTopologyConfigMapRefName is the name of the slurm topology config.
@@ -215,7 +215,7 @@ type PlugStackConfig struct {
215215
// NcclDebug represents the 'NCCL Debug' SPANK plugin configuration.
216216
//
217217
// +kubebuilder:validation:Optional
218-
// +kubebuilder:default={ required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/%h/nccl_logs" }
218+
// +kubebuilder:default={ required: false, enabled: false, logLevel: "INFO", outputToFile: true, outputToStdOut: false, outputDirectory: "/opt/soperator-outputs/nccl_logs" }
219219
NcclDebug PluginConfigNcclDebug `json:"ncclDebug,omitempty"`
220220

221221
// PluginConfigCustom represents a configuration of custom SPANK plugins.
@@ -271,7 +271,7 @@ type PluginConfigNcclDebug struct {
271271

272272
// OutputToFile defines whether to additionally redirect `NCCL_DEBUG` outputs to the output file.
273273
// Output filename will have the following format:
274-
// <JOB_ID>.<STEP_ID>.out
274+
// <WORKER_NAME>.<JOB_ID>.<STEP_ID>.out
275275
//
276276
// +kubebuilder:validation:Optional
277277
// +kubebuilder:default=true
@@ -292,7 +292,7 @@ type PluginConfigNcclDebug struct {
292292
// If the path does not exist, it will be created by the plugin.
293293
//
294294
// +kubebuilder:validation:Optional
295-
// +kubebuilder:default="/opt/soperator-outputs/%h/nccl_logs"
295+
// +kubebuilder:default="/opt/soperator-outputs/nccl_logs"
296296
OutputDirectory string `json:"outputDirectory,omitempty"`
297297
}
298298

config/crd/bases/slurm.nebius.ai_slurmclusters.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,7 +1292,7 @@ spec:
12921292
ncclDebug:
12931293
enabled: false
12941294
logLevel: INFO
1295-
outputDirectory: /opt/soperator-outputs/%h/nccl_logs
1295+
outputDirectory: /opt/soperator-outputs/nccl_logs
12961296
outputToFile: true
12971297
outputToStdOut: false
12981298
required: false
@@ -1336,7 +1336,7 @@ spec:
13361336
default:
13371337
enabled: false
13381338
logLevel: INFO
1339-
outputDirectory: /opt/soperator-outputs/%h/nccl_logs
1339+
outputDirectory: /opt/soperator-outputs/nccl_logs
13401340
outputToFile: true
13411341
outputToStdOut: false
13421342
required: false
@@ -1357,7 +1357,7 @@ spec:
13571357
- TRACE
13581358
type: string
13591359
outputDirectory:
1360-
default: /opt/soperator-outputs/%h/nccl_logs
1360+
default: /opt/soperator-outputs/nccl_logs
13611361
description: |-
13621362
OutputDirectory defines a directory path where OutputToFile has to be created.
13631363

@@ -1371,7 +1371,7 @@ spec:
13711371
description: |-
13721372
OutputToFile defines whether to additionally redirect `NCCL_DEBUG` outputs to the output file.
13731373
Output filename will have the following format:
1374-
<JOB_ID>.<STEP_ID>.out
1374+
<WORKER_NAME>.<JOB_ID>.<STEP_ID>.out
13751375
type: boolean
13761376
outputToStdOut:
13771377
default: false

config/manager/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ resources:
33
images:
44
- name: controller
55
newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator
6-
newTag: 1.21.4
6+
newTag: 1.21.5

config/manager/manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ spec:
8787
value: "false"
8888
- name: SLURM_OPERATOR_WATCH_NAMESPACES
8989
value: "*"
90-
image: controller:1.21.4
90+
image: controller:1.21.5
9191
imagePullPolicy: Always
9292
name: manager
9393
securityContext:

config/soperatorchecks/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ resources:
44
images:
55
- name: controller
66
newName: cr.eu-north1.nebius.cloud/soperator/soperatorchecks
7-
newTag: 1.21.4
7+
newTag: 1.21.5
88
patches:
99
# Protect the /metrics endpoint by putting it behind auth.
1010
# If you want your controller-manager to expose the /metrics

docs/features.md

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -113,56 +113,50 @@ At the moment, the following information is gathered and can be viewed by users:
113113

114114
### Centralized Logging Scheme
115115

116-
Soperator implements a centralized logging system that automatically collects and categorizes Slurm workload outputs. Logs are organized by worker node to optimize filesystem performance and processed by OpenTelemetry collectors for centralized analysis.
116+
Soperator implements a centralized logging system that automatically collects and categorizes Slurm workload outputs. Logs are organized by type and processed by OpenTelemetry collectors for centralized analysis.
117117

118118
#### Directory Structure
119119

120-
Logs are separated by worker node to prevent filesystem contention on the shared jail storage:
120+
Logs are organized in a flat structure by log type:
121121

122122
```
123123
/opt/soperator-outputs/
124-
├── worker-0/
125-
│ ├── nccl_logs/ # NCCL debug outputs from worker-0
126-
│ ├── slurm_jobs/ # Slurm job outputs from worker-0
127-
│ └── slurm_scripts/ # Script outputs (prolog, epilog, health checks) from worker-0
128-
├── worker-1/
129-
│ ├── nccl_logs/
130-
│ ├── slurm_jobs/
131-
│ └── slurm_scripts/
132-
└── ...
124+
├── nccl_logs/ # NCCL debug outputs from all workers
125+
├── slurm_jobs/ # Slurm job outputs from all workers
126+
└── slurm_scripts/ # Script outputs (prolog, epilog, health checks) from all workers
133127
```
134128

135129
#### Logging Schema
136130

137-
Log files follow simplified naming patterns without worker prefixes (since worker identity is determined by directory structure):
131+
Log files include the worker name at the beginning of the filename for easy identification:
138132

139133
**NCCL Logs:**
140134
```
141-
job_id.job_step_id.out
142-
Example: 12345.67890.out (in /opt/soperator-outputs/worker-0/nccl_logs/)
135+
worker_name.job_id.job_step_id.out
136+
Example: worker-0.12345.67890.out
143137
```
144138

145139
**Slurm Jobs:**
146140
```
147-
job_name.job_id[.array_id].out
141+
worker_name.job_name.job_id[.array_id].out
148142
Examples:
149-
- benchmark.12345.out
150-
- training.12345.1.out (array job)
143+
- worker-1.benchmark.12345.out
144+
- worker-2.training.12345.1.out (array job)
151145
```
152146

153147
**Slurm Scripts:**
154148
```
155-
script_name[.context].out
149+
worker_name.script_name.context.out
156150
Examples:
157-
- health_checker.prolog.out
158-
- cleanup_enroot.epilog.out
151+
- worker-0.health_checker.prolog.out
152+
- worker-3.cleanup_enroot.epilog.out
159153
```
160154

161155
#### Generated Labels
162156

163-
The logging system automatically extracts metadata and creates the following labels:
157+
The logging system automatically extracts metadata from filenames and creates the following labels:
164158

165-
- `worker_name`: Worker pod identifier extracted from directory path
159+
- `slurm_node_name`: Slurm worker node identifier extracted from filename (e.g., "worker-0", "worker-1")
166160
- `log_type`: Category (nccl_logs, slurm_jobs, slurm_scripts)
167161
- `job_id`, `job_step_id`: For NCCL logs
168162
- `job_name`, `job_array_id`: For Slurm job logs

fluxcd/environment/nebius-cloud/dev/bootstrap/flux-kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ spec:
1111
name: nebius-cloud
1212
postBuild:
1313
substitute:
14-
soperator_version: 1.21.4
14+
soperator_version: 1.21.5
1515
path: "./fluxcd/environment/nebius-cloud/dev"
1616
prune: true
1717
timeout: 1m

fluxcd/environment/nebius-cloud/prod/bootstrap/flux-kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ spec:
1010
name: nebius-cloud
1111
postBuild:
1212
substitute:
13-
soperator_version: 1.21.4
13+
soperator_version: 1.21.5
1414
path: "./fluxcd/environment/nebius-cloud/prod"
1515
prune: false
1616
timeout: 1m

helm/nodeconfigurator/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ type: application
1313
# This is the chart version. This version number should be incremented each time you make changes
1414
# to the chart and its templates, including the app version.
1515
# Versions are expected to follow Semantic Versioning (https://semver.org/)
16-
version: 1.21.4
16+
version: 1.21.5
1717
# This is the version number of the application being deployed. This version number should be
1818
# incremented each time you make changes to the application. Versions are not expected to
1919
# follow Semantic Versioning. They should reflect the version the application is using.
2020
# It is recommended to use it with quotes.
21-
appVersion: "1.21.4"
21+
appVersion: "1.21.5"

0 commit comments

Comments
 (0)