Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.18.1
1.18.3
6 changes: 6 additions & 0 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ type SlurmConfig struct {
// +kubebuilder:validation:Optional
// +kubebuilder:default=86400
MinJobAge *int32 `json:"minJobAge,omitempty"`
// MessageTimeout specifies the permitted time for a round-trip communication to complete in seconds.
// See https://slurm.schedmd.com/slurm.conf.html#OPT_MessageTimeout.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=30
MessageTimeout *int32 `json:"messageTimeout,omitempty"`
}

type MPIConfig struct {
Expand Down
5 changes: 5 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1541,6 +1541,13 @@ spec:
description: Keep N last jobs in controller memory
format: int32
type: integer
messageTimeout:
default: 30
description: |-
MessageTimeout specifies the permitted time for a round-trip communication to complete in seconds.
See https://slurm.schedmd.com/slurm.conf.html#OPT_MessageTimeout.
format: int32
type: integer
minJobAge:
default: 86400
description: Don't remove jobs from controller memory after some
Expand Down
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ resources:
images:
- name: controller
newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator
newTag: 1.18.1
newTag: 1.18.3
2 changes: 1 addition & 1 deletion config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ spec:
value: "false"
- name: SLURM_OPERATOR_WATCH_NAMESPACES
value: "*"
image: controller:1.18.1
image: controller:1.18.3
imagePullPolicy: Always
name: manager
securityContext:
Expand Down
4 changes: 2 additions & 2 deletions helm/slurm-cluster-storage/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: helm-slurm-cluster-storage
description: A Helm chart for Kubernetes
type: application
version: "1.18.1"
appVersion: "1.18.1"
version: "1.18.3"
appVersion: "1.18.3"
4 changes: 2 additions & 2 deletions helm/slurm-cluster/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ apiVersion: v2
name: helm-slurm-cluster
description: A Helm chart for Kubernetes
type: application
version: "1.18.1"
appVersion: "1.18.1"
version: "1.18.3"
appVersion: "1.18.3"
kubeVersion: ">=1.29.0-0"
18 changes: 9 additions & 9 deletions helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -408,14 +408,14 @@ telemetry: {}
# otelCollectorPort: 8429

images:
slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.18.1-jammy-slurm24.05.5"
slurmrestd: "cr.eu-north1.nebius.cloud/soperator/slurmrestd:1.18.1-jammy-slurm24.05.5"
slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.18.1-jammy-slurm24.05.5"
sshd: "cr.eu-north1.nebius.cloud/soperator/login_sshd:1.18.1-jammy-slurm24.05.5"
munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.18.1-jammy-slurm24.05.5"
populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.18.1-jammy-slurm24.05.5"
ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.18.1-jammy-slurm24.05.5"
slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.18.1-jammy-slurm24.05.5"
exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.18.1-jammy-slurm24.05.5"
slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.18.3-jammy-slurm24.05.5"
slurmrestd: "cr.eu-north1.nebius.cloud/soperator/slurmrestd:1.18.3-jammy-slurm24.05.5"
slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.18.3-jammy-slurm24.05.5"
sshd: "cr.eu-north1.nebius.cloud/soperator/login_sshd:1.18.3-jammy-slurm24.05.5"
munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.18.3-jammy-slurm24.05.5"
populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.18.3-jammy-slurm24.05.5"
ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.18.3-jammy-slurm24.05.5"
slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.18.3-jammy-slurm24.05.5"
exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.18.3-jammy-slurm24.05.5"
mariaDB: "docker-registry1.mariadb.com/library/mariadb:11.4.3"
rebooter: "cr.eu-north1.nebius.cloud/soperator/rebooter:1.17.0"
4 changes: 2 additions & 2 deletions helm/soperator-crds/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ apiVersion: v2
name: helm-soperator-crds
description: A Helm chart for Kubernetes
type: application
version: 1.18.1
appVersion: "1.18.1"
version: 1.18.3
appVersion: "1.18.3"
kubeVersion: ">=1.29.0-0"
7 changes: 7 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,13 @@ spec:
description: Keep N last jobs in controller memory
format: int32
type: integer
messageTimeout:
default: 30
description: |-
MessageTimeout specifies the permitted time for a round-trip communication to complete in seconds.
See https://slurm.schedmd.com/slurm.conf.html#OPT_MessageTimeout.
format: int32
type: integer
minJobAge:
default: 86400
description: Don't remove jobs from controller memory after some
Expand Down
4 changes: 2 additions & 2 deletions helm/soperator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ apiVersion: v2
name: helm-soperator
description: A Helm chart for Kubernetes
type: application
version: 1.18.1
appVersion: "1.18.1"
version: 1.18.3
appVersion: "1.18.3"
kubeVersion: ">=1.29.0-0"
7 changes: 7 additions & 0 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,13 @@ spec:
description: Keep N last jobs in controller memory
format: int32
type: integer
messageTimeout:
default: 30
description: |-
MessageTimeout specifies the permitted time for a round-trip communication to complete in seconds.
See https://slurm.schedmd.com/slurm.conf.html#OPT_MessageTimeout.
format: int32
type: integer
minJobAge:
default: 86400
description: Don't remove jobs from controller memory after some
Expand Down
2 changes: 1 addition & 1 deletion helm/soperator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ controllerManager:
slurmOperatorWatchNamespaces: '*'
image:
repository: cr.eu-north1.nebius.cloud/soperator/slurm-operator
tag: 1.18.1
tag: 1.18.3
imagePullPolicy: Always
resources:
limits:
Expand Down
2 changes: 1 addition & 1 deletion images/common/scripts/complement_jail.sh
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,6 @@ pushd "${jaildir}"
# For $worker node only
if [ -n "$worker" ]; then
echo "Update linker cache inside the jail"
flock etc/complement_jail_ldconfig.lock -c "chroot \"${jaildir}\" /usr/sbin/ldconfig"
flock --nonblock etc/complement_jail_ldconfig.lock -c "chroot \"${jaildir}\" /usr/sbin/ldconfig" || true
fi
popd
2 changes: 1 addition & 1 deletion internal/check/maintanence.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package check
import "nebius.ai/slurm-operator/internal/consts"

func IsMaintenanceActive(maintenance *consts.MaintenanceMode) bool {
return maintenance != nil && *maintenance != consts.ModeNone
return maintenance != nil && *maintenance != consts.ModeNone && *maintenance != consts.ModeSkipPopulate
}

func IsModeDownscaleAndDeletePopulate(maintenance *consts.MaintenanceMode) bool {
Expand Down
2 changes: 1 addition & 1 deletion internal/consts/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
package consts

const (
VersionCR = "1.18.1"
VersionCR = "1.18.3"
)
5 changes: 4 additions & 1 deletion internal/controller/reconciler/k8s_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,10 @@ func (r *JobReconciler) patch(existing, desired client.Object) (client.Patch, er
patchImpl := func(dst, src *batchv1.Job) client.Patch {
res := client.MergeFrom(dst.DeepCopy())

dst.Spec.Template.Spec = src.Spec.Template.Spec
// dst.Spec.Template is immutable, so rather than copying the desired template onto the existing Job,
// we mutate only the fields we need.
dst.Spec.Parallelism = src.Spec.Parallelism
dst.Spec.Completions = src.Spec.Completions

return res
}
Expand Down
Loading