Skip to content

Commit 0e64cb0

Browse files
authored
Merge pull request #554 from nebius/dev
Soperator release 1.19.0
2 parents dfe10cf + 46ea89e commit 0e64cb0

File tree

184 files changed

+36370
-8158
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

184 files changed

+36370
-8158
lines changed

.codespellrc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[codespell]
2+
# Ref: https://github.com/codespell-project/codespell#using-a-config-file
3+
skip = .git*,*.svg,go.sum,.codespellrc
4+
check-hidden = true
5+
ignore-words-list = notin

.github/workflows/github_release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ jobs:
6363

6464
- name: Generate changelog
6565
id: changelog
66-
uses: mikepenz/release-changelog-builder-action@a57c1b7c90e56d9c8b26a6ed5d1eed159369e117 # v5
66+
uses: mikepenz/release-changelog-builder-action@9a903f73f4cccdc8241077da63578b3faba06403 # v5
6767
with:
6868
mode: "PR"
6969
fromTag: ${{ needs.tag.outputs.previous-tag }}

.github/workflows/gpubench_only.yml

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020

2121
steps:
2222
- name: Harden Runner
23-
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4
23+
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
2424
with:
2525
egress-policy: audit
2626

@@ -43,7 +43,7 @@ jobs:
4343

4444
steps:
4545
- name: Harden Runner
46-
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4
46+
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
4747
with:
4848
egress-policy: audit
4949

@@ -55,35 +55,11 @@ jobs:
5555
with:
5656
go-version-file: 'go.mod'
5757

58-
- name: Debug vars
59-
run: |
60-
echo "UNSTABLE - is ${{ needs.pre-build.outputs.unstable }}"
61-
make get-version UNSTABLE=${{ needs.pre-build.outputs.unstable }}
62-
6358
- name: Check if version synced
6459
run: make test-version-sync
6560

66-
- name: Set up Docker Buildx
67-
uses: docker/setup-buildx-action@f7ce87c1d6bead3e36075b2ce75da1f6cc28aaca # v3.9.0
68-
69-
- name: Log in to the Github Container registry
70-
uses: docker/login-action@327cd5a69de6c009b9ce71bce8395f28e651bf99
71-
with:
72-
registry: ghcr.io
73-
username: ${{ github.actor }}
74-
password: ${{ secrets.GITHUB_TOKEN }}
75-
7661
- name: Run gpu bench tests
7762
run: |
78-
UNSTABLE=${{ needs.pre-build.outputs.unstable }}
79-
IMAGE_VERSION=$(make get-image-version UNSTABLE=${UNSTABLE})
80-
VERSION=$(make get-version UNSTABLE=${UNSTABLE})
81-
OPERATOR_IMAGE_TAG=$(make get-operator-tag-version UNSTABLE=${UNSTABLE})
82-
8363
echo "Running gpubench tests"
8464
cd ./images/worker/gpubench/
8565
go test
86-
cd -
87-
88-
echo "Building tarball for jail"
89-
make docker-build UNSTABLE="${UNSTABLE}" IMAGE_NAME=jail DOCKERFILE=jail/jail.dockerfile DOCKER_OUTPUT="--output type=tar,dest=jail_rootfs.tar"

.github/workflows/one_job.yml

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828

2929
steps:
3030
- name: Harden Runner
31-
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4
31+
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
3232
with:
3333
egress-policy: audit
3434

@@ -52,7 +52,7 @@ jobs:
5252

5353
steps:
5454
- name: Harden Runner
55-
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.4
55+
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
5656
with:
5757
egress-policy: audit
5858

@@ -69,11 +69,22 @@ jobs:
6969
echo "UNSTABLE - is ${{ needs.pre-build.outputs.unstable }}"
7070
make get-version UNSTABLE=${{ needs.pre-build.outputs.unstable }}
7171
72-
- name: Check if version synced
73-
run: make test-version-sync
72+
- name: Run make sync-version-from-scratch
73+
run: |
74+
make kustomize helmify yq
75+
make sync-version-from-scratch
76+
77+
- name: Check for uncommitted changes
78+
run: |
79+
if [[ -n "$(git status --porcelain)" ]]; then
80+
echo "❌ Uncommitted changes detected after make sync-version-from-scratch"
81+
git diff
82+
exit 1
83+
fi
84+
shell: bash
7485

7586
- name: Set up Docker Buildx
76-
uses: docker/setup-buildx-action@f7ce87c1d6bead3e36075b2ce75da1f6cc28aaca # v3.9.0
87+
uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
7788

7889
- name: Log in to the Github Container registry
7990
uses: docker/login-action@327cd5a69de6c009b9ce71bce8395f28e651bf99
@@ -133,6 +144,11 @@ jobs:
133144
make docker-build UNSTABLE="${UNSTABLE}" IMAGE_NAME=populate_jail DOCKERFILE=populate_jail/populate_jail.dockerfile
134145
make docker-push UNSTABLE="${UNSTABLE}" IMAGE_NAME=populate_jail
135146
147+
echo "Building image of the soperatorchecks"
148+
make docker-build UNSTABLE="${UNSTABLE}" IMAGE_NAME=soperatorchecks DOCKERFILE=soperatorchecks.dockerfile IMAGE_VERSION="$OPERATOR_IMAGE_TAG"
149+
echo "Pushing image of the soperatorchecks"
150+
make docker-push UNSTABLE="${UNSTABLE}" IMAGE_NAME=soperatorchecks IMAGE_VERSION="$OPERATOR_IMAGE_TAG"
151+
136152
echo "Building image of the operator"
137153
make docker-build UNSTABLE="${UNSTABLE}" IMAGE_NAME=slurm-operator DOCKERFILE=Dockerfile IMAGE_VERSION="$OPERATOR_IMAGE_TAG"
138154
echo "Pushing image of the operator"

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ bin
55
cover.out
66
release_all.sh
77
upload_to_build_agent.sh
8+
.vscode

.golangci.yaml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
linters:
2+
presets:
3+
- bugs
4+
- complexity
5+
- error
6+
- format
7+
- import
8+
- metalinter
9+
- module
10+
- performance
11+
- style
12+
- test
13+
- unused
14+
disable:
15+
- depguard
16+
- ineffassign
17+
- funlen
18+
- forcetypeassert
19+
- testpackage
20+
- tagliatelle
21+
- godot
22+
- misspell
23+
- goconst
24+
- dupl
25+
- gci
26+
- whitespace
27+
- gochecknoinits
28+
- gocognit
29+
- nestif
30+
- gocyclo
31+
- maintidx
32+
- godox
33+
- gofumpt
34+
- gomnd
35+
- lll
36+
- nlreturn
37+
- nolintlint
38+
- wsl
39+
- prealloc
40+
fast: true
41+
42+
output:
43+
formats:
44+
- format: colored-line-number
45+
46+
run:
47+
relative-path-mode: gomod
48+
allow-parallel-runners: true
49+
allow-serial-runners: true

.mockery.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
with-expecter: true
2+
issue-845-fix: True
3+
resolve-type-alias: False
4+
packages:
5+
nebius.ai/slurm-operator/internal/slurmapi:
6+
interfaces:
7+
Client:
8+
config:
9+
dir: "{{.InterfaceDirRelative}}/fake"
10+
outpkg: "fake"
11+
filename: "mock_{{ .InterfaceName | camelcase | firstLower }}.go"

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
1616
go build -o slurm_operator ./cmd/
1717

1818
#######################################################################################################################
19-
FROM alpine:latest@sha256:56fa17d2a7e7f168a043a2712e63aed1f8543aeafdcee47c58dcffe38ed51099 AS slurm-operator
19+
FROM alpine:latest@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c AS slurm-operator
2020

2121
COPY --from=operator_builder /operator/slurm_operator /usr/bin/
2222

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@
186186
same "printed page" as the copyright notice for easier
187187
identification within third-party archives.
188188

189-
Copyright 2024 Nebius B.V.
189+
Copyright [yyyy] [name of copyright owner]
190190

191191
Licensed under the Apache License, Version 2.0 (the "License");
192192
you may not use this file except in compliance with the License.

Makefile

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@ SHELL = /usr/bin/env bash -o pipefail
1818
.SHELLFLAGS = -ec
1919

2020
# Limit the scope of generation otherwise it will try to generate configs for non-controller code
21-
GENPATH = "./api/v1;"
21+
GENPATH = "./api/v1;./api/v1alpha1;"
2222

2323
CHART_PATH = helm
2424
CHART_OPERATOR_PATH = $(CHART_PATH)/soperator
25+
CHART_SOPERATORCHECKS_PATH = $(CHART_PATH)/soperatorchecks
26+
CHART_NODECONFIGURATOR_PATH = $(CHART_PATH)/nodeconfigurator
2527
CHART_OPERATOR_CRDS_PATH = $(CHART_PATH)/soperator-crds
2628
CHART_CLUSTER_PATH = $(CHART_PATH)/slurm-cluster
2729
CHART_STORAGE_PATH = $(CHART_PATH)/slurm-cluster-storage
@@ -79,8 +81,9 @@ help: ## Display this help.
7981
.PHONY: manifests
8082
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
8183
$(CONTROLLER_GEN) crd webhook paths=$(GENPATH) output:crd:artifacts:config=config/crd/bases
82-
$(CONTROLLER_GEN) rbac:roleName=manager-role paths="./internal/controller/clustercontroller/..." output:artifacts:config=config/rbac/clustercontroller/
83-
$(CONTROLLER_GEN) rbac:roleName=node-configurator-role paths="./internal/rebooter/..." output:artifacts:config=config/rbac/node-configurator/
84+
$(CONTROLLER_GEN) rbac:roleName=manager-role paths="./internal/controller/..." output:artifacts:config=config/rbac/clustercontroller/
85+
$(CONTROLLER_GEN) rbac:roleName=nodeconfigurator-role paths="./internal/rebooter/..." output:artifacts:config=config/rbac/nodeconfigurator/
86+
$(CONTROLLER_GEN) rbac:roleName=soperator-checks-role paths="./internal/soperatorchecks/..." output:artifacts:config=config/rbac/soperatorchecks/
8487
.PHONY: generate
8588
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
8689
$(CONTROLLER_GEN) object paths=$(GENPATH)
@@ -107,11 +110,19 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
107110

108111
.PHONY: helm
109112
helm: generate manifests ## Update soperator Helm chart
110-
$(KUSTOMIZE) build config/crd > $(CHART_OPERATOR_PATH)/crds/slurmcluster-crd.yaml
111-
$(KUSTOMIZE) build config/crd > $(CHART_OPERATOR_CRDS_PATH)/templates/slurmcluster-crd.yaml
113+
$(KUSTOMIZE) build config/crd/bases > $(CHART_OPERATOR_PATH)/crds/slurmcluster-crd.yaml
114+
$(KUSTOMIZE) build config/crd/bases > $(CHART_OPERATOR_CRDS_PATH)/templates/slurmcluster-crd.yaml
115+
# Because of helmify rewrite a file we need to make backup of values.yaml
112116
mv $(CHART_OPERATOR_PATH)/values.yaml $(CHART_OPERATOR_PATH)/values.yaml.bak
113-
$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/rbac/soperator-helm | $(HELMIFY) $(CHART_OPERATOR_PATH)
117+
mv $(CHART_NODECONFIGURATOR_PATH)/values.yaml $(CHART_NODECONFIGURATOR_PATH)/values.yaml.bak
118+
$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/rbac/clustercontroller | $(HELMIFY) $(CHART_OPERATOR_PATH)
119+
$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/rbac/nodeconfigurator | $(HELMIFY) $(CHART_NODECONFIGURATOR_PATH)
120+
$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/soperatorchecks | $(HELMIFY) $(CHART_SOPERATORCHECKS_PATH)
114121
mv $(CHART_OPERATOR_PATH)/values.yaml.bak $(CHART_OPERATOR_PATH)/values.yaml
122+
mv $(CHART_NODECONFIGURATOR_PATH)/values.yaml.bak $(CHART_NODECONFIGURATOR_PATH)/values.yaml
123+
# Because of helmify rewrite a file we need to add the missing if statement
124+
@$(SED_COMMAND) '1s|^|{{- if and .Values.rebooter.generateRBAC .Values.rebooter.enabled }}\n|' $(CHART_NODECONFIGURATOR_PATH)/templates/nodeconfigurator-rbac.yaml
125+
@echo -e "\n{{- end }}" >> $(CHART_NODECONFIGURATOR_PATH)/templates/nodeconfigurator-rbac.yaml
115126

116127
.PHONY: get-version
117128
get-version:
@@ -151,6 +162,11 @@ sync-version: yq ## Sync versions from file
151162
@$(YQ) -i ".images.[0].newTag = \"$(OPERATOR_IMAGE_TAG)\"" "config/manager/kustomization.yaml"
152163
@# endregion config/manager/kustomization.yaml
153164

165+
@echo 'Syncing config/soperatorchecks/kustomization.yaml'
166+
@$(YQ) -i ".images.[0].newName = \"$(IMAGE_REPO)/soperatorchecks\"" "config/soperatorchecks/kustomization.yaml"
167+
@$(YQ) -i ".images.[0].newTag = \"$(OPERATOR_IMAGE_TAG)\"" "config/soperatorchecks/kustomization.yaml"
168+
@# endregion config/soperatorchecks/kustomization.yaml
169+
154170
@# region config/manager/manager.yaml
155171
@echo 'Syncing config/manager/manager.yaml'
156172
@$(SED_COMMAND) "s/image: controller:[^ ]*/image: controller:$(OPERATOR_IMAGE_TAG)/" config/manager/manager.yaml
@@ -162,10 +178,14 @@ sync-version: yq ## Sync versions from file
162178
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_OPERATOR_CRDS_PATH)/Chart.yaml"
163179
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_CLUSTER_PATH)/Chart.yaml"
164180
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_STORAGE_PATH)/Chart.yaml"
181+
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_SOPERATORCHECKS_PATH)/Chart.yaml"
182+
@$(YQ) -i ".version = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_NODECONFIGURATOR_PATH)/Chart.yaml"
165183
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_OPERATOR_PATH)/Chart.yaml"
166184
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_OPERATOR_CRDS_PATH)/Chart.yaml"
167185
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_CLUSTER_PATH)/Chart.yaml"
168186
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_STORAGE_PATH)/Chart.yaml"
187+
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_SOPERATORCHECKS_PATH)/Chart.yaml"
188+
@$(YQ) -i ".appVersion = \"$(OPERATOR_IMAGE_TAG)\"" "$(CHART_NODECONFIGURATOR_PATH)/Chart.yaml"
169189
@# endregion helm chart versions
170190
#
171191
@# region helm/slurm-cluster/values.yaml
@@ -181,6 +201,18 @@ sync-version: yq ## Sync versions from file
181201
@$(YQ) -i ".images.exporter = \"$(IMAGE_REPO)/exporter:$(IMAGE_VERSION)\"" "helm/slurm-cluster/values.yaml"
182202
@# endregion helm/slurm-cluster/values.yaml
183203

204+
@# region helm/nodeconfigurator/values.yaml
205+
@echo 'Syncing helm/nodeconfigurator/values.yaml'
206+
@$(YQ) -i ".rebooter.image.repository = \"$(IMAGE_REPO)/rebooter\"" "helm/nodeconfigurator/values.yaml"
207+
@$(YQ) -i ".rebooter.image.tag = \"$(OPERATOR_IMAGE_TAG)\"" "helm/nodeconfigurator/values.yaml"
208+
@# endregion helm/nodeconfigurator/values.yaml
209+
210+
@# region helm/soperatorchecks/values.yaml
211+
@echo 'Syncing helm/soperatorchecks/values.yaml'
212+
@$(YQ) -i ".checks.manager.image.repository = \"$(IMAGE_REPO)/soperatorchecks\"" "helm/soperatorchecks/values.yaml"
213+
@$(YQ) -i ".checks.manager.image.tag = \"$(OPERATOR_IMAGE_TAG)\"" "helm/soperatorchecks/values.yaml"
214+
@# endregion helm/soperatorchecks/values.yaml
215+
184216
@# region helm/slurm-cluster/templates/_registry_helpers.tpl
185217
@echo "Syncing $(CHART_CLUSTER_PATH)/templates/_registry_helpers.tpl"
186218
@echo '{{/* This file is generated by make sync-version. */}}' > $(CHART_CLUSTER_PATH)/templates/_registry_helpers.tpl
@@ -228,9 +260,7 @@ endif
228260
ifndef DOCKERFILE
229261
$(error DOCKERFILE is not set, docker image cannot be built)
230262
endif
231-
ifeq (${IMAGE_NAME},slurm-operator)
232-
docker build $(DOCKER_BUILD_ARGS) --tag $(IMAGE_REPO)/${IMAGE_NAME}:${IMAGE_VERSION} --target ${IMAGE_NAME} ${DOCKER_IGNORE_CACHE} ${DOCKER_LOAD} ${DOCKER_BUILD_PLATFORM} -f ${DOCKERFILE} ${DOCKER_OUTPUT} .
233-
else ifeq ($(IMAGE_NAME),rebooter)
263+
ifeq ($(filter ${IMAGE_NAME},slurm-operator rebooter soperatorchecks),${IMAGE_NAME})
234264
docker build $(DOCKER_BUILD_ARGS) --tag $(IMAGE_REPO)/${IMAGE_NAME}:${IMAGE_VERSION} --target ${IMAGE_NAME} ${DOCKER_IGNORE_CACHE} ${DOCKER_LOAD} ${DOCKER_BUILD_PLATFORM} -f ${DOCKERFILE} ${DOCKER_OUTPUT} .
235265
else
236266
cd images && docker build $(DOCKER_BUILD_ARGS) --tag $(IMAGE_REPO)/${IMAGE_NAME}:${IMAGE_VERSION} --target ${IMAGE_NAME} ${DOCKER_IGNORE_CACHE} ${DOCKER_LOAD} ${DOCKER_BUILD_PLATFORM} -f ${DOCKERFILE} ${DOCKER_OUTPUT} .

0 commit comments

Comments
 (0)