Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
33222a4
Merge pull request #1171 from nebius/dev
theyoprst Jul 7, 2025
e739686
Merge pull request #1179 from nebius/dev
theyoprst Jul 7, 2025
5beb56a
Merge pull request #1182 from nebius/dev
rdjjke Jul 7, 2025
9085160
Merge pull request #1201 from nebius/dev
itechdima Jul 9, 2025
4a0855b
Merge pull request #1248 from nebius/dev
Uburro Jul 16, 2025
6681a6e
Merge pull request #1250 from nebius/dev
rdjjke Jul 16, 2025
594b1a9
Fix apt install nvslurm-plugin-pyxis
theyoprst Jul 23, 2025
b9f7118
Merge pull request #1286 from nebius/release-1.21.10/fix-install-nvsl…
theyoprst Jul 23, 2025
e9528f7
Update GitHub release workflow to support multi-branch releases
theyoprst Jul 23, 2025
371d42b
Merge pull request #1290 from nebius/release-1.21.10/release-on-versi…
theyoprst Jul 23, 2025
76b1e44
Add macOS support in FileStore
mcheshkov Jul 17, 2025
63af423
Setup simple lint check in CI for every interesting platform
mcheshkov Jul 17, 2025
d8819a5
Merge pull request #1285 from nebius/release-1.21.10/aff-macos-supoor…
theyoprst Jul 23, 2025
44036ed
Fix Slurm controller failover #1268 (#1273)
Uburro Jul 22, 2025
c1952e2
fix wait for controller for new scheme failover (#1276)
Uburro Jul 22, 2025
8d0393a
Fix image silently fails (#1265)
ChessProfessor Jul 21, 2025
a15293e
#1270 Drain slurm nodes right after Active Check failure (#1274)
ChessProfessor Jul 23, 2025
3c5fd1a
Merge pull request #1284 from nebius/release-1.21.10/fix-slurm-contro…
theyoprst Jul 23, 2025
ca248be
Merge pull request #1292 from nebius/release-1.21.10/fix-image-silent…
theyoprst Jul 23, 2025
728af51
Merge pull request #1293 from nebius/release-1.21.10/drain-slurm-node…
theyoprst Jul 23, 2025
f06a02f
Disable bash IB healthchecks on B200
rdjjke Jul 23, 2025
5e66b11
Merge pull request #1294 from nebius/release-1.21.10/fix-bash-hc-for-…
theyoprst Jul 23, 2025
f5025af
Fix CI workflow to trigger on release branches and build stable relea…
theyoprst Jul 23, 2025
772ef73
Merge pull request #1295 from nebius/release-1.21.10/builds-on-sopera…
theyoprst Jul 23, 2025
2a09948
Node deletion does not wait for the epilog to complete. #1279 (#1288)
Uburro Jul 23, 2025
8075413
Merge pull request #1297 from nebius/release-1.21.10/want-for-epilog-…
theyoprst Jul 23, 2025
f733a15
Add time metrics to slurm_job_info metric for comprehensive job monit…
theyoprst Jul 23, 2025
076a89b
Merge pull request #1304 from nebius/release-1.21.10/slurm-job-info-t…
theyoprst Jul 24, 2025
7702e8d
Revert "Fix apt install nvslurm-plugin-pyxis"
theyoprst Jul 24, 2025
0e78a1b
Fix Nebius repository configuration to include OS-specific codename
theyoprst Jul 24, 2025
20ccffd
Merge pull request #1307 from nebius/release-1.21.10/revert-pyxis-fix
theyoprst Jul 24, 2025
fb4ddbc
Remove CR_ONE_TASK_PER_CORE from SelectTypeParameters
rdjjke Jul 24, 2025
d9a82b9
Merge pull request #1310 from nebius/release-1.21.10/fix-select-type-…
theyoprst Jul 24, 2025
2b2ca7c
Remove controller service DNS check from worker wait logic to support…
theyoprst Jul 25, 2025
89cf2c2
Merge pull request #1315 from nebius/release-1.21.10/fix-wait-controller
theyoprst Jul 25, 2025
b88599b
Bump version to 1.21.10
theyoprst Jul 25, 2025
ad250ce
Merge pull request #1318 from nebius/release-1.21.10/bump-version
theyoprst Jul 25, 2025
8157fb7
Add empty line to VERSION to trigger stable build logic
theyoprst Jul 25, 2025
18de68f
Fix HEAD^ error in GitHub Actions by fetching full git history
theyoprst Jul 25, 2025
36f3084
Merge pull request #1319 from nebius/release-1.21.10/bump-version-2
theyoprst Jul 26, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions .github/workflows/github_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@ on:
push:
branches:
- main
paths-ignore:
- '.github/**'
- '.dockerignore'
- '.editorconfig'
- '.gitignore'
- 'soperator-release-*'
paths:
- 'VERSION'

jobs:
tag:
Expand All @@ -24,11 +22,19 @@ jobs:
with:
fetch-depth: 0

- name: Get previous tag
- name: Get previous tag from current branch
id: get-previous-tag
uses: actions-ecosystem/action-get-latest-tag@b7c32daec3395a9616f88548363a42652b22d435 # v1.6.0
with:
semver_only: true
run: |
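# Get the latest release tag (semantic version) reachable from the parent of the current commit (HEAD^),
# so that a tag pointing at the current commit itself is never selected as the "previous" tag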
# Match tags like: 1.2.3 (without 'v' prefix)
PREV_TAG=$(git tag --merged HEAD^ | grep -E '^[0-9]+\.[0-9]+\.[0-9]+$' | sort -V | tail -1)
if [ -z "$PREV_TAG" ]; then
echo "Error: No previous release tag found in current branch history"
echo "This is unexpected - there should be previous releases in the history"
exit 1
fi
echo "tag=${PREV_TAG}" >> "${GITHUB_OUTPUT}"
echo "Found previous tag: ${PREV_TAG}"

- name: Get version
id: get-version
Expand Down Expand Up @@ -111,8 +117,7 @@ jobs:
"pattern": "^(other|docs|doc|dependencies|deps|feat|feature|fix|bug|test|.*)",
"target": "$1"
}
],
"base_branches": ["dev"]
]
}
token: ${{ secrets.GITHUB_TOKEN }}

Expand Down
73 changes: 60 additions & 13 deletions .github/workflows/one_job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:
branches:
- main
- dev
- soperator-release-*
tags:
- 'build**' # Trigger on tags starting with "build"
paths-ignore:
Expand Down Expand Up @@ -48,6 +49,8 @@ jobs:

- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0 # Fetch full git history so that changes to the VERSION file can be detected (needs HEAD^)

- name: Install GO
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
Expand All @@ -57,14 +60,21 @@ jobs:

- name: Generate version file
run: |
if [ "${{ github.ref }}" == "refs/heads/main" ]; then
make get-version UNSTABLE=false >> version.txt
echo "false" >> version.txt
else
make get-version UNSTABLE=true >> version.txt
echo "true" >> version.txt
UNSTABLE="true"
if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" =~ ^refs/heads/soperator-release- ]]; then
CHANGED_FILES=$(git diff --name-only HEAD^ HEAD) || {
echo "Error: git diff failed with exit code $?"
exit 1
}

if echo "$CHANGED_FILES" | grep -q "^VERSION$"; then
UNSTABLE="false"
fi
fi

make get-version UNSTABLE=${UNSTABLE} >> version.txt
echo "${UNSTABLE}" >> version.txt

- name: Upload version file
uses: actions/upload-artifact@v4
with:
Expand All @@ -86,6 +96,50 @@ jobs:
echo "Version: $VERSION"
echo "Unstable: $UNSTABLE"

lint:
runs-on: self-hosted

steps:
- name: Harden Runner
uses: step-security/harden-runner@6c439dc8bdf85cadbbce9ed30d1c7b959517bc49 # v2.12.2
with:
egress-policy: audit

- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install GO
uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0
with:
go-version-file: 'go.mod'
cache: false

# These steps are not a matrix to avoid allocating 3 jobs on runners for something this small
- name: golangci-lint on linux/amd64
uses: golangci/golangci-lint-action@1481404843c368bc19ca9406f87d6e0fc97bdcfd # v7
with:
version: v2.0.2 # version of golangci-lint, should be in sync with Makefile.
env:
GOEXPERIMENT: synctest
GOARCH: amd64
GOOS: linux
- name: golangci-lint on linux/arm64
uses: golangci/golangci-lint-action@1481404843c368bc19ca9406f87d6e0fc97bdcfd # v7
with:
version: v2.0.2 # version of golangci-lint, should be in sync with Makefile.
env:
GOEXPERIMENT: synctest
GOARCH: arm64
GOOS: linux
- name: golangci-lint on darwin/arm64
uses: golangci/golangci-lint-action@1481404843c368bc19ca9406f87d6e0fc97bdcfd # v7
with:
version: v2.0.2 # version of golangci-lint, should be in sync with Makefile.
env:
GOEXPERIMENT: synctest
GOARCH: arm64
GOOS: darwin

build:
runs-on: ${{ matrix.runner }}
needs: pre-build
Expand Down Expand Up @@ -113,13 +167,6 @@ jobs:
go-version-file: 'go.mod'
cache: false

- name: golangci-lint
uses: golangci/golangci-lint-action@1481404843c368bc19ca9406f87d6e0fc97bdcfd # v7
with:
version: v2.0.2 # version of golangci-lint, should be in sync with Makefile.
env:
GOEXPERIMENT: synctest

- name: Download version artifact
uses: actions/download-artifact@v4
with:
Expand Down
3 changes: 2 additions & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
1.21.9
1.21.10

2 changes: 2 additions & 0 deletions api/v1alpha1/activecheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ type ActiveCheckSlurmJobsStatus struct {
LastJobFailReasons []string `json:"lastJobFailReasons"`
// +kubebuilder:validation:Optional
LastJobSubmitTime *metav1.Time `json:"lastJobSubmitTime"`
// +kubebuilder:validation:Optional
LastJobEndTime *metav1.Time `json:"lastJobEndTime"`
}

// ActiveCheckStatus defines the observed state of ActiveCheck.
Expand Down
4 changes: 4 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions config/crd/bases/slurm.nebius.ai_activechecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7200,6 +7200,9 @@ spec:
description: ActiveCheckSlurmJobsStatus defines the observed state
of ActiveCheck slurm jobs.
properties:
lastJobEndTime:
format: date-time
type: string
lastJobFailReasons:
items:
type: string
Expand Down
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ resources:
images:
- name: controller
newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator
newTag: 1.21.9
newTag: 1.21.10
2 changes: 1 addition & 1 deletion config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ spec:
value: "false"
- name: SLURM_OPERATOR_WATCH_NAMESPACES
value: "*"
image: controller:1.21.9
image: controller:1.21.10
imagePullPolicy: Always
name: manager
securityContext:
Expand Down
2 changes: 1 addition & 1 deletion config/soperatorchecks/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ resources:
images:
- name: controller
newName: cr.eu-north1.nebius.cloud/soperator/soperatorchecks
newTag: 1.21.9
newTag: 1.21.10
patches:
# Protect the /metrics endpoint by putting it behind auth.
# If you want your controller-manager to expose the /metrics
Expand Down
5 changes: 4 additions & 1 deletion docs/slurm-exporter.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,13 @@ slurm_node_fails_total{node_name="worker-2",state_base="DOWN",state_is_drain="tr
- `standard_output`: Path to stdout file
- `array_job_id`: Array job ID (if applicable)
- `array_task_id`: Array task ID (if applicable)
- `submit_time`: When the job was submitted (Unix timestamp in seconds; empty if not available)
- `start_time`: When the job started execution (Unix timestamp in seconds; empty if not available)
- `end_time`: When the job completed (Unix timestamp in seconds; empty if not available)

**Example:**
```prometheus
slurm_job_info{job_id="12345",job_state="RUNNING",job_state_reason="None",slurm_partition="gpu",job_name="training_job",user_name="researcher",user_id="1000",standard_error="/home/researcher/job.err",standard_output="/home/researcher/job.out",array_job_id="",array_task_id=""} 1
slurm_job_info{job_id="12345",job_state="RUNNING",job_state_reason="None",slurm_partition="gpu",job_name="training_job",user_name="researcher",user_id="1000",standard_error="/home/researcher/job.err",standard_output="/home/researcher/job.out",array_job_id="",array_task_id="",submit_time="1722697200",start_time="1722697230",end_time=""} 1
```

#### Gauge `slurm_node_job`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
name: nebius-cloud
postBuild:
substitute:
soperator_version: 1.21.9
soperator_version: 1.21.10
path: "./fluxcd/enviroment/nebius-cloud/dev"
prune: true
timeout: 1m
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ spec:
name: nebius-cloud
postBuild:
substitute:
soperator_version: 1.21.9
soperator_version: 1.21.10
path: "./fluxcd/enviroment/nebius-cloud/prod"
prune: false
timeout: 1m
4 changes: 2 additions & 2 deletions helm/nodeconfigurator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.21.9
version: 1.21.10
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.21.9"
appVersion: "1.21.10"
2 changes: 1 addition & 1 deletion helm/nodeconfigurator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ rebooter:
env: []
image:
repository: "cr.eu-north1.nebius.cloud/soperator/rebooter"
tag: "1.21.9"
tag: "1.21.10"
pullPolicy: IfNotPresent
nodeSelector: {}
resources: {}
Expand Down
4 changes: 2 additions & 2 deletions helm/slurm-cluster-storage/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: helm-slurm-cluster-storage
description: A Helm chart for Kubernetes
type: application
version: "1.21.9"
appVersion: "1.21.9"
version: "1.21.10"
appVersion: "1.21.10"
4 changes: 2 additions & 2 deletions helm/slurm-cluster/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ apiVersion: v2
name: helm-slurm-cluster
description: A Helm chart for Kubernetes
type: application
version: "1.21.9"
appVersion: "1.21.9"
version: "1.21.10"
appVersion: "1.21.10"
kubeVersion: ">=1.29.0-0"
17 changes: 9 additions & 8 deletions helm/slurm-cluster/slurm_scripts/epilog.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,20 @@ if [ -n "$SLURM_JOB_GPUS" ]; then
health_checker
)

GPU_COUNT=$(nvidia-smi --list-gpus 2>/dev/null | wc -l || echo 0)
echo "Found ${GPU_COUNT} GPUs"

# Only add hc_* checks if we have exactly 8 GPUs
if [[ "${GPU_COUNT}" -eq 8 ]]; then
gpus_on_node=$(nvidia-smi --query-gpu=name --format=csv,noheader | sort | uniq -c)
if [[ "${gpus_on_node}" == *"8 NVIDIA"* ]]; then
checks+=(
hc_xid
hc_ib_link_state
hc_ib_counters
hc_ib_pkey
)
if [[ "${gpus_on_node}" == *"8 NVIDIA H100"* ]] || [[ "${gpus_on_node}" == *"8 NVIDIA H200"* ]]; then
checks+=(
hc_ib_link_state
hc_ib_pkey
)
fi
else
echo "Skipping hc_* checks because GPU_COUNT=${GPU_COUNT} (need 8)"
echo "Skipping hc_* checks because there are no 8 GPUs"
fi

pushd /opt/slurm_scripts || exit 0
Expand Down
17 changes: 9 additions & 8 deletions helm/slurm-cluster/slurm_scripts/hc_program.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,21 @@ chroot /mnt/jail /bin/bash -s <<-'EOF'
health_checker
)

GPU_COUNT=$(nvidia-smi --list-gpus 2>/dev/null | wc -l || echo 0)
echo "Found ${GPU_COUNT} GPUs"

# Only add hc_* checks if we have exactly 8 GPUs
if [[ "${GPU_COUNT}" -eq 8 ]]; then
gpus_on_node=$(nvidia-smi --query-gpu=name --format=csv,noheader | sort | uniq -c)
if [[ "${gpus_on_node}" == *"8 NVIDIA"* ]]; then
checks+=(
hc_host_service
hc_xid
hc_ib_link_state
hc_ib_counters
hc_ib_pkey
)
if [[ "${gpus_on_node}" == *"8 NVIDIA H100"* ]] || [[ "${gpus_on_node}" == *"8 NVIDIA H200"* ]]; then
checks+=(
hc_ib_link_state
hc_ib_pkey
)
fi
else
echo "Skipping hc_* checks because GPU_COUNT=${GPU_COUNT} (need 8)"
echo "Skipping hc_* checks because there are no 8 GPUs"
fi

pushd /opt/slurm_scripts || exit 0
Expand Down
17 changes: 9 additions & 8 deletions helm/slurm-cluster/slurm_scripts/prolog.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,21 @@ if [ -n "$SLURM_JOB_GPUS" ]; then
health_checker
)

GPU_COUNT=$(nvidia-smi --list-gpus 2>/dev/null | wc -l || echo 0)
echo "Found ${GPU_COUNT} GPUs"

# Only add hc_* checks if we have exactly 8 GPUs
if [[ "${GPU_COUNT}" -eq 8 ]]; then
gpus_on_node=$(nvidia-smi --query-gpu=name --format=csv,noheader | sort | uniq -c)
if [[ "${gpus_on_node}" == *"8 NVIDIA"* ]]; then
checks+=(
hc_host_service
hc_xid
hc_ib_link_state
hc_ib_counters
hc_ib_pkey
)
if [[ "${gpus_on_node}" == *"8 NVIDIA H100"* ]] || [[ "${gpus_on_node}" == *"8 NVIDIA H200"* ]]; then
checks+=(
hc_ib_link_state
hc_ib_pkey
)
fi
else
echo "Skipping hc_* checks because GPU_COUNT=${GPU_COUNT} (need 8)"
echo "Skipping hc_* checks because there are no 8 GPUs"
fi

pushd /opt/slurm_scripts || exit 0
Expand Down
Loading
Loading