Skip to content

Commit 145f189

Browse files
authored
Merge pull request #191 from nebius/dev
Release 1.15.3
2 parents ee4743f + b2fefc5 commit 145f189

File tree

27 files changed

+209
-93
lines changed

27 files changed

+209
-93
lines changed

.github/workflows/github_release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ jobs:
117117
token: ${{ secrets.GITHUB_TOKEN }}
118118

119119
- name: Create GitHub Release with changelog
120-
uses: softprops/action-gh-release@e7a8f85e1c67a31e6ed99a94b41bd0b71bbee6b8 # v2.0.9
120+
uses: softprops/action-gh-release@01570a1f39cb168c169c802c3bceb9e93fb10974 # v2.1.0
121121
env:
122122
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
123123
with:
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
name: Build gpubench only
2+
3+
on:
4+
push:
5+
paths:
6+
- 'images/jail/gpubench/**'
7+
8+
permissions:
9+
contents: read
10+
packages: write
11+
attestations: write
12+
id-token: write
13+
14+
jobs:
15+
pre-build:
16+
runs-on: self-hosted
17+
18+
outputs:
19+
UNSTABLE: ${{ steps.set-env.outputs.unstable }}
20+
21+
steps:
22+
- name: Harden Runner
23+
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
24+
with:
25+
egress-policy: audit
26+
27+
- name: Set environment to global output variables based on branch
28+
id: set-env
29+
run: |
30+
if [ "${{ github.ref }}" == "refs/heads/main" ]; then
31+
echo "unstable=false" >> $GITHUB_OUTPUT
32+
else
33+
echo "unstable=true" >> $GITHUB_OUTPUT
34+
fi
35+
36+
- name: Print UNSTABLE from output
37+
run: |
38+
echo "Branch is - ${{ github.ref }}"
39+
echo "UNSTABLE - ${{ steps.set-env.outputs.unstable }}"
40+
gpubench_only:
41+
runs-on: self-hosted
42+
needs: pre-build
43+
44+
steps:
45+
- name: Harden Runner
46+
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
47+
with:
48+
egress-policy: audit
49+
50+
- name: Checkout repository
51+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
52+
53+
- name: Install GO
54+
uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
55+
with:
56+
go-version-file: 'go.mod'
57+
58+
- name: Debug vars
59+
run: |
60+
echo "UNSTABLE - is ${{ needs.pre-build.outputs.unstable }}"
61+
make get-version UNSTABLE=${{ needs.pre-build.outputs.unstable }}
62+
63+
- name: Check if version synced
64+
run: make test-version-sync
65+
66+
- name: Set up Docker Buildx
67+
uses: docker/setup-buildx-action@c47758b77c9736f4b2ef4073d4d51994fabfe349 # v3.7.1
68+
69+
- name: Log in to the Github Container registry
70+
uses: docker/login-action@7ca345011ac4304463197fac0e56eab1bc7e6af0
71+
with:
72+
registry: ghcr.io
73+
username: ${{ github.actor }}
74+
password: ${{ secrets.GITHUB_TOKEN }}
75+
76+
- name: Run gpu bench tests
77+
run: |
78+
UNSTABLE=${{ needs.pre-build.outputs.unstable }}
79+
IMAGE_VERSION=$(make get-image-version UNSTABLE=${UNSTABLE})
80+
VERSION=$(make get-version UNSTABLE=${UNSTABLE})
81+
OPERATOR_IMAGE_TAG=$(make get-operator-tag-version UNSTABLE=${UNSTABLE})
82+
83+
echo "Running gpubench tests"
84+
cd ./images/jail/gpubench/
85+
go test
86+
cd -
87+
88+
echo "Removing previous jail rootfs tar archive"
89+
rm -rf images/jail_rootfs.tar
90+
91+
echo "Building tarball for jail"
92+
make docker-build UNSTABLE="${UNSTABLE}" IMAGE_NAME=jail DOCKERFILE=jail/jail.dockerfile DOCKER_OUTPUT="--output type=tar,dest=jail_rootfs.tar"

.github/workflows/one_job.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ on:
99
- 'LICENSE'
1010
- 'PROJECT'
1111
- 'README.md'
12+
- 'images/jail/gpubench/**'
1213

1314
permissions:
1415
contents: read
@@ -25,7 +26,7 @@ jobs:
2526

2627
steps:
2728
- name: Harden Runner
28-
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
29+
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
2930
with:
3031
egress-policy: audit
3132

@@ -49,7 +50,7 @@ jobs:
4950

5051
steps:
5152
- name: Harden Runner
52-
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
53+
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
5354
with:
5455
egress-policy: audit
5556

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
1616
go build -o slurm_operator ./cmd/
1717

1818
#######################################################################################################################
19-
FROM alpine:latest@sha256:beefdbd8a1da6d2915566fde36db9db0b524eb737fc57cd1367effd16dc0d06d AS slurm-operator
19+
FROM alpine:latest@sha256:1e42bbe2508154c9126d48c2b8a75420c3544343bf86fd041fb7527e017a4b4a AS slurm-operator
2020

2121
COPY --from=operator_builder /operator/slurm_operator /usr/bin/
2222

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ In general, you need to follow these steps:
156156
2. Install the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator).
157157
3. If you use InfiniBand, install the [NVIDIA Network Operator](https://github.com/Mellanox/network-operator).
158158
4. Install Soperator by applying the [soperator](helm/soperator) Helm chart.
159-
5. Create a Slurm cluster by applying the [slurm-cluster](helm/slurm-cluster) Helm chart.
159+
5. Create a Slurm cluster in a namespace with the same name as the Slurm cluster by
160+
applying the [slurm-cluster](helm/slurm-cluster) Helm chart.
160161
6. Wait until the `slurm.nebius.ai/SlurmCluster` resource becomes `Available`.
161162

162163
[//]: # (TODO: Refer to Helm OCI images instead of file directories when the repo is open)

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.15.2
1+
1.15.3

api/v1/slurmcluster_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ type NCCLArguments struct {
255255
// https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
256256
//
257257
// +kubebuilder:validation:Optional
258-
// +kubebuilder:default=false
258+
// +kubebuilder:default=true
259259
UseInfiniband bool `json:"useInfiniband,omitempty"`
260260
}
261261

config/crd/bases/slurm.nebius.ai_slurmclusters.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1195,7 +1195,7 @@ spec:
11951195
special format
11961196
type: string
11971197
useInfiniband:
1198-
default: false
1198+
default: true
11991199
description: |-
12001200
UseInfiniband defines using NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for test
12011201
https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html

config/manager/kustomization.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ resources:
33
images:
44
- name: controller
55
newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator
6-
newTag: 1.15.2
6+
newTag: 1.15.3

config/manager/manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ spec:
8282
value: "false"
8383
- name: SLURM_OPERATOR_WATCH_NAMESPACES
8484
value: "*"
85-
image: controller:1.15.2
85+
image: controller:1.15.3
8686
imagePullPolicy: Always
8787
name: manager
8888
securityContext:

0 commit comments

Comments
 (0)