Skip to content

Commit ae481ae

Browse files
authored
Merge pull request #62 from nebius/SCHED920
Build slurm_training_diag images for different CUDA versions on GitHub runners
2 parents c8dc750 + 883f422 commit ae481ae

File tree

2 files changed

+124
-19
lines changed

.github/workflows/slurm_training_diag.yml

Lines changed: 120 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,48 +13,150 @@ jobs:
1313
name: Compute shared DAYTIME
1414
run: echo "daytime=$(date -u +%Y%m%d%H%M%S)" >> "$GITHUB_OUTPUT"
1515

16-
build_and_push:
17-
runs-on: [self-hosted, X64]
16+
build_single_arch:
17+
name: Build slurm=${{ matrix.slurm_version }} ${{ matrix.variant }} arch=${{ matrix.arch }}
1818
needs: prepare
19+
runs-on: ${{ matrix.runner }}
1920

2021
strategy:
2122
fail-fast: false
2223
matrix:
24+
slurm_version: ["25.05.6", "25.11.2"]
25+
variant: ["cuda12", "cuda13"]
26+
arch: ["amd64", "arm64"]
2327
include:
24-
- name: slurm_training_diag-cuda12
28+
# Per-arch mapping: Docker build platform and runner label
29+
- arch: amd64
30+
platform: linux/amd64
31+
runner: ubuntu-24.04
32+
- arch: arm64
33+
platform: linux/arm64
34+
runner: ubuntu-24.04-arm
35+
36+
# Per-variant mapping: CUDA, Ubuntu and nccl-tests versions
37+
- variant: cuda12
2538
cuda_version: "12.9.0"
2639
ubuntu_version: "ubuntu24.04"
2740
nccl_tests_version: "2.16.4"
28-
slurm_version: 25.11.2
29-
- name: slurm_training_diag-cuda13
41+
- variant: cuda13
3042
cuda_version: "13.0.2"
3143
ubuntu_version: "ubuntu24.04"
3244
nccl_tests_version: "2.17.6"
33-
slurm_version: 25.11.2
3445

35-
name: Build and push image for ${{ matrix.name }}
3646
env:
3747
DAYTIME: ${{ needs.prepare.outputs.daytime }}
48+
# https://console.eu.nebius.com/project-e00managed-schedulers/registry/registry-e00hrt9na9xsn2px9f
49+
IMAGE_BASE: cr.eu-north1.nebius.cloud/ml-containers/slurm_training_diag
3850

3951
steps:
4052
- name: Checkout repository
41-
uses: actions/checkout@v6
53+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
54+
55+
- name: Set up Docker Buildx
56+
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0
4257

43-
- name: Add docker-credential-nebius to PATH
44-
run: echo "/home/ml_containers/.nebius/bin" >> $GITHUB_PATH
58+
- name: Install Nebius CLI
59+
shell: bash
60+
run: |
61+
set -euo pipefail
62+
curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash
63+
echo "$HOME/.nebius/bin" >> "$GITHUB_PATH"
64+
65+
- name: Restore Nebius config
66+
shell: bash
67+
run: |
68+
set -euo pipefail
69+
mkdir -p "$HOME/.nebius"
70+
echo "${{ secrets.NEBIUS_CONFIG_YAML_B64 }}" | base64 -d > "$HOME/.nebius/config.yaml"
71+
chmod 600 "$HOME/.nebius/config.yaml"
4572
46-
- name: Get image version name
47-
run: echo "cr.eu-north1.nebius.cloud/ml-containers/slurm_training_diag:slurm${{ matrix.slurm_version }}-cuda${{ matrix.cuda_version }}-${{ matrix.ubuntu_version }}-${{ env.DAYTIME }}"
73+
- name: Configure Nebius docker credential-helper for auth
74+
shell: bash
75+
run: nebius registry configure-helper
4876

49-
- name: Build and push images
50-
# https://console.eu.nebius.com/project-e00managed-schedulers/registry/registry-e00hrt9na9xsn2px9f
77+
- name: Build and push single-arch image
78+
shell: bash
5179
run: |
80+
set -euo pipefail
81+
82+
IMAGE="${IMAGE_BASE}:slurm${{ matrix.slurm_version }}-cuda${{ matrix.cuda_version }}-${{ matrix.ubuntu_version }}-${DAYTIME}"
83+
ARCH_TAG="${IMAGE}-${{ matrix.arch }}"
84+
85+
echo "Pushing: ${ARCH_TAG} (${{ matrix.platform }})"
86+
echo " SLURM_VERSION=${{ matrix.slurm_version }}"
87+
echo " CUDA_VERSION=${{ matrix.cuda_version }}"
88+
echo " NCCL_TESTS_VERSION=${{ matrix.nccl_tests_version }}"
89+
5290
docker buildx build \
53-
--platform linux/amd64,linux/arm64 \
54-
--build-arg CUDA_VERSION=${{ matrix.cuda_version }} \
55-
--build-arg NCCL_TESTS_VERSION=${{ matrix.nccl_tests_version }} \
56-
-t cr.eu-north1.nebius.cloud/ml-containers/slurm_training_diag:slurm${{ matrix.slurm_version }}-cuda${{ matrix.cuda_version }}-${{ matrix.ubuntu_version }}-${{ env.DAYTIME }} \
91+
--platform "${{ matrix.platform }}" \
92+
--build-arg SLURM_VERSION="${{ matrix.slurm_version }}" \
93+
--build-arg CUDA_VERSION="${{ matrix.cuda_version }}" \
94+
--build-arg NCCL_TESTS_VERSION="${{ matrix.nccl_tests_version }}" \
95+
-t "${ARCH_TAG}" \
5796
--target slurm_training_diag \
5897
--push \
5998
--progress=plain \
6099
.
100+
101+
create_manifest:
102+
name: Create manifest slurm=${{ matrix.slurm_version }} ${{ matrix.variant }}
103+
needs: [prepare, build_single_arch]
104+
runs-on: ubuntu-24.04
105+
106+
strategy:
107+
fail-fast: false
108+
matrix:
109+
slurm_version: ["25.05.6", "25.11.2"]
110+
variant: ["cuda12", "cuda13"]
111+
include:
112+
- variant: cuda12
113+
cuda_version: "12.9.0"
114+
ubuntu_version: "ubuntu24.04"
115+
- variant: cuda13
116+
cuda_version: "13.0.2"
117+
ubuntu_version: "ubuntu24.04"
118+
119+
env:
120+
DAYTIME: ${{ needs.prepare.outputs.daytime }}
121+
IMAGE_BASE: cr.eu-north1.nebius.cloud/ml-containers/slurm_training_diag
122+
123+
steps:
124+
- name: Set up Docker Buildx
125+
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0
126+
127+
- name: Install Nebius CLI
128+
shell: bash
129+
run: |
130+
set -euo pipefail
131+
curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash
132+
echo "$HOME/.nebius/bin" >> "$GITHUB_PATH"
133+
134+
- name: Restore Nebius config
135+
shell: bash
136+
run: |
137+
set -euo pipefail
138+
mkdir -p "$HOME/.nebius"
139+
echo "${{ secrets.NEBIUS_CONFIG_YAML_B64 }}" | base64 -d > "$HOME/.nebius/config.yaml"
140+
chmod 600 "$HOME/.nebius/config.yaml"
141+
142+
- name: Configure Nebius docker credential-helper for auth
143+
shell: bash
144+
run: nebius registry configure-helper
145+
146+
- name: Create and push manifest list
147+
shell: bash
148+
run: |
149+
set -euo pipefail
150+
151+
IMAGE="${IMAGE_BASE}:slurm${{ matrix.slurm_version }}-cuda${{ matrix.cuda_version }}-${{ matrix.ubuntu_version }}-${DAYTIME}"
152+
AMD="${IMAGE}-amd64"
153+
ARM="${IMAGE}-arm64"
154+
155+
echo "Creating multi-arch manifest: ${IMAGE}"
156+
echo " - ${AMD}"
157+
echo " - ${ARM}"
158+
159+
docker buildx imagetools create \
160+
-t "${IMAGE}" \
161+
"${AMD}" \
162+
"${ARM}"

Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,9 +260,12 @@ FROM training_diag AS slurm_training_diag
260260
# CUDA Image for training and diagnostics with a slurm client
261261
######
262262

263+
ARG SLURM_VERSION
264+
ENV SLURM_VERSION=$SLURM_VERSION
265+
263266
# Install slurm client and divert files
264267
COPY ansible/slurm-client.yml /opt/ansible/slurm-client.yml
265268
COPY ansible/roles/slurm-client /opt/ansible/roles/slurm-client
266269
COPY ansible/roles/slurm-divert /opt/ansible/roles/slurm-divert
267270
RUN cd /opt/ansible && \
268-
ansible-playbook -i inventory/ -c local slurm-client.yml
271+
ansible-playbook -i inventory/ -c local slurm-client.yml -e "slurm_version=${SLURM_VERSION}"

0 commit comments

Comments
 (0)