@@ -13,48 +13,150 @@ jobs:
1313 name : Compute shared DAYTIME
1414 run : echo "daytime=$(date -u +%Y%m%d%H%M%S)" >> "$GITHUB_OUTPUT"
1515
# Builds and pushes one single-architecture image for every
# (slurm_version, variant, arch) combination of the matrix. Each image is
# tagged "<IMAGE_BASE>:slurm<S>-cuda<C>-<ubuntu>-<DAYTIME>-<arch>"; the
# create_manifest job relies on that exact tag scheme to find the images.
build_single_arch:
  name: Build slurm=${{ matrix.slurm_version }} ${{ matrix.variant }} arch=${{ matrix.arch }}
  needs: prepare
  runs-on: ${{ matrix.runner }}

  strategy:
    fail-fast: false
    matrix:
      slurm_version: ["25.05.6", "25.11.2"]
      variant: ["cuda12", "cuda13"]
      arch: ["amd64", "arm64"]
      include:
        # runner/platform mapping for arch
        - arch: amd64
          platform: linux/amd64
          runner: ubuntu-24.04
        - arch: arm64
          platform: linux/arm64
          runner: ubuntu-24.04-arm

        # variant mapping
        # These values are concatenated into the image tag, so they must not
        # carry any surrounding whitespace.
        - variant: cuda12
          cuda_version: "12.9.0"
          ubuntu_version: "ubuntu24.04"
          nccl_tests_version: "2.16.4"
        - variant: cuda13
          cuda_version: "13.0.2"
          ubuntu_version: "ubuntu24.04"
          nccl_tests_version: "2.17.6"

  env:
    DAYTIME: ${{ needs.prepare.outputs.daytime }}
    # https://console.eu.nebius.com/project-e00managed-schedulers/registry/registry-e00hrt9na9xsn2px9f
    IMAGE_BASE: cr.eu-north1.nebius.cloud/ml-containers/slurm_training_diag

  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3.12.0

    - name: Install Nebius CLI
      shell: bash
      run: |
        set -euo pipefail
        # -f makes curl fail on HTTP errors instead of piping an error page
        # into bash; pipefail then aborts the step.
        curl -fsSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash
        echo "$HOME/.nebius/bin" >> "$GITHUB_PATH"

    - name: Restore Nebius config
      shell: bash
      env:
        # Passed through the environment so the secret is never expanded
        # into the generated script text.
        NEBIUS_CONFIG_YAML_B64: ${{ secrets.NEBIUS_CONFIG_YAML_B64 }}
      run: |
        set -euo pipefail
        # Fail early with a clear message when the secret is unavailable.
        : "${NEBIUS_CONFIG_YAML_B64:?secret NEBIUS_CONFIG_YAML_B64 is empty or not set}"
        # Create the config file with restrictive permissions from the start.
        umask 077
        mkdir -p "$HOME/.nebius"
        printf '%s\n' "$NEBIUS_CONFIG_YAML_B64" | base64 -d > "$HOME/.nebius/config.yaml"
        chmod 600 "$HOME/.nebius/config.yaml"

    - name: Configure Nebius docker credential-helper for auth
      shell: bash
      run: nebius registry configure-helper

    - name: Build and push single-arch image
      shell: bash
      env:
        # Matrix values are handed to the script as environment variables
        # rather than being spliced into the shell source.
        SLURM_VERSION: ${{ matrix.slurm_version }}
        CUDA_VERSION: ${{ matrix.cuda_version }}
        UBUNTU_VERSION: ${{ matrix.ubuntu_version }}
        NCCL_TESTS_VERSION: ${{ matrix.nccl_tests_version }}
        ARCH: ${{ matrix.arch }}
        PLATFORM: ${{ matrix.platform }}
      run: |
        set -euo pipefail

        # An empty timestamp would yield a tag that create_manifest cannot
        # distinguish between runs; stop instead of pushing it.
        : "${DAYTIME:?prepare job did not provide a daytime output}"

        IMAGE="${IMAGE_BASE}:slurm${SLURM_VERSION}-cuda${CUDA_VERSION}-${UBUNTU_VERSION}-${DAYTIME}"
        ARCH_TAG="${IMAGE}-${ARCH}"

        echo "Pushing: ${ARCH_TAG} (${PLATFORM})"
        echo " SLURM_VERSION=${SLURM_VERSION}"
        echo " CUDA_VERSION=${CUDA_VERSION}"
        echo " NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"

        docker buildx build \
          --platform "${PLATFORM}" \
          --build-arg SLURM_VERSION="${SLURM_VERSION}" \
          --build-arg CUDA_VERSION="${CUDA_VERSION}" \
          --build-arg NCCL_TESTS_VERSION="${NCCL_TESTS_VERSION}" \
          -t "${ARCH_TAG}" \
          --target slurm_training_diag \
          --push \
          --progress=plain \
          .
100+
# Combines the "-amd64" and "-arm64" images pushed by build_single_arch into
# one multi-arch tag per (slurm_version, variant) combination. The tag is
# rebuilt here with the same scheme the build job uses:
# "<IMAGE_BASE>:slurm<S>-cuda<C>-<ubuntu>-<DAYTIME>".
# NOTE(review): "needs" refers to the whole build_single_arch matrix, so a
# failure in any build combination skips every manifest job by default -
# confirm that this is intended.
create_manifest:
  name: Create manifest slurm=${{ matrix.slurm_version }} ${{ matrix.variant }}
  needs: [prepare, build_single_arch]
  runs-on: ubuntu-24.04

  strategy:
    fail-fast: false
    matrix:
      slurm_version: ["25.05.6", "25.11.2"]
      variant: ["cuda12", "cuda13"]
      include:
        # variant mapping; must stay in sync with the build_single_arch job.
        # These values are concatenated into the image tag, so they must not
        # carry any surrounding whitespace.
        - variant: cuda12
          cuda_version: "12.9.0"
          ubuntu_version: "ubuntu24.04"
        - variant: cuda13
          cuda_version: "13.0.2"
          ubuntu_version: "ubuntu24.04"

  env:
    DAYTIME: ${{ needs.prepare.outputs.daytime }}
    IMAGE_BASE: cr.eu-north1.nebius.cloud/ml-containers/slurm_training_diag

  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3.12.0

    - name: Install Nebius CLI
      shell: bash
      run: |
        set -euo pipefail
        # -f makes curl fail on HTTP errors instead of piping an error page
        # into bash; pipefail then aborts the step.
        curl -fsSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash
        echo "$HOME/.nebius/bin" >> "$GITHUB_PATH"

    - name: Restore Nebius config
      shell: bash
      env:
        # Passed through the environment so the secret is never expanded
        # into the generated script text.
        NEBIUS_CONFIG_YAML_B64: ${{ secrets.NEBIUS_CONFIG_YAML_B64 }}
      run: |
        set -euo pipefail
        # Fail early with a clear message when the secret is unavailable.
        : "${NEBIUS_CONFIG_YAML_B64:?secret NEBIUS_CONFIG_YAML_B64 is empty or not set}"
        # Create the config file with restrictive permissions from the start.
        umask 077
        mkdir -p "$HOME/.nebius"
        printf '%s\n' "$NEBIUS_CONFIG_YAML_B64" | base64 -d > "$HOME/.nebius/config.yaml"
        chmod 600 "$HOME/.nebius/config.yaml"

    - name: Configure Nebius docker credential-helper for auth
      shell: bash
      run: nebius registry configure-helper

    - name: Create and push manifest list
      shell: bash
      env:
        # Matrix values are handed to the script as environment variables
        # rather than being spliced into the shell source.
        SLURM_VERSION: ${{ matrix.slurm_version }}
        CUDA_VERSION: ${{ matrix.cuda_version }}
        UBUNTU_VERSION: ${{ matrix.ubuntu_version }}
      run: |
        set -euo pipefail

        # Without the shared timestamp the per-arch tags cannot be located.
        : "${DAYTIME:?prepare job did not provide a daytime output}"

        IMAGE="${IMAGE_BASE}:slurm${SLURM_VERSION}-cuda${CUDA_VERSION}-${UBUNTU_VERSION}-${DAYTIME}"
        AMD="${IMAGE}-amd64"
        ARM="${IMAGE}-arm64"

        echo "Creating multi-arch manifest: ${IMAGE}"
        echo " - ${AMD}"
        echo " - ${ARM}"

        docker buildx imagetools create \
          -t "${IMAGE}" \
          "${AMD}" \
          "${ARM}"
0 commit comments