1616
1717permissions :
1818 contents : read
19+ actions : write
1920
2021concurrency :
2122 # Prevent cancelling already-running jobs (avoids resource waste).
5859 make install-e2e-tools
5960 echo "${{ github.workspace }}/bin" >> $GITHUB_PATH
6061
# Compile the e2e CLI once up front so that later steps can invoke the
# prebuilt bin/e2e binary instead of recompiling through `go run`.
- name: Build e2e binary
  run: |
    go build -o bin/e2e ./cmd/e2e
64+
6165 - name : Resolve profile
6266 shell : bash
6367 run : |
7276 echo "NEBIUS_REGION=$(echo "$E2E_PROFILE" | yq -r '.nebius_region')" >> "$GITHUB_ENV"
7377 echo "NEBIUS_TENANT_ID=$(echo "$E2E_PROFILE" | yq -r '.nebius_tenant_id')" >> "$GITHUB_ENV"
7478
# Write the parameters of this run (branch, Terraform branch, profile) to the
# job summary so they are visible without digging through the logs.
- name: Print run parameters
  shell: bash
  env:
    # Hand the values to the script through the environment instead of
    # interpolating `${{ ... }}` into the script text: an expression is pasted
    # in verbatim before bash parses the script, so a ref name containing
    # `$(...)` or backticks would otherwise be executed as shell code.
    RUN_BRANCH: ${{ github.ref_name }}
    TF_BRANCH: ${{ env.TERRAFORM_REPO_REF }}
  run: |
    # NOTE(review): PROFILE_ENV_VAR and E2E_PROFILE are assumed to be exported
    # by an earlier step (presumably "Resolve profile") - confirm.
    {
      echo "### Run Parameters"
      echo ""
      echo "- Branch: \`$RUN_BRANCH\`"
      echo "- Terraform branch: \`$TF_BRANCH\`"
      echo "- Profile variable: \`$PROFILE_ENV_VAR\`"
      echo ""
      echo '```yaml'
      echo "$E2E_PROFILE" | yq .
      echo '```'
    } >> "$GITHUB_STEP_SUMMARY"
93+
7594 - name : Install Nebius CLI
7695 shell : bash
7796 env :
@@ -187,6 +206,21 @@ jobs:
187206 path : " ${{ github.workspace }}/terraform-repo"
188207 fetch-depth : 0
189208
# Run the e2e capacity check and mirror its output into the job summary
# inside a fenced code block.
- name: Check capacity
  shell: bash
  env:
    # NOTE(review): presumably consumed by `bin/e2e check-capacity` - confirm.
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    GITHUB_RUN_ID: ${{ github.run_id }}
  run: |
    # Load the installation's environment, then return to the previous
    # working directory, from which bin/e2e is invoked.
    cd ${{ env.PATH_TO_INSTALLATION }}
    source .envrc
    cd -

    echo "### Capacity Check" >> "$GITHUB_STEP_SUMMARY"
    echo '```' >> "$GITHUB_STEP_SUMMARY"
    # `shell: bash` runs the script with `-eo pipefail`, so a failing check
    # would abort right here and leave the fence above unclosed. Capture the
    # pipeline status, always close the fence, then fail the step with the
    # original exit code.
    status=0
    bin/e2e check-capacity 2>&1 | tee -a "$GITHUB_STEP_SUMMARY" || status=$?
    echo '```' >> "$GITHUB_STEP_SUMMARY"
    exit "$status"
223+
190224 - name : Terraform Apply
191225 timeout-minutes : 120
192226 run : |
@@ -203,10 +237,10 @@ jobs:
203237 aws configure set region $NEBIUS_REGION
204238 aws configure set endpoint_url https://storage.$NEBIUS_REGION.nebius.cloud:443
205239
206- go run ./cmd /e2e apply
240+ bin /e2e apply
207241
208242 - name : K8s Cluster Info and NodeGroups
209- if : always()
243+ if : ' !cancelled() '
210244 shell : bash
211245 run : |
212246 echo "=== Listing K8s clusters ==="
@@ -224,57 +258,57 @@ jobs:
224258 done
225259
226260 - name : " K8s Cluster: Pods"
227- if : always()
261+ if : ' !cancelled() '
228262 shell : bash
229263 run : kubectl get pods -A -o wide
230264
231265 - name : " K8s Cluster: Events"
232- if : always()
266+ if : ' !cancelled() '
233267 shell : bash
234268 run : kubectl get events -A --sort-by='.lastTimestamp'
235269
236270 - name : " K8s Cluster: Nodes"
237- if : always()
271+ if : ' !cancelled() '
238272 shell : bash
239273 run : |
240274 kubectl get nodes
241275 echo ""
242276 kubectl get nodes -o yaml
243277
244278 - name : " K8s Cluster: Jobs"
245- if : always()
279+ if : ' !cancelled() '
246280 shell : bash
247281 run : |
248282 kubectl -n soperator get job
249283 echo ""
250284 kubectl -n soperator get job -o yaml
251285
252286 - name : " K8s Cluster: Helm Releases"
253- if : always()
287+ if : ' !cancelled() '
254288 shell : bash
255289 run : |
256290 kubectl get helmreleases -n flux-system
257291 echo ""
258292 kubectl get helmreleases -n flux-system -o yaml
259293
260294 - name : " K8s Cluster: Slurm Cluster CRs"
261- if : always()
295+ if : ' !cancelled() '
262296 shell : bash
263297 run : |
264298 kubectl get slurmclusters -A
265299 echo ""
266300 kubectl get slurmclusters -A -o yaml
267301
268302 - name : " K8s Cluster: Slurm Active Checks CRs"
269- if : always()
303+ if : ' !cancelled() '
270304 shell : bash
271305 run : |
272306 kubectl get activechecks -A
273307 echo ""
274308 kubectl get activechecks -A -o yaml
275309
276310 - name : Slurm Cluster State
277- if : always()
311+ if : ' !cancelled() '
278312 shell : bash
279313 run : |
280314 kubectl exec -n soperator controller-0 -- sinfo -N || true
@@ -284,37 +318,37 @@ jobs:
284318 kubectl exec -n soperator controller-0 -- sacct --parsable2 --allusers --starttime=now-6hours | column -t -s'|'
285319
286320 - name : Collect Full Kubernetes Cluster Info
287- if : always()
321+ if : ' !cancelled() '
288322 shell : bash
289323 run : |
290324 mkdir -p ./cluster-info
291325 kubectl cluster-info dump --namespaces=kruise-system,soperator-system,soperator,flux-system --output-directory=./cluster-info
292326
293327 - name : Upload Full Kubernetes Cluster Info
294- if : always()
328+ if : ' !cancelled() '
295329 uses : actions/upload-artifact@v6
296330 with :
297331 name : cluster-info
298332 path : ./cluster-info
299333 retention-days : 7
300334
301335 - name : Collect Soperator Outputs
302- if : always()
336+ if : ' !cancelled() '
303337 shell : bash
304338 run : |
305339 mkdir -p ./soperator-outputs
306340 kubectl cp soperator/controller-0:/mnt/jail/opt/soperator-outputs ./soperator-outputs
307341
308342 - name : Upload Soperator Outputs
309- if : always()
343+ if : ' !cancelled() '
310344 uses : actions/upload-artifact@v6
311345 with :
312346 name : soperator-outputs
313347 path : ./soperator-outputs
314348 retention-days : 7
315349
316350 - name : Terraform Destroy
317- if : always()
351+ if : ' !cancelled() '
318352 timeout-minutes : 30
319353 run : |
320354 cd ${{ env.PATH_TO_INSTALLATION }}
@@ -327,7 +361,7 @@ jobs:
327361 aws configure set region $NEBIUS_REGION
328362 aws configure set endpoint_url https://storage.$NEBIUS_REGION.nebius.cloud:443
329363
330- go run ./cmd /e2e destroy
364+ bin /e2e destroy
331365
332366 - name : Force cleanup compute instances on failure
333367 if : failure()
@@ -414,7 +448,11 @@ jobs:
414448 # Terraform repo latest commits
415449 echo ""
416450 echo "### Terraform Repo Latest Commits on ${{ env.TERRAFORM_REPO_REF }}"
417- git -C "${{ github.workspace }}/terraform-repo" log --format='%h|%s|%ar|%an' -3 2>/dev/null | while IFS='|' read -r hash msg date author; do
418- echo "- [$hash]($terraform_repo_url/commit/$hash): $msg ($date) <$author>"
419- done
451+ if [[ -d "${{ github.workspace }}/terraform-repo/.git" ]]; then
452+ git -C "${{ github.workspace }}/terraform-repo" log --format='%h|%s|%ar|%an' -3 2>/dev/null | while IFS='|' read -r hash msg date author; do
453+ echo "- [$hash]($terraform_repo_url/commit/$hash): $msg ($date) <$author>"
454+ done
455+ else
456+ echo "Terraform repo not checked out."
457+ fi
420458 } >> $GITHUB_STEP_SUMMARY
0 commit comments