Skip to content

Commit 73abdf5

Browse files
committed
Polish capacity check: strip log timestamps, skip steps on cancel, add output to job summary
Also add run parameters and profile to E2E job summary
1 parent e29ec40 commit 73abdf5

File tree

3 files changed: +48 -26 lines changed

.github/workflows/e2e_test.yml

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,21 @@ jobs:
7676
echo "NEBIUS_REGION=$(echo "$E2E_PROFILE" | yq -r '.nebius_region')" >> "$GITHUB_ENV"
7777
echo "NEBIUS_TENANT_ID=$(echo "$E2E_PROFILE" | yq -r '.nebius_tenant_id')" >> "$GITHUB_ENV"
7878
79+
- name: Print run parameters
80+
shell: bash
81+
run: |
82+
{
83+
echo "### Run Parameters"
84+
echo ""
85+
echo "- Branch: \`${{ github.ref_name }}\`"
86+
echo "- Terraform branch: \`${{ env.TERRAFORM_REPO_REF }}\`"
87+
echo "- Profile variable: \`$PROFILE_ENV_VAR\`"
88+
echo ""
89+
echo '```yaml'
90+
echo "$E2E_PROFILE" | yq .
91+
echo '```'
92+
} >> "$GITHUB_STEP_SUMMARY"
93+
7994
- name: Install Nebius CLI
8095
shell: bash
8196
env:
@@ -200,7 +215,11 @@ jobs:
200215
cd ${{ env.PATH_TO_INSTALLATION }}
201216
source .envrc
202217
cd -
203-
bin/e2e check-capacity
218+
219+
echo "### Capacity Check" >> "$GITHUB_STEP_SUMMARY"
220+
echo '```' >> "$GITHUB_STEP_SUMMARY"
221+
bin/e2e check-capacity 2>&1 | tee -a "$GITHUB_STEP_SUMMARY"
222+
echo '```' >> "$GITHUB_STEP_SUMMARY"
204223
205224
- name: Terraform Apply
206225
timeout-minutes: 120
@@ -221,7 +240,7 @@ jobs:
221240
bin/e2e apply
222241
223242
- name: K8s Cluster Info and NodeGroups
224-
if: always()
243+
if: '!cancelled()'
225244
shell: bash
226245
run: |
227246
echo "=== Listing K8s clusters ==="
@@ -239,57 +258,57 @@ jobs:
239258
done
240259
241260
- name: "K8s Cluster: Pods"
242-
if: always()
261+
if: '!cancelled()'
243262
shell: bash
244263
run: kubectl get pods -A -o wide
245264

246265
- name: "K8s Cluster: Events"
247-
if: always()
266+
if: '!cancelled()'
248267
shell: bash
249268
run: kubectl get events -A --sort-by='.lastTimestamp'
250269

251270
- name: "K8s Cluster: Nodes"
252-
if: always()
271+
if: '!cancelled()'
253272
shell: bash
254273
run: |
255274
kubectl get nodes
256275
echo ""
257276
kubectl get nodes -o yaml
258277
259278
- name: "K8s Cluster: Jobs"
260-
if: always()
279+
if: '!cancelled()'
261280
shell: bash
262281
run: |
263282
kubectl -n soperator get job
264283
echo ""
265284
kubectl -n soperator get job -o yaml
266285
267286
- name: "K8s Cluster: Helm Releases"
268-
if: always()
287+
if: '!cancelled()'
269288
shell: bash
270289
run: |
271290
kubectl get helmreleases -n flux-system
272291
echo ""
273292
kubectl get helmreleases -n flux-system -o yaml
274293
275294
- name: "K8s Cluster: Slurm Cluster CRs"
276-
if: always()
295+
if: '!cancelled()'
277296
shell: bash
278297
run: |
279298
kubectl get slurmclusters -A
280299
echo ""
281300
kubectl get slurmclusters -A -o yaml
282301
283302
- name: "K8s Cluster: Slurm Active Checks CRs"
284-
if: always()
303+
if: '!cancelled()'
285304
shell: bash
286305
run: |
287306
kubectl get activechecks -A
288307
echo ""
289308
kubectl get activechecks -A -o yaml
290309
291310
- name: Slurm Cluster State
292-
if: always()
311+
if: '!cancelled()'
293312
shell: bash
294313
run: |
295314
kubectl exec -n soperator controller-0 -- sinfo -N || true
@@ -299,37 +318,37 @@ jobs:
299318
kubectl exec -n soperator controller-0 -- sacct --parsable2 --allusers --starttime=now-6hours | column -t -s'|'
300319
301320
- name: Collect Full Kubernetes Cluster Info
302-
if: always()
321+
if: '!cancelled()'
303322
shell: bash
304323
run: |
305324
mkdir -p ./cluster-info
306325
kubectl cluster-info dump --namespaces=kruise-system,soperator-system,soperator,flux-system --output-directory=./cluster-info
307326
308327
- name: Upload Full Kubernetes Cluster Info
309-
if: always()
328+
if: '!cancelled()'
310329
uses: actions/upload-artifact@v6
311330
with:
312331
name: cluster-info
313332
path: ./cluster-info
314333
retention-days: 7
315334

316335
- name: Collect Soperator Outputs
317-
if: always()
336+
if: '!cancelled()'
318337
shell: bash
319338
run: |
320339
mkdir -p ./soperator-outputs
321340
kubectl cp soperator/controller-0:/mnt/jail/opt/soperator-outputs ./soperator-outputs
322341
323342
- name: Upload Soperator Outputs
324-
if: always()
343+
if: '!cancelled()'
325344
uses: actions/upload-artifact@v6
326345
with:
327346
name: soperator-outputs
328347
path: ./soperator-outputs
329348
retention-days: 7
330349

331350
- name: Terraform Destroy
332-
if: always()
351+
if: '!cancelled()'
333352
timeout-minutes: 30
334353
run: |
335354
cd ${{ env.PATH_TO_INSTALLATION }}
@@ -429,7 +448,11 @@ jobs:
429448
# Terraform repo latest commits
430449
echo ""
431450
echo "### Terraform Repo Latest Commits on ${{ env.TERRAFORM_REPO_REF }}"
432-
git -C "${{ github.workspace }}/terraform-repo" log --format='%h|%s|%ar|%an' -3 2>/dev/null | while IFS='|' read -r hash msg date author; do
433-
echo "- [$hash]($terraform_repo_url/commit/$hash): $msg ($date) <$author>"
434-
done
451+
if [[ -d "${{ github.workspace }}/terraform-repo/.git" ]]; then
452+
git -C "${{ github.workspace }}/terraform-repo" log --format='%h|%s|%ar|%an' -3 2>/dev/null | while IFS='|' read -r hash msg date author; do
453+
echo "- [$hash]($terraform_repo_url/commit/$hash): $msg ($date) <$author>"
454+
done
455+
else
456+
echo "Terraform repo not checked out."
457+
fi
435458
} >> $GITHUB_STEP_SUMMARY

cmd/e2e/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ import (
1616
)
1717

1818
func main() {
19+
log.SetFlags(0)
20+
1921
if len(os.Args) < 2 {
2022
_, _ = fmt.Fprintf(os.Stderr, "Usage: e2e <apply|destroy|check-capacity>\n")
2123
os.Exit(2)

internal/e2e/capacity.go

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -120,15 +120,12 @@ func CheckCapacity(ctx context.Context, profile Profile) error {
120120
log.Printf("CBG platform=%s fabric=%s: nodesets=%v required=%d available=%d (limit=%d usage=%d)",
121121
key.Platform, key.Fabric, d.nodesets, d.required, available, currentLimit, usage)
122122

123-
if available >= d.required {
124-
continue
123+
if available < d.required {
124+
log.Printf("CBG platform=%s fabric=%s: INSUFFICIENT CAPACITY — need %d GPUs but only %d available",
125+
key.Platform, key.Fabric, d.required, available)
126+
insufficient = true
127+
printResourceDetails(ctx, sdk, cbg)
125128
}
126-
127-
log.Printf("CBG platform=%s fabric=%s: INSUFFICIENT CAPACITY — need %d GPUs but only %d available",
128-
key.Platform, key.Fabric, d.required, available)
129-
insufficient = true
130-
131-
printResourceDetails(ctx, sdk, cbg)
132129
}
133130

134131
if !insufficient {

0 commit comments

Comments (0)