Skip to content

Commit dda12c6

Browse files
authored
Merge pull request #2204 from nebius/SCHED-1012/capacity-check
SCHED-1012: Add early capacity check
2 parents 8c9c8c4 + 73abdf5 commit dda12c6

File tree

8 files changed

+405
-51
lines changed

8 files changed

+405
-51
lines changed

.github/workflows/e2e_test.yml

Lines changed: 57 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ on:
1616

1717
permissions:
1818
contents: read
19+
actions: write
1920

2021
concurrency:
2122
# Prevent cancelling already-running jobs (avoids resource waste).
@@ -58,6 +59,9 @@ jobs:
5859
make install-e2e-tools
5960
echo "${{ github.workspace }}/bin" >> $GITHUB_PATH
6061
62+
- name: Build e2e binary
63+
run: go build -o bin/e2e ./cmd/e2e
64+
6165
- name: Resolve profile
6266
shell: bash
6367
run: |
@@ -72,6 +76,21 @@ jobs:
7276
echo "NEBIUS_REGION=$(echo "$E2E_PROFILE" | yq -r '.nebius_region')" >> "$GITHUB_ENV"
7377
echo "NEBIUS_TENANT_ID=$(echo "$E2E_PROFILE" | yq -r '.nebius_tenant_id')" >> "$GITHUB_ENV"
7478
79+
- name: Print run parameters
80+
shell: bash
81+
run: |
82+
{
83+
echo "### Run Parameters"
84+
echo ""
85+
echo "- Branch: \`${{ github.ref_name }}\`"
86+
echo "- Terraform branch: \`${{ env.TERRAFORM_REPO_REF }}\`"
87+
echo "- Profile variable: \`$PROFILE_ENV_VAR\`"
88+
echo ""
89+
echo '```yaml'
90+
echo "$E2E_PROFILE" | yq .
91+
echo '```'
92+
} >> "$GITHUB_STEP_SUMMARY"
93+
7594
- name: Install Nebius CLI
7695
shell: bash
7796
env:
@@ -187,6 +206,21 @@ jobs:
187206
path: "${{ github.workspace }}/terraform-repo"
188207
fetch-depth: 0
189208

209+
- name: Check capacity
210+
shell: bash
211+
env:
212+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
213+
GITHUB_RUN_ID: ${{ github.run_id }}
214+
run: |
215+
cd ${{ env.PATH_TO_INSTALLATION }}
216+
source .envrc
217+
cd -
218+
219+
echo "### Capacity Check" >> "$GITHUB_STEP_SUMMARY"
220+
echo '```' >> "$GITHUB_STEP_SUMMARY"
221+
bin/e2e check-capacity 2>&1 | tee -a "$GITHUB_STEP_SUMMARY"
222+
echo '```' >> "$GITHUB_STEP_SUMMARY"
223+
190224
- name: Terraform Apply
191225
timeout-minutes: 120
192226
run: |
@@ -203,10 +237,10 @@ jobs:
203237
aws configure set region $NEBIUS_REGION
204238
aws configure set endpoint_url https://storage.$NEBIUS_REGION.nebius.cloud:443
205239
206-
go run ./cmd/e2e apply
240+
bin/e2e apply
207241
208242
- name: K8s Cluster Info and NodeGroups
209-
if: always()
243+
if: '!cancelled()'
210244
shell: bash
211245
run: |
212246
echo "=== Listing K8s clusters ==="
@@ -224,57 +258,57 @@ jobs:
224258
done
225259
226260
- name: "K8s Cluster: Pods"
227-
if: always()
261+
if: '!cancelled()'
228262
shell: bash
229263
run: kubectl get pods -A -o wide
230264

231265
- name: "K8s Cluster: Events"
232-
if: always()
266+
if: '!cancelled()'
233267
shell: bash
234268
run: kubectl get events -A --sort-by='.lastTimestamp'
235269

236270
- name: "K8s Cluster: Nodes"
237-
if: always()
271+
if: '!cancelled()'
238272
shell: bash
239273
run: |
240274
kubectl get nodes
241275
echo ""
242276
kubectl get nodes -o yaml
243277
244278
- name: "K8s Cluster: Jobs"
245-
if: always()
279+
if: '!cancelled()'
246280
shell: bash
247281
run: |
248282
kubectl -n soperator get job
249283
echo ""
250284
kubectl -n soperator get job -o yaml
251285
252286
- name: "K8s Cluster: Helm Releases"
253-
if: always()
287+
if: '!cancelled()'
254288
shell: bash
255289
run: |
256290
kubectl get helmreleases -n flux-system
257291
echo ""
258292
kubectl get helmreleases -n flux-system -o yaml
259293
260294
- name: "K8s Cluster: Slurm Cluster CRs"
261-
if: always()
295+
if: '!cancelled()'
262296
shell: bash
263297
run: |
264298
kubectl get slurmclusters -A
265299
echo ""
266300
kubectl get slurmclusters -A -o yaml
267301
268302
- name: "K8s Cluster: Slurm Active Checks CRs"
269-
if: always()
303+
if: '!cancelled()'
270304
shell: bash
271305
run: |
272306
kubectl get activechecks -A
273307
echo ""
274308
kubectl get activechecks -A -o yaml
275309
276310
- name: Slurm Cluster State
277-
if: always()
311+
if: '!cancelled()'
278312
shell: bash
279313
run: |
280314
kubectl exec -n soperator controller-0 -- sinfo -N || true
@@ -284,37 +318,37 @@ jobs:
284318
kubectl exec -n soperator controller-0 -- sacct --parsable2 --allusers --starttime=now-6hours | column -t -s'|'
285319
286320
- name: Collect Full Kubernetes Cluster Info
287-
if: always()
321+
if: '!cancelled()'
288322
shell: bash
289323
run: |
290324
mkdir -p ./cluster-info
291325
kubectl cluster-info dump --namespaces=kruise-system,soperator-system,soperator,flux-system --output-directory=./cluster-info
292326
293327
- name: Upload Full Kubernetes Cluster Info
294-
if: always()
328+
if: '!cancelled()'
295329
uses: actions/upload-artifact@v6
296330
with:
297331
name: cluster-info
298332
path: ./cluster-info
299333
retention-days: 7
300334

301335
- name: Collect Soperator Outputs
302-
if: always()
336+
if: '!cancelled()'
303337
shell: bash
304338
run: |
305339
mkdir -p ./soperator-outputs
306340
kubectl cp soperator/controller-0:/mnt/jail/opt/soperator-outputs ./soperator-outputs
307341
308342
- name: Upload Soperator Outputs
309-
if: always()
343+
if: '!cancelled()'
310344
uses: actions/upload-artifact@v6
311345
with:
312346
name: soperator-outputs
313347
path: ./soperator-outputs
314348
retention-days: 7
315349

316350
- name: Terraform Destroy
317-
if: always()
351+
if: '!cancelled()'
318352
timeout-minutes: 30
319353
run: |
320354
cd ${{ env.PATH_TO_INSTALLATION }}
@@ -327,7 +361,7 @@ jobs:
327361
aws configure set region $NEBIUS_REGION
328362
aws configure set endpoint_url https://storage.$NEBIUS_REGION.nebius.cloud:443
329363
330-
go run ./cmd/e2e destroy
364+
bin/e2e destroy
331365
332366
- name: Force cleanup compute instances on failure
333367
if: failure()
@@ -414,7 +448,11 @@ jobs:
414448
# Terraform repo latest commits
415449
echo ""
416450
echo "### Terraform Repo Latest Commits on ${{ env.TERRAFORM_REPO_REF }}"
417-
git -C "${{ github.workspace }}/terraform-repo" log --format='%h|%s|%ar|%an' -3 2>/dev/null | while IFS='|' read -r hash msg date author; do
418-
echo "- [$hash]($terraform_repo_url/commit/$hash): $msg ($date) <$author>"
419-
done
451+
if [[ -d "${{ github.workspace }}/terraform-repo/.git" ]]; then
452+
git -C "${{ github.workspace }}/terraform-repo" log --format='%h|%s|%ar|%an' -3 2>/dev/null | while IFS='|' read -r hash msg date author; do
453+
echo "- [$hash]($terraform_repo_url/commit/$hash): $msg ($date) <$author>"
454+
done
455+
else
456+
echo "Terraform repo not checked out."
457+
fi
420458
} >> $GITHUB_STEP_SUMMARY

cmd/e2e/main.go

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@ package main
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"log"
78
"os"
9+
"os/exec"
810
"os/signal"
911
"syscall"
1012

@@ -14,41 +16,74 @@ import (
1416
)
1517

1618
func main() {
19+
log.SetFlags(0)
20+
1721
if len(os.Args) < 2 {
18-
_, _ = fmt.Fprintf(os.Stderr, "Usage: e2e <apply|destroy>\n")
22+
_, _ = fmt.Fprintf(os.Stderr, "Usage: e2e <apply|destroy|check-capacity>\n")
1923
os.Exit(2)
2024
}
2125

22-
var cfg e2e.Config
23-
if err := envconfig.Process("", &cfg); err != nil {
24-
log.Fatalf("parse config: %v", err)
25-
}
26-
2726
profile, err := e2e.LoadProfile()
2827
if err != nil {
29-
log.Fatalf("load profile: %v", err)
30-
}
31-
cfg.Profile = profile
32-
33-
sshPubKey, err := e2e.GenerateSSHPublicKey()
34-
if err != nil {
35-
log.Fatalf("generate SSH public key: %v", err)
28+
log.Fatalf("Load profile: %v", err)
3629
}
37-
cfg.SSHPublicKey = sshPubKey
3830

3931
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
4032
defer stop()
4133

4234
switch os.Args[1] {
35+
case "check-capacity":
36+
err = runCheckCapacity(ctx, profile)
4337
case "apply":
38+
cfg := loadFullConfig(profile)
4439
err = e2e.Apply(ctx, cfg)
4540
case "destroy":
41+
cfg := loadFullConfig(profile)
4642
err = e2e.Destroy(ctx, cfg)
4743
default:
48-
_, _ = fmt.Fprintf(os.Stderr, "Unknown command: %s\nUsage: e2e <apply|destroy>\n", os.Args[1])
44+
_, _ = fmt.Fprintf(os.Stderr, "Unknown command: %s\nUsage: e2e <apply|destroy|check-capacity>\n", os.Args[1])
4945
os.Exit(2)
5046
}
5147
if err != nil {
5248
log.Fatalf("%s: %v", os.Args[1], err)
5349
}
5450
}
51+
52+
// loadFullConfig builds the e2e.Config used by the "apply" and "destroy"
// commands. It fills the struct via envconfig.Process (empty prefix), attaches
// the already-loaded profile, and stores the key returned by
// e2e.GenerateSSHPublicKey.
//
// Both failure paths call log.Fatalf, so the process exits instead of
// returning an error to the caller.
func loadFullConfig(profile e2e.Profile) e2e.Config {
53+
var cfg e2e.Config
54+
if err := envconfig.Process("", &cfg); err != nil {
55+
log.Fatalf("Parse config: %v", err)
56+
}
57+
// The profile is loaded once in main and shared by every subcommand.
cfg.Profile = profile
58+
59+
sshPubKey, err := e2e.GenerateSSHPublicKey()
60+
if err != nil {
61+
log.Fatalf("Generate SSH public key: %v", err)
62+
}
63+
cfg.SSHPublicKey = sshPubKey
64+
65+
return cfg
66+
}
67+
68+
// runCheckCapacity runs e2e.CheckCapacity for the given profile. When the
// result matches e2e.ErrInsufficientCapacity it cancels the workflow run whose
// ID is in the GITHUB_RUN_ID environment variable by executing
// `gh run cancel <id>`.
//
// Any other result of CheckCapacity, including nil, is returned unchanged.
// On the insufficient-capacity path it returns an error if GITHUB_RUN_ID is
// empty or the gh command fails, and nil once the cancel command succeeds.
//
// NOTE(review): the log message implies ErrInsufficientCapacity is only
// returned under a "cancel" strategy; that logic lives in e2e.CheckCapacity
// and is not visible here — confirm.
func runCheckCapacity(ctx context.Context, profile e2e.Profile) error {
69+
err := e2e.CheckCapacity(ctx, profile)
70+
// errors.Is is false for a nil error, so success and unrelated failures
// both take this early return.
if !errors.Is(err, e2e.ErrInsufficientCapacity) {
71+
return err
72+
}
73+
74+
log.Print("Insufficient capacity detected with cancel strategy, cancelling workflow")
75+
runID := os.Getenv("GITHUB_RUN_ID")
76+
if runID == "" {
77+
return fmt.Errorf("GITHUB_RUN_ID is not set, cannot cancel workflow")
78+
}
79+
80+
// The gh executable is resolved through PATH; its output is forwarded to
// this process's stdout/stderr.
cmd := exec.CommandContext(ctx, "gh", "run", "cancel", runID)
81+
cmd.Stdout = os.Stdout
82+
cmd.Stderr = os.Stderr
83+
if cancelErr := cmd.Run(); cancelErr != nil {
84+
return fmt.Errorf("cancel workflow run %s: %w", runID, cancelErr)
85+
}
86+
87+
log.Printf("Workflow run %s cancelled due to insufficient capacity", runID)
88+
// Returning nil means the insufficient-capacity condition is not reported
// as a command failure once the cancel command has succeeded.
return nil
89+
}

go.mod

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ require (
1414
github.com/kubereboot/kured v0.0.0-20241106074119-94e73465adc3
1515
github.com/mackerelio/go-osstat v0.2.6
1616
github.com/mariadb-operator/mariadb-operator/v25 v25.10.2
17+
github.com/nebius/gosdk v0.0.0-20260224104345-c5b8377510ae
1718
github.com/onsi/ginkgo/v2 v2.27.3
1819
github.com/onsi/gomega v1.38.3
1920
github.com/openkruise/kruise-api v1.8.0
@@ -24,32 +25,36 @@ require (
2425
go.uber.org/zap v1.27.1
2526
golang.org/x/crypto v0.46.0
2627
golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0
28+
google.golang.org/grpc v1.72.1
2729
istio.io/pkg v0.0.0-20241216214326-d70796207df3
2830
k8s.io/api v0.34.3
2931
k8s.io/apimachinery v0.34.3
3032
k8s.io/client-go v0.34.3
31-
k8s.io/component-base v0.34.3
3233
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
3334
sigs.k8s.io/controller-runtime v0.22.4
3435
sigs.k8s.io/security-profiles-operator v0.8.4
3536
sigs.k8s.io/yaml v1.6.0
3637
)
3738

3839
require (
40+
buf.build/gen/go/bufbuild/protovalidate/protocolbuffers/go v1.36.4-20250130201111-63bb56e20495.1 // indirect
3941
github.com/Masterminds/semver/v3 v3.4.0 // indirect
4042
github.com/agext/levenshtein v1.2.3 // indirect
4143
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
4244
github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect
43-
github.com/blang/semver/v4 v4.0.0 // indirect
45+
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
4446
github.com/cert-manager/cert-manager v1.18.5 // indirect
4547
github.com/containers/common v0.60.4 // indirect
4648
github.com/distribution/reference v0.6.0 // indirect
4749
github.com/fsnotify/fsnotify v1.9.0 // indirect
4850
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
4951
github.com/getkin/kin-openapi v0.132.0 // indirect
5052
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
53+
github.com/gofrs/flock v0.12.1 // indirect
54+
github.com/golang-jwt/jwt/v4 v4.5.1 // indirect
5155
github.com/google/btree v1.1.3 // indirect
5256
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
57+
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0 // indirect
5358
github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
5459
github.com/hashicorp/go-version v1.7.0 // indirect
5560
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
@@ -68,12 +73,11 @@ require (
6873
github.com/stretchr/objx v0.5.2 // indirect
6974
github.com/ugorji/go/codec v1.2.12 // indirect
7075
github.com/x448/float16 v0.8.4 // indirect
71-
go.opentelemetry.io/otel v1.35.0 // indirect
72-
go.opentelemetry.io/otel/trace v1.35.0 // indirect
7376
go.yaml.in/yaml/v2 v2.4.3 // indirect
7477
go.yaml.in/yaml/v3 v3.0.4 // indirect
7578
golang.org/x/mod v0.30.0 // indirect
7679
golang.org/x/sync v0.19.0 // indirect
80+
google.golang.org/genproto/googleapis/rpc v0.0.0-20260223185530-2f722ef697dc // indirect
7781
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
7882
sigs.k8s.io/randfill v1.0.0 // indirect
7983
sigs.k8s.io/release-utils v0.8.1 // indirect
@@ -116,7 +120,7 @@ require (
116120
golang.org/x/time v0.11.0 // indirect
117121
golang.org/x/tools v0.39.0 // indirect
118122
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
119-
google.golang.org/protobuf v1.36.8 // indirect
123+
google.golang.org/protobuf v1.36.11 // indirect
120124
gopkg.in/inf.v0 v0.9.1 // indirect
121125
gopkg.in/yaml.v3 v3.0.1 // indirect
122126
k8s.io/apiextensions-apiserver v0.34.2 // indirect

0 commit comments

Comments
 (0)