Skip to content

Commit 6f84e19

Browse files
committed
Unify the cluster setup and GitHub Actions job of e2e-mnnvl with e2e
1 parent 066820a commit 6f84e19

File tree

10 files changed

+610
-757
lines changed

10 files changed

+610
-757
lines changed
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# /*
2+
# Copyright 2025 The Grove Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# */
16+
17+
name: "E2E Setup"
18+
description: "Install all tools and dependencies needed to run E2E tests (Go, k3d, skaffold, helm, Python + deps)"
19+
20+
inputs:
21+
go-version:
22+
description: "Go version to install"
23+
required: false
24+
default: "1.25.7"
25+
26+
runs:
27+
using: "composite"
28+
steps:
29+
# NVIDIA self-hosted runners don't have make installed by default
30+
- name: Install build-essential for make
31+
shell: bash
32+
run: |
33+
sudo apt-get update
34+
sudo apt install build-essential -y
35+
36+
- name: Set up Go
37+
uses: actions/setup-go@v4
38+
with:
39+
go-version: ${{ inputs.go-version }}
40+
41+
- name: Install k3d
42+
shell: bash
43+
run: |
44+
curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
45+
k3d version
46+
47+
- name: Install skaffold
48+
shell: bash
49+
run: |
50+
curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
51+
sudo install skaffold /usr/local/bin/
52+
skaffold version
53+
54+
- name: Install Helm
55+
shell: bash
56+
run: |
57+
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
58+
helm version
59+
60+
- name: Install Python3 and pip
61+
shell: bash
62+
run: |
63+
sudo apt-get update
64+
sudo apt-get install -y python3 python3-pip
65+
66+
- name: Install Python dependencies
67+
shell: bash
68+
run: |
69+
echo "Installing Python dependencies..."
70+
pip3 install --break-system-packages -r operator/hack/e2e-cluster/requirements.txt
71+
72+
echo "Verifying Python dependencies..."
73+
python3 -c "import docker; import sh; import typer; import pydantic; import rich; print('All dependencies installed successfully')"

.github/workflows/build-check-test.yaml

Lines changed: 11 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ jobs:
8383

8484
# E2E tests - only run after build, check, and test jobs succeed
8585
# Only triggered by changes to operator or .github folders
86+
#
87+
# Matrix entries can set:
88+
# test_name (required) - name shown in the GitHub Actions UI
89+
# test_pattern (optional) - Go test -run pattern (standard e2e tests)
90+
# make_target (optional) - Makefile target, defaults to run-e2e-full
8691
e2e:
8792
needs: [test, build, check, changes]
8893
# Run on non-draft PRs (or draft PRs with 'run-e2e' label)
@@ -109,9 +114,11 @@ jobs:
109114
test_pattern: "^Test_TAS"
110115
- test_name: cert_management
111116
test_pattern: "^Test_CM"
117+
- test_name: auto_mnnvl
118+
make_target: "run-e2e-mnnvl-full"
112119
name: E2E - ${{ matrix.test_name }}
113120
steps:
114-
# print runner specs so we have a record incase of failures
121+
# print runner specs so we have a record in case of failures
115122
- name: Print runner specs
116123
run: |
117124
echo "CPUs: $(nproc)"
@@ -120,49 +127,12 @@ jobs:
120127
- name: Checkout code
121128
uses: actions/checkout@v4
122129

123-
# NVIDIA self-hosted runners don't have make installed by default
124-
- name: Install build-essential for make
125-
run: |
126-
sudo apt-get update
127-
sudo apt install build-essential -y
128-
129-
- name: Set up Go
130-
uses: actions/setup-go@v4
131-
with:
132-
go-version: "1.25.7"
133-
134-
- name: Install k3d
135-
run: |
136-
curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
137-
k3d version
138-
139-
- name: Install skaffold
140-
run: |
141-
curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
142-
sudo install skaffold /usr/local/bin/
143-
skaffold version
144-
145-
- name: Install Helm
146-
run: |
147-
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
148-
helm version
149-
150-
- name: Install Python3 and pip
151-
run: |
152-
sudo apt-get update
153-
sudo apt-get install -y python3 python3-pip
154-
155-
- name: Install Python dependencies
156-
run: |
157-
echo "Installing Python dependencies..."
158-
pip3 install --break-system-packages -r operator/hack/e2e-cluster/requirements.txt
159-
160-
echo "Verifying Python dependencies..."
161-
python3 -c "import docker; import sh; import typer; import pydantic; import rich; print('All dependencies installed successfully')"
130+
- name: E2E Setup
131+
uses: ./.github/actions/e2e-setup
162132

163133
- name: Run e2e tests - ${{ matrix.test_name }}
164134
run: |
165-
make run-e2e-full TEST_PATTERN='${{ matrix.test_pattern }}'
135+
make ${{ matrix.make_target || 'run-e2e-full' }} TEST_PATTERN='${{ matrix.test_pattern }}'
166136
working-directory: operator
167137

168138
# The test code handles cleanup via Teardown(), but this step provides
@@ -184,70 +154,3 @@ jobs:
184154
path: operator/e2e-diagnostics/
185155
if-no-files-found: warn
186156
retention-days: 7
187-
188-
# E2E tests for autoMNNVL feature
189-
# Runs all 4 configurations (supported/unsupported x enabled/disabled) sequentially
190-
# Uses a separate cluster setup from the main e2e tests
191-
e2e-mnnvl:
192-
needs: [test, build, check, changes]
193-
if: |
194-
github.event_name == 'pull_request' &&
195-
needs.changes.outputs.e2e-relevant == 'true' &&
196-
(github.event.pull_request.draft == false || contains(github.event.pull_request.labels.*.name, 'run-e2e'))
197-
runs-on: cpu-amd-m5-2xlarge
198-
timeout-minutes: 60
199-
name: E2E - auto_mnnvl
200-
steps:
201-
- name: Print runner specs
202-
run: |
203-
echo "CPUs: $(nproc)"
204-
echo "RAM: $(free -h | awk '/^Mem:/ {print $2}')"
205-
206-
- name: Checkout code
207-
uses: actions/checkout@v4
208-
209-
- name: Install build-essential for make
210-
run: |
211-
sudo apt-get update
212-
sudo apt install build-essential -y
213-
214-
- name: Set up Go
215-
uses: actions/setup-go@v4
216-
with:
217-
go-version: "1.25.7"
218-
219-
- name: Install k3d
220-
run: |
221-
curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash
222-
k3d version
223-
224-
- name: Install skaffold
225-
run: |
226-
curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64
227-
sudo install skaffold /usr/local/bin/
228-
skaffold version
229-
230-
- name: Install Helm
231-
run: |
232-
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
233-
helm version
234-
235-
- name: Run autoMNNVL e2e tests (all 4 configurations)
236-
run: |
237-
make run-e2e-mnnvl-full
238-
working-directory: operator
239-
240-
- name: Cleanup k3d cluster
241-
if: always()
242-
working-directory: operator
243-
run: |
244-
make e2e-mnnvl-cluster-down || true
245-
246-
- name: Upload test logs on failure
247-
if: failure()
248-
uses: actions/upload-artifact@v4
249-
with:
250-
name: e2e-test-logs-auto_mnnvl
251-
path: /tmp/mnnvl-e2e-results.log
252-
if-no-files-found: warn
253-
retention-days: 7

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,10 @@ dev
3131
.vscode
3232
.idea
3333
.zed
34+
.local
3435
.DS_Store
3536

37+
3638
# hack tools binaries
3739
hack/tools/*
3840
operator/hack/tools/*
@@ -50,3 +52,6 @@ scheduler/bin/*
5052

5153
# generated/copied chart resources
5254
operator/charts/crds/*
55+
56+
# Python bytecode cache
57+
__pycache__/

operator/Makefile

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,10 @@ run-e2e:
111111

112112
# Create a k3d cluster for e2e testing (with Grove and Kai scheduler deployed)
113113
# This is optional - tests work with any Kubernetes cluster that has the required components
114+
# Pass E2E_CREATE_FLAGS to add CLI flags (e.g. --skip-kai --skip-topology --skip-prepull)
114115
.PHONY: e2e-cluster-up
115116
e2e-cluster-up:
116-
@$(MODULE_HACK_DIR)/e2e-cluster/create-e2e-cluster.py
117+
@$(MODULE_HACK_DIR)/e2e-cluster/create-e2e-cluster.py $(E2E_CREATE_FLAGS)
117118

118119
# Delete the k3d e2e test cluster
119120
.PHONY: e2e-cluster-down
@@ -134,18 +135,24 @@ run-e2e-full: e2e-cluster-up
134135
@$(MAKE) e2e-cluster-down
135136

136137
# Run autoMNNVL e2e tests (all 4 configurations: supported/unsupported x enabled/disabled)
137-
# This creates a dedicated k3d cluster, runs all configurations sequentially, then cleans up.
138-
# Images are built with skaffold/ko as part of cluster setup (no Docker build required).
138+
# Creates a lightweight k3d cluster (2 workers, no Kai/topology), runs all configurations
139+
# sequentially via config-cluster.py, then cleans up. Uses the same e2e-cluster-up target
140+
# with MNNVL-specific env overrides and flags.
139141
# Usage: make run-e2e-mnnvl-full
140142
.PHONY: run-e2e-mnnvl-full
141-
run-e2e-mnnvl-full:
143+
run-e2e-mnnvl-full: export E2E_WORKER_NODES = 2
144+
run-e2e-mnnvl-full: E2E_CREATE_FLAGS = --skip-kai --skip-topology --skip-prepull
145+
run-e2e-mnnvl-full: e2e-cluster-up
146+
@echo "> Pushing alpine image to local registry..."
147+
@docker pull alpine:latest || true
148+
@docker tag alpine:latest localhost:5001/alpine:latest
149+
@docker push localhost:5001/alpine:latest
142150
@echo "> Running autoMNNVL e2e tests (all 4 configurations)..."
143-
@python3 $(MODULE_HACK_DIR)/e2e-autoMNNVL/run_autoMNNVL_e2e_all.py
151+
@python3 $(MODULE_HACK_DIR)/e2e-autoMNNVL/run_autoMNNVL_e2e_all.py || \
152+
(echo "Tests failed, cleaning up cluster..."; $(MAKE) e2e-cluster-down; exit 1)
153+
@echo "> Tests passed, cleaning up cluster..."
154+
@$(MAKE) e2e-cluster-down
144155

145-
# Delete the autoMNNVL e2e test cluster
146-
.PHONY: e2e-mnnvl-cluster-down
147-
e2e-mnnvl-cluster-down:
148-
@python3 $(MODULE_HACK_DIR)/e2e-autoMNNVL/setup_autoMNNVL_cluster.py --shutdown
149156

150157
# Make targets for local development and testing
151158
# -------------------------------------------------------------

operator/e2e/tests/auto-mnnvl/shared_cases.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/ai-dynamo/grove/operator/internal/mnnvl"
2727
"github.com/stretchr/testify/assert"
2828
"github.com/stretchr/testify/require"
29+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031
)
3132

@@ -43,10 +44,16 @@ func testNoMNNVLArtifactsWhenDisabled(t *testing.T, tc testContext) {
4344
// Wait a bit to ensure reconciliation has time to run
4445
time.Sleep(10 * time.Second)
4546

46-
// Verify no ComputeDomain exists
47-
cdName := fmt.Sprintf("%s-0", pcsName)
48-
_, err = tc.dynamicClient.Resource(computeDomainGVR).Namespace(tc.namespace).Get(tc.ctx, cdName, metav1.GetOptions{})
49-
assert.Error(t, err, "No ComputeDomain should be created when feature is disabled")
47+
// Verify no ComputeDomain exists.
48+
// If the CRD itself is not installed (unsupported scenario), the List call returns
49+
// a NotFound error -- that also means zero ComputeDomains, which is what we want.
50+
cdList, err := tc.dynamicClient.Resource(computeDomainGVR).Namespace(tc.namespace).List(tc.ctx, metav1.ListOptions{})
51+
if k8serrors.IsNotFound(err) {
52+
// CRD not installed → no ComputeDomains can exist, which is the expected state.
53+
} else {
54+
require.NoError(t, err, "Failed to list ComputeDomains")
55+
assert.Empty(t, cdList.Items, "Expected 0 ComputeDomains when feature is disabled, got %d", len(cdList.Items))
56+
}
5057

5158
// Verify PCSGs do not get auto-mnnvl annotation
5259
pcsgNames := []string{

0 commit comments

Comments
 (0)