Skip to content

Commit 3c64bd1

Browse files
committed
fix
1 parent cac4dde commit 3c64bd1

File tree

6 files changed: 26 additions & 9 deletions

.github/workflows/build-check-test.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ jobs:
115115
- test_name: cert_management
116116
test_pattern: "^Test_CM"
117117
- test_name: auto_mnnvl
118+
test_pattern: "^Test_AutoMNNVL"
118119
make_target: "run-e2e-mnnvl-full"
119120
name: E2E - ${{ matrix.test_name }}
120121
steps:

operator/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,8 @@ run-e2e-full: e2e-cluster-up
141141
# Usage: make run-e2e-mnnvl-full
142142
.PHONY: run-e2e-mnnvl-full
143143
run-e2e-mnnvl-full: export E2E_WORKER_NODES = 2
144-
run-e2e-mnnvl-full: E2E_CREATE_FLAGS = --skip-kai --skip-topology --skip-prepull
144+
run-e2e-mnnvl-full: export E2E_K3S_IMAGE = rancher/k3s:v1.34.2-k3s1
145+
run-e2e-mnnvl-full: E2E_CREATE_FLAGS = --skip-prepull
145146
run-e2e-mnnvl-full: e2e-cluster-up
146147
@echo "> Pushing alpine image to local registry..."
147148
@docker pull alpine:latest || true

operator/hack/e2e-autoMNNVL/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ make run-e2e-mnnvl-full
4747

4848
# Or, manage cluster manually and run tests separately:
4949

50-
# 1. Create the MNNVL cluster (lightweight: 2 workers, no Kai/topology)
51-
E2E_WORKER_NODES=2 make e2e-cluster-up E2E_CREATE_FLAGS="--skip-kai --skip-topology --skip-prepull"
50+
# 1. Create the MNNVL cluster (lightweight: 2 workers, skip image prepull only)
51+
E2E_WORKER_NODES=2 E2E_K3S_IMAGE=rancher/k3s:v1.34.2-k3s1 make e2e-cluster-up E2E_CREATE_FLAGS="--skip-prepull"
5252

5353
# 2. Push alpine image for test workloads
5454
docker pull alpine:latest && docker tag alpine:latest localhost:5001/alpine:latest && docker push localhost:5001/alpine:latest
@@ -80,5 +80,5 @@ make e2e-cluster-down
8080
- **Cluster name:** `shared-e2e-test-cluster` (same as standard e2e)
8181
- **Nodes:** 1 server + 2 agents (lightweight — standard e2e uses 30)
8282
- **Registry:** local registry on port 5001
83-
- **Skaffold profile:** `topology-test` (same as standard e2e)
83+
- **Skaffold profile:** `topology-test` (same as standard e2e; Kai and topology are installed, only worker count and prepull are reduced)
8484
- **Fake GPU:** [fake-gpu-operator](https://github.com/run-ai/fake-gpu-operator) v0.0.72 (provides ComputeDomain CRD)

operator/hack/e2e-autoMNNVL/run_autoMNNVL_e2e_all.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,13 @@ class ConfigEntry:
103103
# (preflight failure) in this invalid configuration; the e2e test itself
104104
# validates the expected failure behaviour.
105105
CONFIGS: list[ConfigEntry] = [
106-
ConfigEntry("Config4_UnsupportedAndDisabled",
106+
ConfigEntry("Config1_UnsupportedAndDisabled",
107107
["--fake-gpu=no", "--auto-mnnvl=disabled"]),
108-
ConfigEntry("Config3_UnsupportedButEnabled",
108+
ConfigEntry("Config2_UnsupportedButEnabled",
109109
["--fake-gpu=no", "--auto-mnnvl=enabled", "--skip-operator-wait"]),
110-
ConfigEntry("Config1_SupportedAndEnabled",
110+
ConfigEntry("Config3_SupportedAndEnabled",
111111
["--fake-gpu=yes", "--auto-mnnvl=enabled"]),
112-
ConfigEntry("Config2_SupportedButDisabled",
112+
ConfigEntry("Config4_SupportedButDisabled",
113113
["--fake-gpu=yes", "--auto-mnnvl=disabled"]),
114114
]
115115

operator/hack/e2e-cluster/config-cluster.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,21 @@ def ensure_auto_mnnvl(enabled: bool, *, skip_wait: bool = False) -> None:
216216
f" -n {GROVE_NAMESPACE} --timeout=120s",
217217
check=False,
218218
)
219+
# Cert-rotation may exit the process to trigger a restart; wait for that cycle.
220+
log_info("Allowing time for operator cert refresh restart cycle...")
221+
time.sleep(20)
222+
log_info("Waiting for Grove operator pod to be ready after restart...")
223+
run(
224+
f"kubectl rollout status deployment/{GROVE_RELEASE}"
225+
f" -n {GROVE_NAMESPACE} --timeout=120s",
226+
check=False,
227+
)
228+
run(
229+
f"kubectl wait --for=condition=Ready pods"
230+
f" -l app.kubernetes.io/name=grove-operator"
231+
f" -n {GROVE_NAMESPACE} --timeout=120s",
232+
check=False,
233+
)
219234
else:
220235
log_warning("Skipping operator readiness check (--skip-operator-wait)")
221236
time.sleep(5)

operator/hack/e2e-cluster/create-e2e-cluster.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ class ClusterConfig(BaseSettings):
123123
lb_port: str = "8090:80"
124124
worker_nodes: int = Field(default=30, ge=1, le=100)
125125
worker_memory: str = Field(default="150m", pattern=r"^\d+[mMgG]?$")
126-
k3s_image: str = "rancher/k3s:v1.34.2-k3s1"
126+
k3s_image: str = "rancher/k3s:v1.33.5-k3s1"
127127
kai_version: str = Field(default=DEPENDENCIES['kai_scheduler']['version'], pattern=r"^v[\d.]+(-[\w.]+)?$")
128128
skaffold_profile: str = "topology-test"
129129
max_retries: int = Field(default=3, ge=1, le=10)

0 commit comments

Comments
 (0)