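Fix auto-MNNVL e2e: add CI test pattern, pin k3s image for the MNNVL target, install Kai/topology, and wait for the operator restart after cert refresh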

shmuel-runai · shmuel-runai · commit 3c64bd10a925 · 2026-02-15T13:38:03.000+02:00
diff --git a/.github/workflows/build-check-test.yaml b/.github/workflows/build-check-test.yaml
@@ -115,6 +115,7 @@ jobs:
           - test_name: cert_management
             test_pattern: "^Test_CM"
           - test_name: auto_mnnvl
+            test_pattern: "^Test_AutoMNNVL"
             make_target: "run-e2e-mnnvl-full"
     name: E2E - ${{ matrix.test_name }}
     steps:
diff --git a/operator/Makefile b/operator/Makefile
@@ -141,7 +141,8 @@ run-e2e-full: e2e-cluster-up
 # Usage: make run-e2e-mnnvl-full
 .PHONY: run-e2e-mnnvl-full
 run-e2e-mnnvl-full: export E2E_WORKER_NODES = 2
-run-e2e-mnnvl-full: E2E_CREATE_FLAGS = --skip-kai --skip-topology --skip-prepull
+run-e2e-mnnvl-full: export E2E_K3S_IMAGE = rancher/k3s:v1.34.2-k3s1
+run-e2e-mnnvl-full: E2E_CREATE_FLAGS = --skip-prepull
 run-e2e-mnnvl-full: e2e-cluster-up
 	@echo "> Pushing alpine image to local registry..."
 	@docker pull alpine:latest || true
diff --git a/operator/hack/e2e-autoMNNVL/README.md b/operator/hack/e2e-autoMNNVL/README.md
@@ -47,8 +47,8 @@ make run-e2e-mnnvl-full
 
 # Or, manage cluster manually and run tests separately:
 
-# 1. Create the MNNVL cluster (lightweight: 2 workers, no Kai/topology)
-E2E_WORKER_NODES=2 make e2e-cluster-up E2E_CREATE_FLAGS="--skip-kai --skip-topology --skip-prepull"
+# 1. Create the MNNVL cluster (lightweight: 2 workers; only the image prepull is skipped)
+E2E_WORKER_NODES=2 E2E_K3S_IMAGE=rancher/k3s:v1.34.2-k3s1 make e2e-cluster-up E2E_CREATE_FLAGS="--skip-prepull"
 
 # 2. Push alpine image for test workloads
 docker pull alpine:latest && docker tag alpine:latest localhost:5001/alpine:latest && docker push localhost:5001/alpine:latest
@@ -80,5 +80,5 @@ make e2e-cluster-down
 - **Cluster name:** `shared-e2e-test-cluster` (same as standard e2e)
 - **Nodes:** 1 server + 2 agents (lightweight — standard e2e uses 30)
 - **Registry:** local registry on port 5001
-- **Skaffold profile:** `topology-test` (same as standard e2e)
+- **Skaffold profile:** `topology-test` (same as standard e2e; Kai and topology are installed — only the worker count is reduced and the image prepull is skipped)
 - **Fake GPU:** [fake-gpu-operator](https://github.com/run-ai/fake-gpu-operator) v0.0.72 (provides ComputeDomain CRD)
diff --git a/operator/hack/e2e-autoMNNVL/run_autoMNNVL_e2e_all.py b/operator/hack/e2e-autoMNNVL/run_autoMNNVL_e2e_all.py
@@ -103,13 +103,13 @@ class ConfigEntry:
 # (preflight failure) in this invalid configuration; the e2e test itself
 # validates the expected failure behaviour.
 CONFIGS: list[ConfigEntry] = [
-    ConfigEntry("Config4_UnsupportedAndDisabled",
+    ConfigEntry("Config1_UnsupportedAndDisabled",
                 ["--fake-gpu=no", "--auto-mnnvl=disabled"]),
-    ConfigEntry("Config3_UnsupportedButEnabled",
+    ConfigEntry("Config2_UnsupportedButEnabled",
                 ["--fake-gpu=no", "--auto-mnnvl=enabled", "--skip-operator-wait"]),
-    ConfigEntry("Config1_SupportedAndEnabled",
+    ConfigEntry("Config3_SupportedAndEnabled",
                 ["--fake-gpu=yes", "--auto-mnnvl=enabled"]),
-    ConfigEntry("Config2_SupportedButDisabled",
+    ConfigEntry("Config4_SupportedButDisabled",
                 ["--fake-gpu=yes", "--auto-mnnvl=disabled"]),
 ]
 
diff --git a/operator/hack/e2e-cluster/config-cluster.py b/operator/hack/e2e-cluster/config-cluster.py
@@ -216,6 +216,21 @@ def ensure_auto_mnnvl(enabled: bool, *, skip_wait: bool = False) -> None:
             f" -n {GROVE_NAMESPACE} --timeout=120s",
             check=False,
         )
+        # Cert rotation may make the operator process exit to trigger a restart; pause, then re-check readiness.
+        log_info("Allowing time for operator cert refresh restart cycle...")
+        time.sleep(20)
+        log_info("Waiting for Grove operator pod to be ready after restart...")
+        run(
+            f"kubectl rollout status deployment/{GROVE_RELEASE}"
+            f" -n {GROVE_NAMESPACE} --timeout=120s",
+            check=False,
+        )
+        run(
+            f"kubectl wait --for=condition=Ready pods"
+            f" -l app.kubernetes.io/name=grove-operator"
+            f" -n {GROVE_NAMESPACE} --timeout=120s",
+            check=False,
+        )
     else:
         log_warning("Skipping operator readiness check (--skip-operator-wait)")
         time.sleep(5)
diff --git a/operator/hack/e2e-cluster/create-e2e-cluster.py b/operator/hack/e2e-cluster/create-e2e-cluster.py
@@ -123,7 +123,7 @@ class ClusterConfig(BaseSettings):
     lb_port: str = "8090:80"
     worker_nodes: int = Field(default=30, ge=1, le=100)
     worker_memory: str = Field(default="150m", pattern=r"^\d+[mMgG]?$")
-    k3s_image: str = "rancher/k3s:v1.34.2-k3s1"
+    k3s_image: str = "rancher/k3s:v1.33.5-k3s1"
     kai_version: str = Field(default=DEPENDENCIES['kai_scheduler']['version'], pattern=r"^v[\d.]+(-[\w.]+)?$")
     skaffold_profile: str = "topology-test"
     max_retries: int = Field(default=3, ge=1, le=10)