Skip to content

Commit 052b018

Browse files
authored
fix: failing e2e and gpu e2e tests (kubeflow#3234)
Signed-off-by: Akash Jaiswal <akashjaiswal3846@gmail.com>
1 parent 6208a95 commit 052b018

File tree

2 files changed

+27
-25
lines changed

2 files changed

+27
-25
lines changed

hack/e2e-setup-cluster.sh

Lines changed: 17 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -92,21 +92,22 @@ kubectl apply --server-side -k manifests/overlays/runtimes || (
92 92
exit 1
93 93
)
94 94

95-
# TODO (andreyvelich): We should build runtime images before adding them.
96-
TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
97-
DEEPSPEED_RUNTIME_IMAGE=ghcr.io/kubeflow/trainer/deepspeed-runtime:latest
98-
JAX_RUNTIME_IMAGE=nvcr.io/nvidia/jax:25.10-py3
99-
100-
# Load Torch runtime image in KinD
101-
${CONTAINER_RUNTIME} pull ${TORCH_RUNTIME_IMAGE}
102-
load_image_to_kind ${TORCH_RUNTIME_IMAGE}
103-
104-
# Load DeepSpeed runtime image in KinD
105-
${CONTAINER_RUNTIME} pull ${DEEPSPEED_RUNTIME_IMAGE}
106-
load_image_to_kind ${DEEPSPEED_RUNTIME_IMAGE}
107-
108-
# Load JAX runtime image in KinD
109-
${CONTAINER_RUNTIME} pull ${JAX_RUNTIME_IMAGE}
110-
load_image_to_kind ${JAX_RUNTIME_IMAGE}
95+
# hotfix(jaiakash) - skip pre-load due to kind failure
96+
# # TODO (andreyvelich): We should build runtime images before adding them.
97+
# TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
98+
# DEEPSPEED_RUNTIME_IMAGE=ghcr.io/kubeflow/trainer/deepspeed-runtime:latest
99+
# JAX_RUNTIME_IMAGE=nvcr.io/nvidia/jax:25.10-py3
100+
101+
# # Load Torch runtime image in KinD
102+
# ${CONTAINER_RUNTIME} pull ${TORCH_RUNTIME_IMAGE}
103+
# load_image_to_kind ${TORCH_RUNTIME_IMAGE}
104+
105+
# # Load DeepSpeed runtime image in KinD
106+
# ${CONTAINER_RUNTIME} pull ${DEEPSPEED_RUNTIME_IMAGE}
107+
# load_image_to_kind ${DEEPSPEED_RUNTIME_IMAGE}
108+
109+
# # Load JAX runtime image in KinD
110+
# ${CONTAINER_RUNTIME} pull ${JAX_RUNTIME_IMAGE}
111+
# load_image_to_kind ${JAX_RUNTIME_IMAGE}
111 112

112 113
print_cluster_info

hack/e2e-setup-gpu-cluster.sh

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -180,14 +180,15 @@ kubectl get clustertrainingruntimes -o json | jq '
180 180
.items[].spec.template.spec.replicatedJobs[].template.spec.template.spec.runtimeClassName = "nvidia"
181 181
' | kubectl apply -f -
182 182

183-
# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
184-
TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
185-
${CONTAINER_RUNTIME} pull ${TORCH_RUNTIME_IMAGE}
186-
load_image_to_kind ${TORCH_RUNTIME_IMAGE} ${GPU_CLUSTER_NAME}
187-
188-
# Pre-pull NVIDIA JAX image for JAX runtime.
189-
JAX_RUNTIME_IMAGE=nvcr.io/nvidia/jax:25.10-py3
190-
${CONTAINER_RUNTIME} pull ${JAX_RUNTIME_IMAGE}
191-
load_image_to_kind ${JAX_RUNTIME_IMAGE} ${GPU_CLUSTER_NAME}
183+
# hotfix(jaiakash) - skip pre-load due to kind failure
184+
# # TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
185+
# TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
186+
# ${CONTAINER_RUNTIME} pull ${TORCH_RUNTIME_IMAGE}
187+
# load_image_to_kind ${TORCH_RUNTIME_IMAGE} ${GPU_CLUSTER_NAME}
188+
189+
# # Pre-pull NVIDIA JAX image for JAX runtime.
190+
# JAX_RUNTIME_IMAGE=nvcr.io/nvidia/jax:25.10-py3
191+
# ${CONTAINER_RUNTIME} pull ${JAX_RUNTIME_IMAGE}
192+
# load_image_to_kind ${JAX_RUNTIME_IMAGE} ${GPU_CLUSTER_NAME}
192 193

193 194
print_cluster_info

0 commit comments

Comments (0)