Skip to content

Commit 5a36337

Browse files
committed
test(tests/containers): add test for ROCm accelerator images (#1414)
This commit introduces tests for ROCm-enabled workbench images on OpenShift. These tests verify that the images can be deployed successfully on a cluster with AMD GPUs and that both PyTorch and TensorFlow can correctly detect the available accelerator. To support the testing of large accelerator images, the following changes were made:

- The pod readiness timeout in the test framework has been increased to 10 minutes to allow sufficient time for image pulling.
- The `ImageDeployment` utility was updated to allow for configurable timeouts.
- Existing CUDA tests were updated to use this new configurable timeout.
1 parent b81a96b commit 5a36337

File tree

3 files changed

+109
-43
lines changed

3 files changed

+109
-43
lines changed

tests/containers/conftest.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,12 @@ def skip_if_not_workbench_image(image: str) -> Image:
9595
def skip_if_not_cuda_image(image: str) -> Image:
9696
image_metadata = get_image_metadata(image)
9797

98-
if "-cuda-" not in image_metadata.labels["name"]:
99-
pytest.skip(f"Image {image} does not have any of '-cuda-' in {image_metadata.labels['name']=}")
98+
if "-rocm-" in image_metadata.labels["name"]:
99+
pytest.skip(f"Image {image} does have '-rocm-' in {image_metadata.labels['name']=}")
100+
101+
cuda_label_fragments = ("-cuda-", "-pytorch-", "-tensorflow-")
102+
if not any(ide in image_metadata.labels["name"] for ide in cuda_label_fragments):
103+
pytest.skip(f"Image {image} does not have any of '{cuda_label_fragments=}' in {image_metadata.labels['name']=}")
100104

101105
return image_metadata
102106

@@ -139,15 +143,15 @@ def workbench_image(image: str):
139143

140144

141145
@pytest.fixture(scope="function")
142-
def cuda_workbench_image(workbench_image: str):
143-
skip_if_not_cuda_image(workbench_image)
144-
yield workbench_image
146+
def cuda_image(image: str):
147+
skip_if_not_cuda_image(image)
148+
yield image
145149

146150

147151
@pytest.fixture(scope="function")
148-
def rocm_workbench_image(workbench_image: str):
149-
skip_if_not_rocm_image(workbench_image)
150-
yield workbench_image
152+
def rocm_image(image: str):
153+
skip_if_not_rocm_image(image)
154+
yield image
151155

152156

153157
@pytest.fixture(scope="function")

tests/containers/kubernetes_utils.py

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ class TestFrameConstants:
3737
GLOBAL_POLL_INTERVAL_MEDIUM = 10
3838
TIMEOUT_2MIN = 2 * 60
3939
TIMEOUT_5MIN = 5 * 60
40+
TIMEOUT_20MIN = 20 * 60
41+
42+
# this includes potentially pulling the image, and cuda images are huge
43+
READINESS_TIMEOUT = TIMEOUT_5MIN
4044

4145

4246
logging.basicConfig(level=logging.DEBUG)
@@ -133,10 +137,14 @@ def __enter__(self) -> Self:
133137
return self
134138

135139
def __exit__(self, exc_type, exc_val, exc_tb):
136-
self.tf.destroy()
140+
self.tf.destroy(wait=True)
137141

138142
def deploy(
139-
self, container_name: str, accelerator: Literal["amd.com/gpu", "nvidia.com/gpu"] | None = None
143+
self,
144+
container_name: str,
145+
accelerator: Literal["amd.com/gpu", "nvidia.com/gpu"] | None = None,
146+
is_runtime_image: bool = False,
147+
timeout: float = TestFrameConstants.READINESS_TIMEOUT,
140148
) -> kubernetes.client.models.v1_pod.V1Pod:
141149
LOGGER.debug(f"Deploying {self.image}")
142150
# custom namespace is necessary, because we cannot assign a SCC to pods created in one of the default namespaces:
@@ -188,7 +196,15 @@ def deploy(
188196
{
189197
"name": container_name,
190198
"image": self.image,
191-
# "command": ["/bin/sh", "-c", "while true ; do date; sleep 5; done;"],
199+
# "command": ["/bin/sh", "-c", "while true; do date; sleep 5; done;"],
200+
**(
201+
{
202+
"command": ["/bin/sh"],
203+
"args": ["-c", "sleep infinity"],
204+
}
205+
if is_runtime_image
206+
else {}
207+
),
192208
"ports": [
193209
{
194210
"containerPort": 8888,
@@ -229,7 +245,11 @@ def deploy(
229245
self.tf.defer_resource(deployment)
230246
LOGGER.debug("Waiting for pods to become ready...")
231247
PodUtils.wait_for_pods_ready(
232-
self.client, namespace_name=ns.name, label_selector=f"app={container_name}", expect_pods_count=1
248+
self.client,
249+
namespace_name=ns.name,
250+
label_selector=f"app={container_name}",
251+
expect_pods_count=1,
252+
timeout=timeout,
233253
)
234254

235255
core_v1_api = kubernetes.client.api.core_v1_api.CoreV1Api(api_client=self.client.client)
@@ -239,21 +259,22 @@ def deploy(
239259
assert len(pod_name.items) == 1
240260
self.pod: kubernetes.client.models.v1_pod.V1Pod = pod_name.items[0]
241261

242-
p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, self.pod), "localhost", 0)
243-
t = threading.Thread(target=p.listen_and_serve_until_canceled)
244-
t.start()
245-
self.tf.defer(t, lambda thread: thread.join())
246-
self.tf.defer(p.cancellation_token, lambda token: token.cancel())
247-
248-
self.port = p.get_actual_port()
249-
LOGGER.debug(f"Listening on port {self.port}")
250-
Wait.until(
251-
"Connecting to pod succeeds",
252-
1,
253-
30,
254-
lambda: requests.get(f"http://localhost:{self.port}").status_code == 200,
255-
)
256-
LOGGER.debug("Done setting up portforward")
262+
if not is_runtime_image:
263+
p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, self.pod), "localhost", 0)
264+
t = threading.Thread(target=p.listen_and_serve_until_canceled)
265+
t.start()
266+
self.tf.defer(t, lambda thread: thread.join())
267+
self.tf.defer(p.cancellation_token, lambda token: token.cancel())
268+
269+
self.port = p.get_actual_port()
270+
LOGGER.debug(f"Listening on port {self.port}")
271+
Wait.until(
272+
"Connecting to pod succeeds",
273+
1,
274+
30,
275+
lambda: requests.get(f"http://localhost:{self.port}").status_code == 200,
276+
)
277+
LOGGER.debug("Done setting up portforward")
257278

258279
return self.pod
259280

@@ -300,20 +321,16 @@ def exec(self, command: str) -> subprocess.CompletedProcess:
300321

301322

302323
class PodUtils:
303-
# this includes potentially pulling the image, and cuda images are huge
304-
READINESS_TIMEOUT = TestFrameConstants.TIMEOUT_5MIN
305-
306324
# consider using timeout_sampler
307325
@staticmethod
308326
def wait_for_pods_ready(
309-
client: DynamicClient, namespace_name: str, label_selector: str, expect_pods_count: int
327+
client: DynamicClient,
328+
namespace_name: str,
329+
label_selector: str,
330+
expect_pods_count: int,
331+
timeout: float = TestFrameConstants.READINESS_TIMEOUT,
310332
) -> None:
311-
"""Wait for all pods in namespace to be ready
312-
:param client:
313-
:param namespace_name: name of the namespace
314-
:param label_selector:
315-
:param expect_pods_count:
316-
"""
333+
"""Wait for all pods in namespace to be ready"""
317334

318335
# it's a dynamic client with the `resource` parameter already filled in
319336
class ResourceType(kubernetes.dynamic.Resource, kubernetes.dynamic.DynamicClient):
@@ -359,7 +376,7 @@ def ready() -> bool:
359376
Wait.until(
360377
description=f"readiness of all Pods matching {label_selector} in Namespace {namespace_name}",
361378
poll_interval=TestFrameConstants.GLOBAL_POLL_INTERVAL_MEDIUM,
362-
timeout=PodUtils.READINESS_TIMEOUT,
379+
timeout=timeout,
363380
ready=ready,
364381
)
365382

tests/containers/workbenches/accelerator_image_test.py

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66

77
from tests.containers import conftest, kubernetes_utils
8+
from tests.containers.kubernetes_utils import TestFrameConstants
89

910
code = """
1011
import torch; device = "cuda" if torch.cuda.is_available() else "cpu"; print(f"Using {device} device")
@@ -16,12 +17,12 @@
1617
class TestAccelerator:
1718
@pytest.mark.cuda
1819
@pytest.mark.openshift
19-
# image must be both jupyterlab image and cuda workbench image
20-
def test_cuda_run_on_openshift(self, jupyterlab_image, cuda_workbench_image):
20+
# image must be both a datascience image and cuda image
21+
def test_cuda_run_on_openshift(self, datascience_image, cuda_image):
2122
client = kubernetes_utils.get_client()
2223
print(client)
2324

24-
image_metadata = conftest.get_image_metadata(cuda_workbench_image)
25+
image_metadata = conftest.get_image_metadata(cuda_image)
2526
library = None
2627
if "-pytorch-" in image_metadata.labels.get("name"):
2728
library = "torch"
@@ -35,8 +36,52 @@ def test_cuda_run_on_openshift(self, jupyterlab_image, cuda_workbench_image):
3536
# language=python
3637
tensorflow_check = """import tensorflow as tf; import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; print(tf.config.list_physical_devices('GPU'))"""
3738

38-
with kubernetes_utils.ImageDeployment(client, cuda_workbench_image) as image:
39-
image.deploy(container_name="notebook-tests-pod", accelerator="nvidia.com/gpu")
39+
with kubernetes_utils.ImageDeployment(client, cuda_image) as image:
40+
image.deploy(
41+
container_name="notebook-tests-pod",
42+
accelerator="nvidia.com/gpu",
43+
is_runtime_image="-runtime-" in image_metadata.labels.get("name"),
44+
timeout=TestFrameConstants.TIMEOUT_20MIN,
45+
)
46+
if library == "torch":
47+
result = image.exec(shlex.join(["python", "-c", torch_check]))
48+
assert "Using cuda device" in result.stdout
49+
elif library == "tensorflow":
50+
result = image.exec(shlex.join(["python", "-c", tensorflow_check]))
51+
assert "[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]" in result.stdout
52+
else:
53+
raise ValueError(f"Unknown library {library}")
54+
55+
@pytest.mark.rocm
56+
@pytest.mark.openshift
57+
# image must be both a datascience image and rocm image
58+
def test_rocm_run_on_openshift(self, datascience_image, rocm_image):
59+
client = kubernetes_utils.get_client()
60+
print(client)
61+
62+
image_metadata = conftest.get_image_metadata(rocm_image)
63+
library = None
64+
if "-pytorch-" in image_metadata.labels.get("name"):
65+
library = "torch"
66+
if "-tensorflow-" in image_metadata.labels.get("name"):
67+
library = "tensorflow"
68+
69+
# NOTE: the basic check is exactly the same as for cuda; in torch, even though it says "cuda", it is actually ROCm
70+
71+
# language=python
72+
torch_check = (
73+
"""import torch; device = "cuda" if torch.cuda.is_available() else "cpu"; print(f"Using {device} device")"""
74+
)
75+
# language=python
76+
tensorflow_check = """import tensorflow as tf; import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; print(tf.config.list_physical_devices('GPU'))"""
77+
78+
with kubernetes_utils.ImageDeployment(client, rocm_image) as image:
79+
image.deploy(
80+
container_name="notebook-tests-pod",
81+
accelerator="amd.com/gpu",
82+
is_runtime_image="-runtime-" in image_metadata.labels.get("name"),
83+
timeout=TestFrameConstants.TIMEOUT_20MIN,
84+
)
4085
if library == "torch":
4186
result = image.exec(shlex.join(["python", "-c", torch_check]))
4287
assert "Using cuda device" in result.stdout

0 commit comments

Comments
 (0)