Skip to content

Commit 0efceb2

Browse files
authored
NO-JIRA: tests(containers): Add GPU support and pod exec functionality in tests (#1041)
Enhance `ImageDeployment` to support GPU accelerators (AMD/NVIDIA) during pod deployment and introduce a method to execute commands inside pods. Additionally, add tests for ensuring CUDA compatibility on OpenShift using a PyTorch notebook image.
1 parent c8b73af commit 0efceb2

File tree

6 files changed

+163
-11
lines changed

6 files changed

+163
-11
lines changed

.github/workflows/build-notebooks-TEMPLATE.yaml

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -335,7 +335,7 @@ jobs:
335335
- name: Run Testcontainers container tests (in PyTest)
336336
run: |
337337
set -Eeuxo pipefail
338-
uv run pytest --capture=fd tests/containers -m 'not openshift' --image="${{ steps.calculated_vars.outputs.OUTPUT_IMAGE }}"
338+
uv run pytest --capture=fd tests/containers -m 'not openshift and not cuda and not rocm' --image="${{ steps.calculated_vars.outputs.OUTPUT_IMAGE }}"
339339
env:
340340
DOCKER_HOST: "unix:///var/run/podman/podman.sock"
341341
TESTCONTAINERS_DOCKER_SOCKET_OVERRIDE: "/var/run/podman/podman.sock"
@@ -508,7 +508,7 @@ jobs:
508508
if: ${{ steps.have-tests.outputs.tests == 'true' }}
509509
run: |
510510
set -Eeuxo pipefail
511-
uv run pytest --capture=fd tests/containers -m 'openshift' --image="${{ steps.calculated_vars.outputs.OUTPUT_IMAGE }}"
511+
uv run pytest --capture=fd tests/containers -m 'openshift and not cuda and not rocm' --image="${{ steps.calculated_vars.outputs.OUTPUT_IMAGE }}"
512512
env:
513513
# TODO(jdanek): this Testcontainers stuff should not be necessary but currently it has to be there
514514
DOCKER_HOST: "unix:///var/run/podman/podman.sock"

README.md

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -103,7 +103,7 @@ sudo dnf install podman
103103
systemctl --user start podman.service
104104
systemctl --user status podman.service
105105
systemctl --user status podman.socket
106-
DOCKER_HOST=unix:///run/user/$UID/podman/podman.sock uv run pytest tests/containers -m 'not openshift' --image quay.io/opendatahub/workbench-images@sha256:e98d19df346e7abb1fa3053f6d41f0d1fa9bab39e49b4cb90b510ca33452c2e4
106+
DOCKER_HOST=unix:///run/user/$UID/podman/podman.sock uv run pytest tests/containers -m 'not openshift and not cuda and not rocm' --image quay.io/opendatahub/workbench-images@sha256:e98d19df346e7abb1fa3053f6d41f0d1fa9bab39e49b4cb90b510ca33452c2e4
107107
108108
# Mac OS
109109
brew install podman

pytest.ini

Lines changed: 5 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -16,4 +16,8 @@ log_cli_level = INFO
1616
log_file = logs/pytest-logs.txt
1717
log_file_level = DEBUG
1818

19-
markers = openshift
19+
# https://docs.pytest.org/en/stable/example/markers.html#registering-markers
20+
markers =
21+
openshift: requires openshift to run,
22+
cuda: requires cuda to run,
23+
rocm: requires rocm to run,

tests/containers/conftest.py

Lines changed: 38 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -49,14 +49,22 @@ def pytest_generate_tests(metafunc: Metafunc) -> None:
4949
metafunc.parametrize(image.__name__, metafunc.config.getoption("--image"))
5050

5151

52-
def skip_if_not_workbench_image(image: str) -> docker.models.images.Image:
52+
def get_image_metadata(image: str) -> docker.models.images.Image:
5353
client = testcontainers.core.container.DockerClient()
5454
try:
5555
image_metadata = client.client.images.get(image)
5656
except docker.errors.ImageNotFound:
57+
# todo(jdanek): this means that even when image is to be run remotely (on openshift),
58+
# it has to be pulled locally first so that we can check its metadata
5759
image_metadata = client.client.images.pull(image)
5860
assert isinstance(image_metadata, docker.models.images.Image)
5961

62+
return image_metadata
63+
64+
65+
def skip_if_not_workbench_image(image: str) -> docker.models.images.Image:
66+
image_metadata = get_image_metadata(image)
67+
6068
ide_server_label_fragments = ("-code-server-", "-jupyter-", "-rstudio-")
6169
if not any(ide in image_metadata.labels["name"] for ide in ide_server_label_fragments):
6270
pytest.skip(
@@ -66,6 +74,23 @@ def skip_if_not_workbench_image(image: str) -> docker.models.images.Image:
6674
return image_metadata
6775

6876

77+
def skip_if_not_cuda_image(image: str) -> docker.models.images.Image:
78+
image_metadata = get_image_metadata(image)
79+
80+
if "-cuda-" not in image_metadata.labels["name"]:
81+
pytest.skip(f"Image {image} does not have any of '-cuda-' in {image_metadata.labels['name']=}")
82+
83+
return image_metadata
84+
85+
86+
def skip_if_not_rocm_image(image: str) -> docker.models.images.Image:
87+
image_metadata = get_image_metadata(image)
88+
if "-rocm-" not in image_metadata.labels["name"]:
89+
pytest.skip(f"Image {image} does not have any of '-rocm-' in {image_metadata.labels['name']=}")
90+
91+
return image_metadata
92+
93+
6994
# https://docs.pytest.org/en/stable/how-to/fixtures.html#parametrizing-fixtures
7095
# indirect parametrization https://stackoverflow.com/questions/18011902/how-to-pass-a-parameter-to-a-fixture-function-in-pytest
7196
@pytest.fixture(scope="session")
@@ -79,6 +104,18 @@ def workbench_image(image: str):
79104
yield image
80105

81106

107+
@pytest.fixture(scope="function")
108+
def cuda_workbench_image(workbench_image: str):
109+
skip_if_not_cuda_image(workbench_image)
110+
yield workbench_image
111+
112+
113+
@pytest.fixture(scope="function")
114+
def rocm_workbench_image(workbench_image: str):
115+
skip_if_not_rocm_image(workbench_image)
116+
yield workbench_image
117+
118+
82119
@pytest.fixture(scope="function")
83120
def jupyterlab_image(image: str) -> docker.models.images.Image:
84121
image_metadata = skip_if_not_workbench_image(image)

tests/containers/kubernetes_utils.py

Lines changed: 70 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -4,10 +4,11 @@
44
import functools
55
import logging
66
import socket
7+
import subprocess
78
import threading
89
import time
910
import traceback
10-
from typing import TYPE_CHECKING, Any, Self
11+
from typing import TYPE_CHECKING, Any, Literal, Self
1112

1213
import kubernetes
1314
import kubernetes.client.api.core_v1_api
@@ -35,6 +36,7 @@
3536
class TestFrameConstants:
3637
GLOBAL_POLL_INTERVAL_MEDIUM = 10
3738
TIMEOUT_2MIN = 2 * 60
39+
TIMEOUT_5MIN = 5 * 60
3840

3941

4042
logging.basicConfig(level=logging.DEBUG)
@@ -120,6 +122,8 @@ class ImageDeployment:
120122
def __init__(self, client: kubernetes.dynamic.DynamicClient, image: str):
121123
self.client = client
122124
self.image = image
125+
self.pod = None
126+
self.port = None
123127
self.tf = TestFrame()
124128

125129
def __enter__(self) -> Self:
@@ -128,7 +132,9 @@ def __enter__(self) -> Self:
128132
def __exit__(self, exc_type, exc_val, exc_tb):
129133
self.tf.destroy()
130134

131-
def deploy(self, container_name: str) -> None:
135+
def deploy(
136+
self, container_name: str, accelerator: Literal["amd.com/gpu", "nvidia.com/gpu"] | None = None
137+
) -> kubernetes.client.models.v1_pod.V1Pod:
132138
LOGGER.debug(f"Deploying {self.image}")
133139
# custom namespace is necessary, because we cannot assign a SCC to pods created in one of the default namespaces:
134140
# default, kube-system, kube-public, openshift-node, openshift-infra, openshift.
@@ -187,6 +193,20 @@ def deploy(self, container_name: str) -> None:
187193
"protocol": "TCP",
188194
}
189195
],
196+
**(
197+
{
198+
"resources": {
199+
"limits": {
200+
accelerator: "1",
201+
},
202+
"requests": {
203+
accelerator: "1",
204+
},
205+
}
206+
}
207+
if accelerator
208+
else {}
209+
),
190210
# rstudio will not start without its volume mount and it does not log the error for it
191211
# See the testcontainers implementation of this (the tty=True part)
192212
"volumeMounts": [{"mountPath": "/opt/app-root/src", "name": "my-workbench"}],
@@ -214,9 +234,9 @@ def deploy(self, container_name: str) -> None:
214234
namespace=ns.name, label_selector=f"app={container_name}"
215235
)
216236
assert len(pod_name.items) == 1
217-
pod: kubernetes.client.models.v1_pod.V1Pod = pod_name.items[0]
237+
self.pod: kubernetes.client.models.v1_pod.V1Pod = pod_name.items[0]
218238

219-
p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, pod), "localhost", 0)
239+
p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, self.pod), "localhost", 0)
220240
t = threading.Thread(target=p.listen_and_serve_until_canceled)
221241
t.start()
222242
self.tf.defer(t, lambda thread: thread.join())
@@ -230,11 +250,55 @@ def deploy(self, container_name: str) -> None:
230250
30,
231251
lambda: requests.get(f"http://localhost:{self.port}").status_code == 200,
232252
)
233-
LOGGER.debug("Done with portforward")
253+
LOGGER.debug("Done setting up portforward")
254+
255+
return self.pod
256+
257+
# https://github.com/kubernetes-client/python/blob/master/examples/pod_exec.py
258+
def exec(self, command: str) -> subprocess.CompletedProcess:
259+
if not self.pod or not self.pod.metadata or not self.pod.metadata.name or not self.pod.metadata.namespace:
260+
raise RuntimeError("Pod information is not available for exec operation.")
261+
LOGGER.debug(f"Executing command {command}")
262+
core_v1_api = kubernetes.client.api.core_v1_api.CoreV1Api(api_client=self.client.client)
263+
264+
# Uses kubernetes.stream.stream to handle WebSocket upgrade.
265+
# E HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"Upgrade request required","reason":"BadRequest","code":400}
266+
try:
267+
resp: kubernetes.stream.ws_client.WSClient = kubernetes.stream.stream(
268+
core_v1_api.connect_get_namespaced_pod_exec,
269+
self.pod.metadata.name,
270+
self.pod.metadata.namespace,
271+
command=["/bin/sh", "-c", command],
272+
stderr=True,
273+
stdin=False,
274+
stdout=True,
275+
tty=False,
276+
# _preload_content=False is important for streaming
277+
_preload_content=False,
278+
)
279+
except Exception as e:
280+
LOGGER.error(f"Error during pod exec: {e}")
281+
LOGGER.error(traceback.format_exc())
282+
raise
283+
stdout = []
284+
stderr = []
285+
while resp.is_open():
286+
resp.update(timeout=1)
287+
if resp.peek_stdout():
288+
stdout.append(resp.read_stdout())
289+
if resp.peek_stderr():
290+
stderr.append(resp.read_stderr())
291+
returncode = resp.returncode
292+
resp.close()
293+
294+
return subprocess.CompletedProcess(
295+
args=command, returncode=returncode, stdout="\n".join(stdout), stderr="\n".join(stderr)
296+
)
234297

235298

236299
class PodUtils:
237-
READINESS_TIMEOUT = TestFrameConstants.TIMEOUT_2MIN
300+
# this includes potentially pulling the image, and cuda images are huge
301+
READINESS_TIMEOUT = TestFrameConstants.TIMEOUT_5MIN
238302

239303
# consider using timeout_sampler
240304
@staticmethod
Lines changed: 47 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,47 @@
1+
from __future__ import annotations
2+
3+
import shlex
4+
5+
import pytest
6+
7+
from tests.containers import conftest, kubernetes_utils
8+
9+
code = """
10+
import torch; device = "cuda" if torch.cuda.is_available() else "cpu"; print(f"Using {device} device")
11+
"""
12+
13+
14+
# This is from ods-ci,
15+
# https://github.com/red-hat-data-services/ods-ci/blob/ab7237d899c053b0f5b0ff0a2074ac4cdde3543e/ods_ci/tests/Resources/Page/ODH/JupyterHub/GPU.resource#L13-L12
16+
class TestAccelerator:
17+
@pytest.mark.cuda
18+
@pytest.mark.openshift
19+
# image must be both jupyterlab image and cuda workbench image
20+
def test_cuda_run_on_openshift(self, jupyterlab_image, cuda_workbench_image):
21+
client = kubernetes_utils.get_client()
22+
print(client)
23+
24+
image_metadata = conftest.get_image_metadata(cuda_workbench_image)
25+
library = None
26+
if "-pytorch-" in image_metadata.labels.get("name"):
27+
library = "torch"
28+
if "-tensorflow-" in image_metadata.labels.get("name"):
29+
library = "tensorflow"
30+
31+
# language=python
32+
torch_check = (
33+
"""import torch; device = "cuda" if torch.cuda.is_available() else "cpu"; print(f"Using {device} device")"""
34+
)
35+
# language=python
36+
tensorflow_check = """import tensorflow as tf; import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; print(tf.config.list_physical_devices('GPU'))"""
37+
38+
with kubernetes_utils.ImageDeployment(client, cuda_workbench_image) as image:
39+
image.deploy(container_name="notebook-tests-pod", accelerator="nvidia.com/gpu")
40+
if library == "torch":
41+
result = image.exec(shlex.join(["python", "-c", torch_check]))
42+
assert "Using cuda device" in result.stdout
43+
elif library == "tensorflow":
44+
result = image.exec(shlex.join(["python", "-c", tensorflow_check]))
45+
assert "[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]" in result.stdout
46+
else:
47+
raise ValueError(f"Unknown library {library}")

0 commit comments

Comments (0)