Skip to content

Commit 0efceb2

Browse files
authored
NO-JIRA: tests(containers): Add GPU support and pod exec functionality in tests (#1041)
Enhance `ImageDeployment` to support GPU accelerators (AMD/NVIDIA) during pod deployment and introduce a method to execute commands inside pods. Additionally, add tests for ensuring CUDA compatibility on OpenShift using a PyTorch notebook image.
1 parent c8b73af commit 0efceb2

File tree

6 files changed

+163
-11
lines changed

6 files changed

+163
-11
lines changed

.github/workflows/build-notebooks-TEMPLATE.yaml

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -335,7 +335,7 @@ jobs:
335335
- name: Run Testcontainers container tests (in PyTest)
336336
run: |
337337
set -Eeuxo pipefail
338-
uv run pytest --capture=fd tests/containers -m 'not openshift' --image="${{ steps.calculated_vars.outputs.OUTPUT_IMAGE }}"
338+
uv run pytest --capture=fd tests/containers -m 'not openshift and not cuda and not rocm' --image="${{ steps.calculated_vars.outputs.OUTPUT_IMAGE }}"
339339
env:
340340
DOCKER_HOST: "unix:///var/run/podman/podman.sock"
341341
TESTCONTAINERS_DOCKER_SOCKET_OVERRIDE: "/var/run/podman/podman.sock"
@@ -508,7 +508,7 @@ jobs:
508508
if: ${{ steps.have-tests.outputs.tests == 'true' }}
509509
run: |
510510
set -Eeuxo pipefail
511-
uv run pytest --capture=fd tests/containers -m 'openshift' --image="${{ steps.calculated_vars.outputs.OUTPUT_IMAGE }}"
511+
uv run pytest --capture=fd tests/containers -m 'openshift and not cuda and not rocm' --image="${{ steps.calculated_vars.outputs.OUTPUT_IMAGE }}"
512512
env:
513513
# TODO(jdanek): this Testcontainers stuff should not be necessary but currently it has to be there
514514
DOCKER_HOST: "unix:///var/run/podman/podman.sock"

README.md

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -103,7 +103,7 @@ sudo dnf install podman
103103
systemctl --user start podman.service
104104
systemctl --user status podman.service
105105
systemctl --user status podman.socket
106-
DOCKER_HOST=unix:///run/user/$UID/podman/podman.sock uv run pytest tests/containers -m 'not openshift' --image quay.io/opendatahub/workbench-images@sha256:e98d19df346e7abb1fa3053f6d41f0d1fa9bab39e49b4cb90b510ca33452c2e4
106+
DOCKER_HOST=unix:///run/user/$UID/podman/podman.sock uv run pytest tests/containers -m 'not openshift and not cuda and not rocm' --image quay.io/opendatahub/workbench-images@sha256:e98d19df346e7abb1fa3053f6d41f0d1fa9bab39e49b4cb90b510ca33452c2e4
107107
108108
# Mac OS
109109
brew install podman

pytest.ini

Lines changed: 5 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -16,4 +16,8 @@ log_cli_level = INFO
1616
log_file = logs/pytest-logs.txt
1717
log_file_level = DEBUG
1818

19-
markers = openshift
19+
# https://docs.pytest.org/en/stable/example/markers.html#registering-markers
20+
markers =
21+
openshift: requires openshift to run,
22+
cuda: requires cuda to run,
23+
rocm: requires rocm to run,

tests/containers/conftest.py

Lines changed: 38 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -49,14 +49,22 @@ def pytest_generate_tests(metafunc: Metafunc) -> None:
4949
metafunc.parametrize(image.__name__, metafunc.config.getoption("--image"))
5050

5151

52-
def skip_if_not_workbench_image(image: str) -> docker.models.images.Image:
52+
def get_image_metadata(image: str) -> docker.models.images.Image:
5353
client = testcontainers.core.container.DockerClient()
5454
try:
5555
image_metadata = client.client.images.get(image)
5656
except docker.errors.ImageNotFound:
57+
# todo(jdanek): this means that even when image is to be run remotely (on openshift),
58+
# it has to be pulled locally first so that we can check its metadata
5759
image_metadata = client.client.images.pull(image)
5860
assert isinstance(image_metadata, docker.models.images.Image)
5961

62+
return image_metadata
63+
64+
65+
def skip_if_not_workbench_image(image: str) -> docker.models.images.Image:
66+
image_metadata = get_image_metadata(image)
67+
6068
ide_server_label_fragments = ("-code-server-", "-jupyter-", "-rstudio-")
6169
if not any(ide in image_metadata.labels["name"] for ide in ide_server_label_fragments):
6270
pytest.skip(
@@ -66,6 +74,23 @@ def skip_if_not_workbench_image(image: str) -> docker.models.images.Image:
6674
return image_metadata
6775

6876

77+
def skip_if_not_cuda_image(image: str) -> docker.models.images.Image:
78+
image_metadata = get_image_metadata(image)
79+
80+
if "-cuda-" not in image_metadata.labels["name"]:
81+
pytest.skip(f"Image {image} does not have any of '-cuda-' in {image_metadata.labels['name']=}")
82+
83+
return image_metadata
84+
85+
86+
def skip_if_not_rocm_image(image: str) -> docker.models.images.Image:
87+
image_metadata = get_image_metadata(image)
88+
if "-rocm-" not in image_metadata.labels["name"]:
89+
pytest.skip(f"Image {image} does not have any of '-rocm-' in {image_metadata.labels['name']=}")
90+
91+
return image_metadata
92+
93+
6994
# https://docs.pytest.org/en/stable/how-to/fixtures.html#parametrizing-fixtures
7095
# indirect parametrization https://stackoverflow.com/questions/18011902/how-to-pass-a-parameter-to-a-fixture-function-in-pytest
7196
@pytest.fixture(scope="session")
@@ -79,6 +104,18 @@ def workbench_image(image: str):
79104
yield image
80105

81106

107+
@pytest.fixture(scope="function")
108+
def cuda_workbench_image(workbench_image: str):
109+
skip_if_not_cuda_image(workbench_image)
110+
yield workbench_image
111+
112+
113+
@pytest.fixture(scope="function")
114+
def rocm_workbench_image(workbench_image: str):
115+
skip_if_not_rocm_image(workbench_image)
116+
yield workbench_image
117+
118+
82119
@pytest.fixture(scope="function")
83120
def jupyterlab_image(image: str) -> docker.models.images.Image:
84121
image_metadata = skip_if_not_workbench_image(image)

tests/containers/kubernetes_utils.py

Lines changed: 70 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -4,10 +4,11 @@
44
import functools
55
import logging
66
import socket
7+
import subprocess
78
import threading
89
import time
910
import traceback
10-
from typing import TYPE_CHECKING, Any, Self
11+
from typing import TYPE_CHECKING, Any, Literal, Self
1112

1213
import kubernetes
1314
import kubernetes.client.api.core_v1_api
@@ -35,6 +36,7 @@
3536
class TestFrameConstants:
3637
GLOBAL_POLL_INTERVAL_MEDIUM = 10
3738
TIMEOUT_2MIN = 2 * 60
39+
TIMEOUT_5MIN = 5 * 60
3840

3941

4042
logging.basicConfig(level=logging.DEBUG)
@@ -120,6 +122,8 @@ class ImageDeployment:
120122
def __init__(self, client: kubernetes.dynamic.DynamicClient, image: str):
121123
self.client = client
122124
self.image = image
125+
self.pod = None
126+
self.port = None
123127
self.tf = TestFrame()
124128

125129
def __enter__(self) -> Self:
@@ -128,7 +132,9 @@ def __enter__(self) -> Self:
128132
def __exit__(self, exc_type, exc_val, exc_tb):
129133
self.tf.destroy()
130134

131-
def deploy(self, container_name: str) -> None:
135+
def deploy(
136+
self, container_name: str, accelerator: Literal["amd.com/gpu", "nvidia.com/gpu"] | None = None
137+
) -> kubernetes.client.models.v1_pod.V1Pod:
132138
LOGGER.debug(f"Deploying {self.image}")
133139
# custom namespace is necessary, because we cannot assign a SCC to pods created in one of the default namespaces:
134140
# default, kube-system, kube-public, openshift-node, openshift-infra, openshift.
@@ -187,6 +193,20 @@ def deploy(self, container_name: str) -> None:
187193
"protocol": "TCP",
188194
}
189195
],
196+
**(
197+
{
198+
"resources": {
199+
"limits": {
200+
accelerator: "1",
201+
},
202+
"requests": {
203+
accelerator: "1",
204+
},
205+
}
206+
}
207+
if accelerator
208+
else {}
209+
),
190210
# rstudio will not start without its volume mount and it does not log the error for it
191211
# See the testcontainers implementation of this (the tty=True part)
192212
"volumeMounts": [{"mountPath": "/opt/app-root/src", "name": "my-workbench"}],
@@ -214,9 +234,9 @@ def deploy(self, container_name: str) -> None:
214234
namespace=ns.name, label_selector=f"app={container_name}"
215235
)
216236
assert len(pod_name.items) == 1
217-
pod: kubernetes.client.models.v1_pod.V1Pod = pod_name.items[0]
237+
self.pod: kubernetes.client.models.v1_pod.V1Pod = pod_name.items[0]
218238

219-
p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, pod), "localhost", 0)
239+
p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, self.pod), "localhost", 0)
220240
t = threading.Thread(target=p.listen_and_serve_until_canceled)
221241
t.start()
222242
self.tf.defer(t, lambda thread: thread.join())
@@ -230,11 +250,55 @@ def deploy(self, container_name: str) -> None:
230250
30,
231251
lambda: requests.get(f"http://localhost:{self.port}").status_code == 200,
232252
)
233-
LOGGER.debug("Done with portforward")
253+
LOGGER.debug("Done setting up portforward")
254+
255+
return self.pod
256+
257+
# https://github.com/kubernetes-client/python/blob/master/examples/pod_exec.py
258+
def exec(self, command: str) -> subprocess.CompletedProcess:
259+
if not self.pod or not self.pod.metadata or not self.pod.metadata.name or not self.pod.metadata.namespace:
260+
raise RuntimeError("Pod information is not available for exec operation.")
261+
LOGGER.debug(f"Executing command {command}")
262+
core_v1_api = kubernetes.client.api.core_v1_api.CoreV1Api(api_client=self.client.client)
263+
264+
# Uses kubernetes.stream.stream to handle WebSocket upgrade.
265+
# E HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"Upgrade request required","reason":"BadRequest","code":400}
266+
try:
267+
resp: kubernetes.stream.ws_client.WSClient = kubernetes.stream.stream(
268+
core_v1_api.connect_get_namespaced_pod_exec,
269+
self.pod.metadata.name,
270+
self.pod.metadata.namespace,
271+
command=["/bin/sh", "-c", command],
272+
stderr=True,
273+
stdin=False,
274+
stdout=True,
275+
tty=False,
276+
# _preload_content=False is important for streaming
277+
_preload_content=False,
278+
)
279+
except Exception as e:
280+
LOGGER.error(f"Error during pod exec: {e}")
281+
LOGGER.error(traceback.format_exc())
282+
raise
283+
stdout = []
284+
stderr = []
285+
while resp.is_open():
286+
resp.update(timeout=1)
287+
if resp.peek_stdout():
288+
stdout.append(resp.read_stdout())
289+
if resp.peek_stderr():
290+
stderr.append(resp.read_stderr())
291+
returncode = resp.returncode
292+
resp.close()
293+
294+
return subprocess.CompletedProcess(
295+
args=command, returncode=returncode, stdout="\n".join(stdout), stderr="\n".join(stderr)
296+
)
234297

235298

236299
class PodUtils:
237-
READINESS_TIMEOUT = TestFrameConstants.TIMEOUT_2MIN
300+
# this includes potentially pulling the image, and cuda images are huge
301+
READINESS_TIMEOUT = TestFrameConstants.TIMEOUT_5MIN
238302

239303
# consider using timeout_sampler
240304
@staticmethod
Lines changed: 47 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,47 @@
1+
from __future__ import annotations
2+
3+
import shlex
4+
5+
import pytest
6+
7+
from tests.containers import conftest, kubernetes_utils
8+
9+
code = """
10+
import torch; device = "cuda" if torch.cuda.is_available() else "cpu"; print(f"Using {device} device")
11+
"""
12+
13+
14+
# This is from ods-ci,
15+
# https://github.com/red-hat-data-services/ods-ci/blob/ab7237d899c053b0f5b0ff0a2074ac4cdde3543e/ods_ci/tests/Resources/Page/ODH/JupyterHub/GPU.resource#L13-L12
16+
class TestAccelerator:
17+
@pytest.mark.cuda
18+
@pytest.mark.openshift
19+
# image must be both jupyterlab image and cuda workbench image
20+
def test_cuda_run_on_openshift(self, jupyterlab_image, cuda_workbench_image):
21+
client = kubernetes_utils.get_client()
22+
print(client)
23+
24+
image_metadata = conftest.get_image_metadata(cuda_workbench_image)
25+
library = None
26+
if "-pytorch-" in image_metadata.labels.get("name"):
27+
library = "torch"
28+
if "-tensorflow-" in image_metadata.labels.get("name"):
29+
library = "tensorflow"
30+
31+
# language=python
32+
torch_check = (
33+
"""import torch; device = "cuda" if torch.cuda.is_available() else "cpu"; print(f"Using {device} device")"""
34+
)
35+
# language=python
36+
tensorflow_check = """import tensorflow as tf; import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; print(tf.config.list_physical_devices('GPU'))"""
37+
38+
with kubernetes_utils.ImageDeployment(client, cuda_workbench_image) as image:
39+
image.deploy(container_name="notebook-tests-pod", accelerator="nvidia.com/gpu")
40+
if library == "torch":
41+
result = image.exec(shlex.join(["python", "-c", torch_check]))
42+
assert "Using cuda device" in result.stdout
43+
elif library == "tensorflow":
44+
result = image.exec(shlex.join(["python", "-c", tensorflow_check]))
45+
assert "[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]" in result.stdout
46+
else:
47+
raise ValueError(f"Unknown library {library}")

0 commit comments

Comments (0)