Skip to content

Commit 5a36337

Browse files
committed
test(tests/containers): add test for ROCm accelerator images (#1414)
This commit introduces tests for ROCm-enabled workbench images on OpenShift. These tests verify that the images can be deployed successfully on a cluster with AMD GPUs and that both PyTorch and TensorFlow can correctly detect the available accelerator. To support the testing of large accelerator images, the following changes were made:

- The pod readiness timeout in the test framework has been increased to 10 minutes to allow sufficient time for image pulling.
- The `ImageDeployment` utility was updated to allow for configurable timeouts.
- Existing CUDA tests were updated to use this new configurable timeout.
1 parent b81a96b commit 5a36337

File tree

3 files changed

+109
-43
lines changed

3 files changed

+109
-43
lines changed

tests/containers/conftest.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,12 @@ def skip_if_not_workbench_image(image: str) -> Image:
9595
def skip_if_not_cuda_image(image: str) -> Image:
9696
image_metadata = get_image_metadata(image)
9797

98-
if "-cuda-" not in image_metadata.labels["name"]:
99-
pytest.skip(f"Image {image} does not have any of '-cuda-' in {image_metadata.labels['name']=}")
98+
if "-rocm-" in image_metadata.labels["name"]:
99+
pytest.skip(f"Image {image} does have '-rocm-' in {image_metadata.labels['name']=}")
100+
101+
cuda_label_fragments = ("-cuda-", "-pytorch-", "-tensorflow-")
102+
if not any(ide in image_metadata.labels["name"] for ide in cuda_label_fragments):
103+
pytest.skip(f"Image {image} does not have any of '{cuda_label_fragments=}' in {image_metadata.labels['name']=}")
100104

101105
return image_metadata
102106

@@ -139,15 +143,15 @@ def workbench_image(image: str):
139143

140144

141145
@pytest.fixture(scope="function")
142-
def cuda_workbench_image(workbench_image: str):
143-
skip_if_not_cuda_image(workbench_image)
144-
yield workbench_image
146+
def cuda_image(image: str):
147+
skip_if_not_cuda_image(image)
148+
yield image
145149

146150

147151
@pytest.fixture(scope="function")
148-
def rocm_workbench_image(workbench_image: str):
149-
skip_if_not_rocm_image(workbench_image)
150-
yield workbench_image
152+
def rocm_image(image: str):
153+
skip_if_not_rocm_image(image)
154+
yield image
151155

152156

153157
@pytest.fixture(scope="function")

tests/containers/kubernetes_utils.py

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ class TestFrameConstants:
3737
GLOBAL_POLL_INTERVAL_MEDIUM = 10
3838
TIMEOUT_2MIN = 2 * 60
3939
TIMEOUT_5MIN = 5 * 60
40+
TIMEOUT_20MIN = 20 * 60
41+
42+
# this includes potentially pulling the image, and cuda images are huge
43+
READINESS_TIMEOUT = TIMEOUT_5MIN
4044

4145

4246
logging.basicConfig(level=logging.DEBUG)
@@ -133,10 +137,14 @@ def __enter__(self) -> Self:
133137
return self
134138

135139
def __exit__(self, exc_type, exc_val, exc_tb):
136-
self.tf.destroy()
140+
self.tf.destroy(wait=True)
137141

138142
def deploy(
139-
self, container_name: str, accelerator: Literal["amd.com/gpu", "nvidia.com/gpu"] | None = None
143+
self,
144+
container_name: str,
145+
accelerator: Literal["amd.com/gpu", "nvidia.com/gpu"] | None = None,
146+
is_runtime_image: bool = False,
147+
timeout: float = TestFrameConstants.READINESS_TIMEOUT,
140148
) -> kubernetes.client.models.v1_pod.V1Pod:
141149
LOGGER.debug(f"Deploying {self.image}")
142150
# custom namespace is necessary, because we cannot assign a SCC to pods created in one of the default namespaces:
@@ -188,7 +196,15 @@ def deploy(
188196
{
189197
"name": container_name,
190198
"image": self.image,
191-
# "command": ["/bin/sh", "-c", "while true ; do date; sleep 5; done;"],
199+
# "command": ["/bin/sh", "-c", "while true; do date; sleep 5; done;"],
200+
**(
201+
{
202+
"command": ["/bin/sh"],
203+
"args": ["-c", "sleep infinity"],
204+
}
205+
if is_runtime_image
206+
else {}
207+
),
192208
"ports": [
193209
{
194210
"containerPort": 8888,
@@ -229,7 +245,11 @@ def deploy(
229245
self.tf.defer_resource(deployment)
230246
LOGGER.debug("Waiting for pods to become ready...")
231247
PodUtils.wait_for_pods_ready(
232-
self.client, namespace_name=ns.name, label_selector=f"app={container_name}", expect_pods_count=1
248+
self.client,
249+
namespace_name=ns.name,
250+
label_selector=f"app={container_name}",
251+
expect_pods_count=1,
252+
timeout=timeout,
233253
)
234254

235255
core_v1_api = kubernetes.client.api.core_v1_api.CoreV1Api(api_client=self.client.client)
@@ -239,21 +259,22 @@ def deploy(
239259
assert len(pod_name.items) == 1
240260
self.pod: kubernetes.client.models.v1_pod.V1Pod = pod_name.items[0]
241261

242-
p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, self.pod), "localhost", 0)
243-
t = threading.Thread(target=p.listen_and_serve_until_canceled)
244-
t.start()
245-
self.tf.defer(t, lambda thread: thread.join())
246-
self.tf.defer(p.cancellation_token, lambda token: token.cancel())
247-
248-
self.port = p.get_actual_port()
249-
LOGGER.debug(f"Listening on port {self.port}")
250-
Wait.until(
251-
"Connecting to pod succeeds",
252-
1,
253-
30,
254-
lambda: requests.get(f"http://localhost:{self.port}").status_code == 200,
255-
)
256-
LOGGER.debug("Done setting up portforward")
262+
if not is_runtime_image:
263+
p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, self.pod), "localhost", 0)
264+
t = threading.Thread(target=p.listen_and_serve_until_canceled)
265+
t.start()
266+
self.tf.defer(t, lambda thread: thread.join())
267+
self.tf.defer(p.cancellation_token, lambda token: token.cancel())
268+
269+
self.port = p.get_actual_port()
270+
LOGGER.debug(f"Listening on port {self.port}")
271+
Wait.until(
272+
"Connecting to pod succeeds",
273+
1,
274+
30,
275+
lambda: requests.get(f"http://localhost:{self.port}").status_code == 200,
276+
)
277+
LOGGER.debug("Done setting up portforward")
257278

258279
return self.pod
259280

@@ -300,20 +321,16 @@ def exec(self, command: str) -> subprocess.CompletedProcess:
300321

301322

302323
class PodUtils:
303-
# this includes potentially pulling the image, and cuda images are huge
304-
READINESS_TIMEOUT = TestFrameConstants.TIMEOUT_5MIN
305-
306324
# consider using timeout_sampler
307325
@staticmethod
308326
def wait_for_pods_ready(
309-
client: DynamicClient, namespace_name: str, label_selector: str, expect_pods_count: int
327+
client: DynamicClient,
328+
namespace_name: str,
329+
label_selector: str,
330+
expect_pods_count: int,
331+
timeout: float = TestFrameConstants.READINESS_TIMEOUT,
310332
) -> None:
311-
"""Wait for all pods in namespace to be ready
312-
:param client:
313-
:param namespace_name: name of the namespace
314-
:param label_selector:
315-
:param expect_pods_count:
316-
"""
333+
"""Wait for all pods in namespace to be ready"""
317334

318335
# it's a dynamic client with the `resource` parameter already filled in
319336
class ResourceType(kubernetes.dynamic.Resource, kubernetes.dynamic.DynamicClient):
@@ -359,7 +376,7 @@ def ready() -> bool:
359376
Wait.until(
360377
description=f"readiness of all Pods matching {label_selector} in Namespace {namespace_name}",
361378
poll_interval=TestFrameConstants.GLOBAL_POLL_INTERVAL_MEDIUM,
362-
timeout=PodUtils.READINESS_TIMEOUT,
379+
timeout=timeout,
363380
ready=ready,
364381
)
365382

tests/containers/workbenches/accelerator_image_test.py

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66

77
from tests.containers import conftest, kubernetes_utils
8+
from tests.containers.kubernetes_utils import TestFrameConstants
89

910
code = """
1011
import torch; device = "cuda" if torch.cuda.is_available() else "cpu"; print(f"Using {device} device")
@@ -16,12 +17,12 @@
1617
class TestAccelerator:
1718
@pytest.mark.cuda
1819
@pytest.mark.openshift
19-
# image must be both jupyterlab image and cuda workbench image
20-
def test_cuda_run_on_openshift(self, jupyterlab_image, cuda_workbench_image):
20+
# image must be both a datascience image and cuda image
21+
def test_cuda_run_on_openshift(self, datascience_image, cuda_image):
2122
client = kubernetes_utils.get_client()
2223
print(client)
2324

24-
image_metadata = conftest.get_image_metadata(cuda_workbench_image)
25+
image_metadata = conftest.get_image_metadata(cuda_image)
2526
library = None
2627
if "-pytorch-" in image_metadata.labels.get("name"):
2728
library = "torch"
@@ -35,8 +36,52 @@ def test_cuda_run_on_openshift(self, jupyterlab_image, cuda_workbench_image):
3536
# language=python
3637
tensorflow_check = """import tensorflow as tf; import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; print(tf.config.list_physical_devices('GPU'))"""
3738

38-
with kubernetes_utils.ImageDeployment(client, cuda_workbench_image) as image:
39-
image.deploy(container_name="notebook-tests-pod", accelerator="nvidia.com/gpu")
39+
with kubernetes_utils.ImageDeployment(client, cuda_image) as image:
40+
image.deploy(
41+
container_name="notebook-tests-pod",
42+
accelerator="nvidia.com/gpu",
43+
is_runtime_image="-runtime-" in image_metadata.labels.get("name"),
44+
timeout=TestFrameConstants.TIMEOUT_20MIN,
45+
)
46+
if library == "torch":
47+
result = image.exec(shlex.join(["python", "-c", torch_check]))
48+
assert "Using cuda device" in result.stdout
49+
elif library == "tensorflow":
50+
result = image.exec(shlex.join(["python", "-c", tensorflow_check]))
51+
assert "[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]" in result.stdout
52+
else:
53+
raise ValueError(f"Unknown library {library}")
54+
55+
@pytest.mark.rocm
56+
@pytest.mark.openshift
57+
# image must be both a datascience image and rocm image
58+
def test_rocm_run_on_openshift(self, datascience_image, rocm_image):
59+
client = kubernetes_utils.get_client()
60+
print(client)
61+
62+
image_metadata = conftest.get_image_metadata(rocm_image)
63+
library = None
64+
if "-pytorch-" in image_metadata.labels.get("name"):
65+
library = "torch"
66+
if "-tensorflow-" in image_metadata.labels.get("name"):
67+
library = "tensorflow"
68+
69+
# NOTE: the basic check is exactly the same as for cuda; in torch, even though it says "cuda", it is actually ROCm
70+
71+
# language=python
72+
torch_check = (
73+
"""import torch; device = "cuda" if torch.cuda.is_available() else "cpu"; print(f"Using {device} device")"""
74+
)
75+
# language=python
76+
tensorflow_check = """import tensorflow as tf; import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; print(tf.config.list_physical_devices('GPU'))"""
77+
78+
with kubernetes_utils.ImageDeployment(client, rocm_image) as image:
79+
image.deploy(
80+
container_name="notebook-tests-pod",
81+
accelerator="amd.com/gpu",
82+
is_runtime_image="-runtime-" in image_metadata.labels.get("name"),
83+
timeout=TestFrameConstants.TIMEOUT_20MIN,
84+
)
4085
if library == "torch":
4186
result = image.exec(shlex.join(["python", "-c", torch_check]))
4287
assert "Using cuda device" in result.stdout

0 commit comments

Comments
 (0)