@@ -37,6 +37,10 @@ class TestFrameConstants:
     GLOBAL_POLL_INTERVAL_MEDIUM = 10
     TIMEOUT_2MIN = 2 * 60
     TIMEOUT_5MIN = 5 * 60
+    TIMEOUT_20MIN = 20 * 60
+
+    # this includes potentially pulling the image, and cuda images are huge
+    READINESS_TIMEOUT = TIMEOUT_5MIN


 logging.basicConfig(level=logging.DEBUG)
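
Note: TIMEOUT_20MIN and the relocated READINESS_TIMEOUT feed the new `timeout` parameter that deploy() gains below. A hedged sketch of the intended call site; `image_deployment` stands in for an instance of the class this diff modifies, which is not named in the hunks shown:

    # hypothetical call site; only the constant and parameter names come from this diff
    pod = image_deployment.deploy(
        container_name="cuda-runtime",
        accelerator="nvidia.com/gpu",
        timeout=TestFrameConstants.TIMEOUT_20MIN,  # pulling a CUDA image can blow the 5-minute default
    )
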
@@ -133,10 +137,14 @@ def __enter__(self) -> Self:
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
-        self.tf.destroy()
+        self.tf.destroy(wait=True)

     def deploy(
-        self, container_name: str, accelerator: Literal["amd.com/gpu", "nvidia.com/gpu"] | None = None
+        self,
+        container_name: str,
+        accelerator: Literal["amd.com/gpu", "nvidia.com/gpu"] | None = None,
+        is_runtime_image: bool = False,
+        timeout: float = TestFrameConstants.READINESS_TIMEOUT,
     ) -> kubernetes.client.models.v1_pod.V1Pod:
         LOGGER.debug(f"Deploying {self.image}")
         # custom namespace is necessary, because we cannot assign a SCC to pods created in one of the default namespaces:
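
Note: `destroy(wait=True)` presumably blocks __exit__ until the deferred resources are actually gone, so teardown cannot race the next test. A minimal usage sketch under that assumption; `ImageDeployment` is a hypothetical name for the modified class:

    with ImageDeployment(client, image="quay.io/example/runtime:latest") as d:
        pod = d.deploy(container_name="runtime", is_runtime_image=True)
        # ... exercise the pod ...
    # __exit__ now waits for cleanup to finish instead of returning immediately
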
@@ -188,7 +196,15 @@ def deploy(
                     {
                         "name": container_name,
                         "image": self.image,
-                        # "command": ["/bin/sh", "-c", "while true ; do date; sleep 5; done;"],
+                        # "command": ["/bin/sh", "-c", "while true; do date; sleep 5; done;"],
+                        **(
+                            {
+                                "command": ["/bin/sh"],
+                                "args": ["-c", "sleep infinity"],
+                            }
+                            if is_runtime_image
+                            else {}
+                        ),
                         "ports": [
                             {
                                 "containerPort": 8888,
@@ -229,7 +245,11 @@ def deploy(
         self.tf.defer_resource(deployment)
         LOGGER.debug("Waiting for pods to become ready...")
         PodUtils.wait_for_pods_ready(
-            self.client, namespace_name=ns.name, label_selector=f"app={container_name}", expect_pods_count=1
+            self.client,
+            namespace_name=ns.name,
+            label_selector=f"app={container_name}",
+            expect_pods_count=1,
+            timeout=timeout,
         )

         core_v1_api = kubernetes.client.api.core_v1_api.CoreV1Api(api_client=self.client.client)
@@ -239,21 +259,22 @@ def deploy(
         assert len(pod_name.items) == 1
         self.pod: kubernetes.client.models.v1_pod.V1Pod = pod_name.items[0]

-        p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, self.pod), "localhost", 0)
-        t = threading.Thread(target=p.listen_and_serve_until_canceled)
-        t.start()
-        self.tf.defer(t, lambda thread: thread.join())
-        self.tf.defer(p.cancellation_token, lambda token: token.cancel())
-
-        self.port = p.get_actual_port()
-        LOGGER.debug(f"Listening on port {self.port}")
-        Wait.until(
-            "Connecting to pod succeeds",
-            1,
-            30,
-            lambda: requests.get(f"http://localhost:{self.port}").status_code == 200,
-        )
-        LOGGER.debug("Done setting up portforward")
+        if not is_runtime_image:
+            p = socket_proxy.SocketProxy(lambda: exposing_contextmanager(core_v1_api, self.pod), "localhost", 0)
+            t = threading.Thread(target=p.listen_and_serve_until_canceled)
+            t.start()
+            self.tf.defer(t, lambda thread: thread.join())
+            self.tf.defer(p.cancellation_token, lambda token: token.cancel())
+
+            self.port = p.get_actual_port()
+            LOGGER.debug(f"Listening on port {self.port}")
+            Wait.until(
+                "Connecting to pod succeeds",
+                1,
+                30,
+                lambda: requests.get(f"http://localhost:{self.port}").status_code == 200,
+            )
+            LOGGER.debug("Done setting up portforward")

         return self.pod
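
Note: the port-forward is now skipped for runtime images, consistent with them running `sleep infinity` rather than serving HTTP on 8888. For reference, a plain-Python equivalent of the Wait.until(...) readiness poll, assuming its positional arguments are (description, poll_interval, timeout); `wait_for_http_ok` is a hypothetical helper, not part of this codebase:

    import time
    import requests

    def wait_for_http_ok(url: str, poll_interval: float = 1, timeout: float = 30) -> None:
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                if requests.get(url).status_code == 200:
                    return
            except requests.ConnectionError:
                pass  # the proxy thread may not be accepting connections yet
            time.sleep(poll_interval)
        raise TimeoutError(f"{url} did not return HTTP 200 within {timeout}s")
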
@@ -300,20 +321,16 @@ def exec(self, command: str) -> subprocess.CompletedProcess:


 class PodUtils:
-    # this includes potentially pulling the image, and cuda images are huge
-    READINESS_TIMEOUT = TestFrameConstants.TIMEOUT_5MIN
-
     # consider using timeout_sampler
     @staticmethod
     def wait_for_pods_ready(
-        client: DynamicClient, namespace_name: str, label_selector: str, expect_pods_count: int
+        client: DynamicClient,
+        namespace_name: str,
+        label_selector: str,
+        expect_pods_count: int,
+        timeout: float = TestFrameConstants.READINESS_TIMEOUT,
     ) -> None:
-        """Wait for all pods in namespace to be ready
-        :param client:
-        :param namespace_name: name of the namespace
-        :param label_selector:
-        :param expect_pods_count:
-        """
+        """Wait for all pods in namespace to be ready"""

         # it's a dynamic client with the `resource` parameter already filled in
         class ResourceType(kubernetes.dynamic.Resource, kubernetes.dynamic.DynamicClient):
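
Note: moving READINESS_TIMEOUT from PodUtils onto TestFrameConstants looks necessary rather than cosmetic: deploy()'s new default `timeout=TestFrameConstants.READINESS_TIMEOUT` is evaluated at definition time, and PodUtils is only defined further down the module. A hedged usage sketch of the new parameter (namespace and selector values are hypothetical):

    PodUtils.wait_for_pods_ready(
        client,
        namespace_name="test-ns",
        label_selector="app=cuda-runtime",
        expect_pods_count=1,
        timeout=TestFrameConstants.TIMEOUT_20MIN,  # larger budget for huge images
    )
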
@@ -359,7 +376,7 @@ def ready() -> bool:
         Wait.until(
             description=f"readiness of all Pods matching {label_selector} in Namespace {namespace_name}",
             poll_interval=TestFrameConstants.GLOBAL_POLL_INTERVAL_MEDIUM,
-            timeout=PodUtils.READINESS_TIMEOUT,
+            timeout=timeout,
             ready=ready,
         )