11import logging
22import os
33import re
4- import subprocess
54
65import pytest
76from ocp_resources .machine_set import MachineSet
87from ocp_resources .node import Node
8+ from ocp_resources .pod import Pod
99
1010from . import __loggername__
1111
@@ -113,6 +113,7 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
113113
114114 nodes = Node .get (dyn_client = openshift_dyn_client )
115115 gpu_nodes = []
116+ expected_count = 1
116117 for node in nodes :
117118 logger .info (node .instance .metadata .name )
118119 labels = node .instance .metadata .labels
@@ -125,9 +126,7 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
125126 if odh_label in label_str and worker_label in label_str :
126127 gpu_nodes .append (node )
127128
128- # logger.info(node_count)
129-
130- if len (gpu_nodes ) == 3 :
129+ if len (gpu_nodes ) == int (expected_count ):
131130 logger .info ("PASS: Found 'worker' and 'odh-notebook' GPU node-role labels" )
132131 else :
133132 err_msg = "Could not find 'worker' and 'odh-notebook' GPU node-role label"
@@ -139,35 +138,23 @@ def test_validate_gpu_node_role_labels_pods(openshift_dyn_client):
139138 """
140139 logger .info ("Checking pod count on GPU nodes" )
141140
142- for gpu_node in gpu_nodes :
143- name = gpu_node .instance .metadata .name
144- field_select = "--field-selector=spec.host=" + name
145- pod_count = 0
146- expected_count = 20
147- failed_nodes = []
148- cmd_out = subprocess .run (
149- [oc , "get" , "pod" , "-A" , field_select , "--no-headers" ], capture_output = True
150- )
151-
152- if cmd_out .stdout :
153- out_decoded = cmd_out .stdout .decode ("utf-8" )
154- logger .info (node .instance .metadata .name + "\n " + out_decoded )
155- out_split = out_decoded .splitlines ()
156-
157- for line in out_split :
158- if "Completed" in line :
159- continue
160- else :
161- pod_count += 1
162-
163- if pod_count < expected_count :
164- failed_nodes .append (node .instance .metadata .name )
165- else :
166- assert False , cmd_out .stderr
167-
168- if failed_nodes :
169- err_msg = f"Did not find the expected pod count on: { failed_nodes } "
141+ # We are assuming one GPU node
142+ gpu_node = gpu_nodes [0 ].instance .metadata .name
143+ nvidia_pods = []
144+ expected_count = 8
145+ project = "nvidia-gpu-operator"
146+ pods = Pod .get (dyn_client = openshift_dyn_client , namespace = project )
147+
148+ for pod in pods :
149+ if "nvidia" in pod .instance .metadata .name :
150+ logger .info (f"nvidia pod: { pod .instance .metadata .name } " )
151+ if gpu_node in pod .instance .spec .nodeName :
152+ logger .info (f"nvidia pod node name: { pod .instance .spec .nodeName } " )
153+ nvidia_pods .append (pod .instance .metadata .name )
154+
155+ if len (nvidia_pods ) == int (expected_count ):
156+ logger .info ("PASS: Found the expected nvidia pod count for GPU nodes" )
157+ else :
158+ err_msg = "Did not find the expected nvidia pod count for GPU nodes"
170159 logger .error (f"FAIL: { err_msg } " )
171160 assert False , err_msg
172- else :
173- logger .info ("PASS: Found the expected pod count for GPU nodes" )
0 commit comments