@@ -53,30 +53,27 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in
5353 return job_id
5454
5555
56- def assert_imex_nodes_config_is_correct (
57- rce : RemoteCommandExecutor , queue_name : str , compute_resource_name : str , expected_ips : list
58- ):
59- logging .info (f"Checking IMEX nodes config contains the expected nodes: { expected_ips } " )
60- imex_nodes_config_file = (
61- f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{ queue_name } _{ compute_resource_name } .cfg"
62- )
63- imex_config_content = read_remote_file (rce , imex_nodes_config_file )
64- imex_config_content_clean = [line for line in imex_config_content .split ("\n " ) if not line .strip ().startswith ("#" )]
65- actual_ips = [ip .strip () for ip in imex_config_content_clean ]
66- assert_that (actual_ips ).contains_only (* expected_ips )
67- logging .info (f"IMEX nodes config { imex_nodes_config_file } contains the expected nodes: { expected_ips } " )
56+ def assert_imex_nodes_config_is_correct (cluster : Cluster , queue : str , compute_resource : str , expected_ips : list ):
57+ for compute_node_ip in cluster .get_compute_nodes_private_ip (queue , compute_resource ):
58+ logging .info (f"Checking IMEX nodes config for compute node { compute_node_ip } contains the expected nodes: { expected_ips } " )
59+ rce = RemoteCommandExecutor (cluster , compute_node_ip = compute_node_ip )
60+ imex_config_content = read_remote_file (rce , "/etc/nvidia-imex/nodes_config.cfg" )
61+ imex_config_content_clean = [line for line in imex_config_content .split ("\n " ) if not line .strip ().startswith ("#" )]
62+ actual_ips = [ip .strip () for ip in imex_config_content_clean ]
63+ assert_that (actual_ips ).contains_only (* expected_ips )
64+ logging .info (f"IMEX nodes config for compute node { compute_node_ip } contains the expected nodes: { expected_ips } " )
6865
6966
7067def assert_no_errors_in_logs (cluster : Cluster , queue : str , compute_resource : str ):
71- rce = RemoteCommandExecutor (cluster )
7268 logs = ["/var/log/nvidia-imex-verbose.log" , "/var/log/parallelcluster/nvidia-imex-prolog.log" ]
7369 for compute_node_ip in cluster .get_compute_nodes_private_ip (queue , compute_resource ):
70+ rce = RemoteCommandExecutor (cluster , compute_node_ip = compute_node_ip )
7471 for log in logs :
7572 logging .info (f"Checking file { log } log does not contain any error" )
7673 if log == "/var/log/nvidia-imex-verbose.log" and not is_existing_remote_file (rce , log ):
7774 logging .info ("IMEX log file not found. Not an issue as IMEX writes logs there only in case of errors." )
7875 continue
79- assert_regex_in_file (cluster , compute_node_ip , log , r"(warn|error|fail)" , negate = True )
76+ assert_regex_in_file (rce , log , r"(warn|error|fail)" , negate = True )
8077
8178
8279def assert_imex_status (
@@ -210,7 +207,7 @@ def _check_imex_healthy():
210207 f"Private IP addresses for nodes in queue { queue } and compute resource { compute_resource } : " f"{ ips } "
211208 )
212209
213- assert_imex_nodes_config_is_correct (rce , queue , compute_resource , ips )
210+ assert_imex_nodes_config_is_correct (cluster , queue , compute_resource , ips )
214211 assert_imex_status (rce , job_id , ips , service_status = "UP" , node_status = "READY" , connection_status = "CONNECTED" )
215212 assert_no_errors_in_logs (cluster , queue , compute_resource )
216213
@@ -240,7 +237,7 @@ def assert_imex_not_configured(cluster: Cluster, queue: str, compute_resource: s
240237
241238 job_id = submit_job_imex_status (rce , queue , max_nodes )
242239
243- assert_imex_nodes_config_is_correct (rce , queue , compute_resource , FAKE_IPS )
240+ assert_imex_nodes_config_is_correct (cluster , queue , compute_resource , FAKE_IPS )
244241 assert_imex_status (
245242 rce , job_id , FAKE_IPS , service_status = "DOWN" , node_status = "UNAVAILABLE" , connection_status = "INVALID"
246243 )
0 commit comments