Skip to content

Commit 2ebc9cd

Browse files
Rexrexcsn
authored and committed
Add slurm test to assert online node attributes
* Add test to make sure that, for online nodes, nodeaddr is not the same as nodename (it should be set to the private IP) and nodehostname is the same as nodename. Signed-off-by: Rex <[email protected]>
1 parent 83706ed commit 2ebc9cd

File tree

2 files changed

+43
-0
lines changed

2 files changed

+43
-0
lines changed

tests/integration-tests/tests/common/schedulers_common.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,9 @@ def get_nodes_status(self, filter_by_nodes=None):
507507

508508
def get_node_addr_host(self):
    """
    List the node name, node address and node hostname that sinfo reports for every node.

    Each returned item is one line of command output holding the three values
    separated by single spaces, for example:

        q1-dy-c5xlarge-1 172.31.4.241 q1-dy-c5xlarge-1
        q1-dy-c5xlarge-2 172.31.4.136 q1-dy-c5xlarge-2
        q1-dy-c5xlarge-3 q1-dy-c5xlarge-3 q1-dy-c5xlarge-3

    :return: list of strings, one per line printed by the remote command.
    """
    # -N prints one line per node and -h drops the header; awk collapses the
    # padded columns so each line is exactly "<name> <addr> <hostname>".
    sinfo_command = "/opt/slurm/bin/sinfo -O NodeList:' ',NodeAddr:' ',NodeHost:' ' -N -h | awk '{print$1, $2, $3}'"
    command_result = self._remote_command_executor.run_remote_command(sinfo_command)
    return command_result.stdout.splitlines()

tests/integration-tests/tests/schedulers/test_slurm.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,13 @@ def test_slurm_scaling(scheduler, region, instance, pcluster_config_reader, clus
140140
scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
141141

142142
_assert_cluster_initial_conditions(scheduler_commands, instance, 20, 20, 4, 1)
143+
_test_online_node_configured_correctly(
144+
scheduler_commands,
145+
partition="ondemand1",
146+
num_static_nodes=2,
147+
num_dynamic_nodes=2,
148+
dynamic_instance_type=instance,
149+
)
143150
_test_partition_states(
144151
scheduler_commands,
145152
cluster.cfn_name,
@@ -255,6 +262,39 @@ def _assert_cluster_initial_conditions(
255262
assert_that(len(dynamic_nodes)).is_equal_to(expected_num_dynamic)
256263

257264

265+
def _test_online_node_configured_correctly(
    scheduler_commands,
    partition,
    num_static_nodes,
    num_dynamic_nodes,
    dynamic_instance_type,
):
    """
    Check the nodeaddr and nodehostname reported for the online nodes of a partition.

    An initial job is submitted and the static/dynamic node names are taken from
    assert_initial_conditions. For each of those nodes, the values returned by
    scheduler_commands.get_node_addr_host() must satisfy:
      - nodeaddr differs from nodename (it is expected to be the instance private ip);
      - nodehostname equals nodename.

    :param scheduler_commands: object exposing get_node_addr_host(), also passed to the job helpers.
    :param partition: partition the initial job is submitted to.
    :param num_static_nodes: number of static nodes passed to assert_initial_conditions.
    :param num_dynamic_nodes: number of dynamic nodes requested by the initial job.
    :param dynamic_instance_type: instance type passed to submit_initial_job.
    """
    logging.info("Testing that online nodes' nodeaddr and nodehostname are configured correctly.")
    init_job_id = submit_initial_job(
        scheduler_commands,
        "sleep infinity",
        partition,
        dynamic_instance_type,
        num_dynamic_nodes,
        other_options="--no-requeue",
    )
    static_nodes, dynamic_nodes = assert_initial_conditions(
        scheduler_commands, num_static_nodes, num_dynamic_nodes, partition, cancel_job_id=init_job_id
    )
    # Index the sinfo output by node name; every entry is "<nodename> <nodeaddr> <nodehostname>".
    node_attr_map = {}
    for node_entry in scheduler_commands.get_node_addr_host():
        nodename, nodeaddr, nodehostname = node_entry.split()
        node_attr_map[nodename] = {"nodeaddr": nodeaddr, "nodehostname": nodehostname}
    logging.info(node_attr_map)
    for nodename in static_nodes + dynamic_nodes:
        # For online nodes:
        # Nodeaddr should be set to private ip of instance
        # Nodehostname should be the same with nodename
        # contains_key reports the missing node name on failure instead of a bare "Expected <True>".
        assert_that(node_attr_map).contains_key(nodename)
        node_attrs = node_attr_map[nodename]
        assert_that(node_attrs["nodeaddr"]).described_as(
            "nodeaddr of online node {0}".format(nodename)
        ).is_not_equal_to(nodename)
        assert_that(node_attrs["nodehostname"]).described_as(
            "nodehostname of online node {0}".format(nodename)
        ).is_equal_to(nodename)
296+
297+
258298
def _test_partition_states(
259299
scheduler_commands,
260300
cluster_name,

0 commit comments

Comments
 (0)