Skip to content

Commit 8ca25e8

Browse files
authored
Merge pull request #217 from stackhpc/feature/multipart-hpctests2
Support multiple partitions in MPI tests
2 parents fd0c2c0 + 6d14c7e commit 8ca25e8

File tree

18 files changed

+52
-25
lines changed

18 files changed

+52
-25
lines changed

ansible/adhoc/hpctests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
---
66

7-
- hosts: hpctests[0] # TODO: might want to make which node is used selectable?
7+
- hosts: login[0] # TODO: might want to make which node is used selectable?
88
become: false
99
gather_facts: false
1010
tasks:

ansible/ci/test_reimage.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,10 @@
4545
gather_facts: no
4646
tags: reimage_compute
4747
tasks:
48-
# TODO: This is specific to smslabs/arcus environment config - could generalise to all compute nodes
48+
# TODO: This is specific to arcus environment config - could generalise to all compute nodes
4949
- name: Request compute node rebuild via Slurm
5050
shell:
51-
cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-1]
51+
cmd: scontrol reboot ASAP nextstate=RESUME reason='rebuild image:{{ compute_build.artifact_id }}' {{ openhpc_cluster_name }}-compute-[0-3]
5252
become: yes
5353

5454
- name: Check compute node rebuild completed

ansible/roles/hpctests/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ Role Variables
2424
--------------
2525

2626
- `hpctests_rootdir`: Required. Path to root of test directory tree, which must be on a r/w filesystem shared to all cluster nodes under test. The last directory component will be created.
27-
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'` defining the nodes to use. If not set all nodes in the default partition are used. Note nodes selected **must** be in the default partition.
27+
- `hpctests_partition`: Optional. Name of the partition to use; if not set, the default partition is used.
28+
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'`, defining the nodes to use. If not set, all nodes in the selected partition are used.
2829
- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use).
2930
- `hpctests_outdir`: Optional. Directory to use for test output on local host. Defaults to `$HOME/hpctests` (for local user).
3031
- `hpctests_hpl_NB`: Optional, default 192. The HPL block size "NB" - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/intel-oneapi-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html).

ansible/roles/hpctests/defaults/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ hpctests_hpl_NB: 192
1010
hpctests_hpl_mem_frac: 0.8
1111
hpctests_hpl_arch: linux64
1212
#hpctests_nodes:
13+
#hpctests_partition:

ansible/roles/hpctests/library/slurm_node_info.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
options
2424
nodes:
2525
description:
26-
- Slurm nodenames for which information is required. These must be homogenous.
26+
- Slurm nodenames for which information is required.
2727
required: true
2828
type: list
2929
requirements:
@@ -56,7 +56,6 @@ def run_module():
5656
print(values)
5757
for ix, param in enumerate(params):
5858
info[param] = [nodeinfo[ix].strip() for nodeinfo in values if nodeinfo[nodelist_ix].strip() in module.params['nodes']]
59-
# info[param] = [nodeinfo[nodelist_ix] for nodeinfo in values]
6059
result['info'] = info
6160

6261
module.exit_json(**result)

ansible/roles/hpctests/tasks/hpl-solo.yml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
- debug:
4343
msg: "Using {{ hpctests_hplsolo_ntasks }} process per node with P={{ hpctests_hplsolo_pq.grid.P }}, Q={{ hpctests_hplsolo_pq.grid.Q }} targeting {{ (hpctests_hpl_mem_frac | float) * 100 }}% of {{ hpctests_nodeinfo.info['MEMORY'][0] }} MB memory per node, block size (NB) = {{ hpctests_hpl_NB }}, problem size (N) = {{ hpctests_hplsolo_N }}"
4444

45-
- name: Get all nodes
46-
shell: "sinfo --Node --noheader --format %N" # TODO: assumes only one partition, although actually excluding nodes not in the default partition should be fine.
45+
- name: Get all nodes in partition
46+
shell: "sinfo --Node --noheader --format %N --partition={{ hpctests_partition }}"
4747
register: all_nodes
4848
changed_when: false
4949

@@ -74,6 +74,11 @@
7474
vars:
7575
hpctests_hplsolo_ntasks: 2 # TODO: FIXME
7676

77+
- name: Remove previous outputs
78+
# The number of output files depends on the number of nodes, which can differ between partitions, so files left over from a previous run would not all be overwritten.
79+
shell:
80+
cmd: "rm -f {{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh.*.out"
81+
7782
- name: Run hpl-solo
7883
shell: sbatch --wait hpl-solo.sh
7984
become: no
@@ -111,10 +116,11 @@
111116
tags: postpro
112117
debug:
113118
msg: |
114-
Summary for hpl-solo ({{ hpctests_computes.stdout_lines | length }} nodes) job {{ hpctests_hplsolo_sbatch.stdout.split()[-1] }} using {{ hpctests_ucx_net_devices }}:
119+
Summary for hpl-solo on {{ hpctests_computes.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_hplsolo_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}':
120+
115121
Max: {{ perf.stdout_lines | map('float') | max }} gflops
116122
Min: {{ perf.stdout_lines | map('float') | min }} gflops
117-
Mean: {{ (perf.stdout_lines | map('float') | sum) / (hpctests_computes.stdout_lines | length) }} gflops
123+
Mean: {{ (perf.stdout_lines | map('float') | sum) / (hpctests_computes.stdout_lines | length) }} gflops
118124
119125
Individual node results (gflops):
120126
{{ dict(hpctests_computes.stdout_lines | zip(perf.stdout_lines | map('float') )) | to_nice_yaml }}

ansible/roles/hpctests/tasks/pingmatrix.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@
6161
- name: Summarise results
6262
debug:
6363
msg: |
64-
Summary for pingmatrix (pairwise on {{ slurm_names.stdout_lines | length }} nodes) job {{ hpctests_pingmatrix_sbatch.stdout.split()[-1] }} using {{ hpctests_ucx_net_devices }}:
64+
Summary for pingmatrix pairwise over {{ slurm_names.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_pingmatrix_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}':
65+
6566
{{ nxnlatbw['stats'] | to_nice_yaml }}
67+
6668
Tabular output on ansible control host at {{ hpctests_outdir }}/pingmatrix.html

ansible/roles/hpctests/tasks/pingpong.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,11 @@
5555

5656
- debug:
5757
msg: |
58-
Summary for pingpong (2x scheduler-selected nodes) job {{ _pingpong_jobid }} (using interface {{ hpctests_ucx_net_devices }}):
59-
nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }}
60-
zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
61-
max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
58+
Summary for pingpong using 2x scheduler-selected nodes in '{{ hpctests_partition }}' partition, job ID {{ _pingpong_jobid }}, device '{{ hpctests_ucx_net_devices }}':
59+
60+
Nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }}
61+
Zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us
62+
Max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s)
6263
6364
See plot on localhost:
6465
{{ _pingpong_plot.stdout }}

ansible/roles/hpctests/tasks/setup.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,17 @@
11
---
22

3+
- name: Get partition information
4+
shell: "sinfo --format %P --noheader"
5+
register: _sinfo_partitions
6+
changed_when: false
7+
8+
- name: Select default partition if hpctests_partition not given
9+
set_fact:
10+
hpctests_partition: "{{ _sinfo_partitions.stdout_lines | select('contains', '*') | first | trim('*') }}"
11+
when: hpctests_partition is not defined
12+
313
- name: Get info about compute nodes
4-
shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --format %N"
14+
shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format %N"
515
register: hpctests_computes
616
changed_when: false
717
failed_when: hpctests_computes.rc != 0

ansible/roles/hpctests/templates/hpl-build.sh.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#SBATCH --output=%x.%a.out
55
#SBATCH --error=%x.%a.out
66
#SBATCH --exclusive
7+
#SBATCH --partition={{ hpctests_partition }}
78
{%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_computes.stdout_lines[0] }}{% endif %}
89

910
echo HPL arch: {{ hpctests_hpl_arch }}

0 commit comments

Comments
 (0)