Skip to content

Commit b2f49dd

Browse files
committed
fix hpctests to work with root-squashed /home
1 parent 53705c9 commit b2f49dd

File tree

12 files changed

+36
-37
lines changed

12 files changed

+36
-37
lines changed

ansible/roles/hpctests/README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@ Requirements
2222

2323
Role Variables
2424
--------------
25-
26-
- `hpctests_rootdir`: Required. Path to root of test directory tree, which must be on a r/w filesystem shared to all cluster nodes under test. The last directory component will be created.
25+
- `hpctests_user`: Optional. User to run jobs as. Default is `ansible_user`.
26+
- `hpctests_rootdir`: Optional. Path to root of test directory tree. This must
27+
be on a r/w filesystem shared to all cluster nodes under test. Default is
28+
`/home/{{ hpctests_user }}/hpctests`. **NB:** Do not use `~` in this path.
2729
- `hpctests_partition`: Optional. Name of partition to use; otherwise the default partition is used.
2830
- `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'`, defining the nodes to use. If not set, all nodes in the selected partition are used.
2931
- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use). Alternatively a mapping of partition name (as `hpctests_partition`) to device/interface can be used. For partitions not defined in the mapping the default of `all` is used.

ansible/roles/hpctests/defaults/main.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
---
2-
hpctests_rootdir:
2+
hpctests_user: "{{ ansible_user }}"
3+
hpctests_rootdir: "/home/{{ hpctests_user }}/hpctests"
34
hpctests_pre_cmd: ''
45
hpctests_pingmatrix_modules: [gnu12 openmpi4]
56
hpctests_pingpong_modules: [gnu12 openmpi4 imb]

ansible/roles/hpctests/library/plot_nxnlatbw.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
# Apache 2 License
66

77
from ansible.module_utils.basic import AnsibleModule
8-
import json
8+
import json, os
99

1010
ANSIBLE_METADATA = {
1111
"metadata_version": "0.1",
@@ -109,8 +109,8 @@ def run_module():
109109
module = AnsibleModule(argument_spec=module_args, supports_check_mode=True)
110110
result = {"changed": False}
111111

112-
src = module.params["src"]
113-
dest = module.params["dest"]
112+
src = os.path.expanduser(module.params["src"])
113+
dest = os.path.expanduser(module.params["dest"])
114114
nodes = module.params["nodes"]
115115
if nodes is not None:
116116
nodes = nodes.split(',')

ansible/roles/hpctests/tasks/build-hpl.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@
5252

5353
- name: Build HPL executable
5454
shell:
55-
cmd: "sbatch --wait hpl-build-{{ hpctests_hpl_arch }}.sh"
55+
cmd: "bash -l -c 'sbatch --wait hpl-build-{{ hpctests_hpl_arch }}.sh'" # need login shell for module command
5656
chdir: "{{ hpctests_hpl_srcdir }}"
5757
creates: "bin/{{ hpctests_hpl_arch }}/xhpl"
58-
become: no

ansible/roles/hpctests/tasks/hpl-solo.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,7 @@
8080
cmd: "rm -f {{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh.*.out"
8181

8282
- name: Run hpl-solo
83-
shell: sbatch --wait hpl-solo.sh
84-
become: no
83+
shell: bash -l -c 'sbatch --wait hpl-solo.sh' # need login shell for module command
8584
args:
8685
chdir: "{{ hpctests_rootdir }}/hpl-solo"
8786
async: "{{ 20 * 60 }}" # wait for up to 20 minutes
Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,38 @@
11
- name: setup
22
block:
3-
- include: setup.yml
3+
- include_tasks: setup.yml
4+
become: true
5+
become_user: "{{ hpctests_user }}"
46
tags: always
57

68
- name: pingpong
79
block:
8-
- include: pingpong.yml
10+
- include_tasks: pingpong.yml
911
when: hpctests_computes.stdout_lines | length > 1
12+
become: true
13+
become_user: "{{ hpctests_user }}"
1014
tags: pingpong
1115

1216
- name: pingmatrix
1317
block:
14-
- include: pingmatrix.yml
18+
- include_tasks: pingmatrix.yml
1519
when: hpctests_computes.stdout_lines | length > 1
20+
become: true
21+
become_user: "{{ hpctests_user }}"
1622
tags: pingmatrix
1723

1824
- name: build HPL
1925
block:
20-
- include: build-hpl.yml
26+
- include_tasks: build-hpl.yml
27+
become: true
28+
become_user: "{{ hpctests_user }}"
2129
tags:
2230
- hpl-solo
2331

2432
- name: run HPL on individual nodes
2533
block:
26-
- include: hpl-solo.yml
34+
- include_tasks: hpl-solo.yml
35+
become: true
36+
become_user: "{{ hpctests_user }}"
2737
tags:
2838
- hpl-solo

ansible/roles/hpctests/tasks/pingmatrix.yml

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,6 @@
55
path: "{{ hpctests_rootdir }}/pingmatrix"
66
state: directory
77

8-
- name: Precreate files to workaround selinux context issues on NFS mounts
9-
file:
10-
path: "{{ hpctests_rootdir }}/pingmatrix/{{ item }}"
11-
state: touch
12-
loop:
13-
- mpi_nxnlatbw.c
14-
- pingmatrix.sh
15-
168
- name: Copy source
179
copy:
1810
src: mpi_nxnlatbw.c
@@ -24,7 +16,7 @@
2416
dest: "{{ hpctests_rootdir }}/pingmatrix/pingmatrix.sh"
2517

2618
- name: Run ping matrix
27-
shell: sbatch --wait pingmatrix.sh
19+
shell: bash -l -c 'sbatch --wait pingmatrix.sh' # need login shell for module command
2820
args:
2921
chdir: "{{ hpctests_rootdir }}/pingmatrix"
3022
register: hpctests_pingmatrix_sbatch

ansible/roles/hpctests/tasks/pingpong.yml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,6 @@
55
path: "{{ hpctests_rootdir }}/pingpong"
66
state: directory
77

8-
- name: Precreate files to workaround selinux context issues on NFS mounts
9-
file:
10-
path: "{{ hpctests_rootdir }}/pingpong/{{ item }}"
11-
state: touch
12-
loop:
13-
- pingpong.sh
14-
158
- name: Create sbatch script
169
template:
1710
src: pingpong.sh.j2
@@ -20,8 +13,7 @@
2013
- name: Run pingpong
2114
block:
2215
- name: Submit jobscript
23-
shell: sbatch --wait pingpong.sh
24-
become: no
16+
shell: bash -l -c 'sbatch --wait pingpong.sh' # need login shell for module command
2517
args:
2618
chdir: "{{ hpctests_rootdir }}/pingpong"
2719
register: hpctests_pingpong_sbatch
@@ -54,18 +46,21 @@
5446
path: "{{ _pingpong_local_output }}"
5547
register: hpctests_pingpong_out
5648
delegate_to: localhost
49+
become: false
5750

5851
- name: Read nodes used
5952
shell: "grep 'SLURM_JOB_NODELIST:' {{ _pingpong_local_output }}"
6053
register: hpctests_pingpong_run_nodes
6154
delegate_to: localhost
55+
become: false
6256

6357
- name: Plot image
6458
shell:
6559
cmd: "python {{ role_path }}/files/plot_imb_pingpong.py {{ _pingpong_local_output }}"
6660
creates: "{{ _pingpong_local_output | dirname }}/latency.png"
6761
register: _pingpong_plot
6862
delegate_to: localhost
63+
become: false
6964
when: hpctests_pingpong_plot | bool
7065

7166
- debug:

ansible/roles/hpctests/tasks/setup.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,8 @@
2525
file:
2626
path: "{{ hpctests_rootdir }}"
2727
state: directory
28-
owner: "{{ ansible_user }}"
29-
group: "{{ ansible_user }}"
30-
become: true
28+
owner: "{{ hpctests_user }}"
29+
group: "{{ hpctests_user }}"
3130

3231
- name: Set fact for UCX_NET_DEVICES
3332
set_fact:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
hpctests_user: demo_user

0 commit comments

Comments
 (0)