diff --git a/ansible/roles/hpctests/README.md b/ansible/roles/hpctests/README.md index 5c5415158..ab32c9de8 100644 --- a/ansible/roles/hpctests/README.md +++ b/ansible/roles/hpctests/README.md @@ -29,9 +29,22 @@ Role Variables - `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use). Alternatively a mapping of partition name (as `hpctests_partition`) to device/interface can be used. For partitions not defined in the mapping the default of `all` is used. - `hpctests_outdir`: Optional. Directory to use for test output on local host. Defaults to `$HOME/hpctests` (for local user). - `hpctests_hpl_NB`: Optional, default 192. The HPL block size "NB" - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/intel-oneapi-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html). -- `hpctests_hpl_mem_frac`: Optional, default 0.8. The HPL problem size "N" will be selected to target using this fraction of each node's memory. +- `hpctests_hpl_mem_frac`: Optional, default 0.3. The HPL problem size "N" will + be selected to target using this fraction of each node's memory - + **CAUTION: see note below**. - `hpctests_hpl_arch`: Optional, default 'linux64'. Arbitrary architecture name for HPL build. HPL is compiled on the first compute node of those selected (see `hpctests_nodes`), so this can be used to create different builds for different types of compute node. + +--- +**CAUTION** + +> The default of `hpctests_hpl_mem_frac=0.3` will not significantly load nodes. 
+Values up to ~0.8 may be appropriate for a stress test, but ensure cloud +operators are aware in case this overloads e.g. power supplies or cooling. +Values > 0.8 require longer runtimes and increase the risk of out-of-memory +errors, usually without significantly increasing the stress on the node. +--- + The following variables should not generally be changed: - `hpctests_pre_cmd`: Optional. Command(s) to include in sbatch templates before module load commands. - `hpctests_pingmatrix_modules`: Optional. List of modules to load for pingmatrix test. Defaults are suitable for OpenHPC 2.x cluster using the required packages. diff --git a/ansible/roles/hpctests/defaults/main.yml b/ansible/roles/hpctests/defaults/main.yml index 30ddd8952..eb1864229 100644 --- a/ansible/roles/hpctests/defaults/main.yml +++ b/ansible/roles/hpctests/defaults/main.yml @@ -9,7 +9,7 @@ hpctests_outdir: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hpctests" hpctests_ucx_net_devices: all hpctests_hpl_version: "2.3" hpctests_hpl_NB: 192 -hpctests_hpl_mem_frac: 0.8 +hpctests_hpl_mem_frac: 0.3 hpctests_hpl_arch: linux64 #hpctests_nodes: #hpctests_partition: diff --git a/docs/production.md b/docs/production.md index 57a64e5f8..9ba67a573 100644 --- a/docs/production.md +++ b/docs/production.md @@ -127,3 +127,6 @@ and referenced from the `site` and `production` environments, e.g.: - Note [PR 473](https://github.com/stackhpc/ansible-slurm-appliance/pull/473) may help identify any site-specific configuration. + +- See the [hpctests docs](../ansible/roles/hpctests/README.md) for advice on + raising `hpctests_hpl_mem_frac` during tests.