Skip to content

Commit ba4941d

Browse files
committed
Merge branch 'main' into control-ip-addresses
2 parents 9b5ed88 + 8820cdb commit ba4941d

File tree

11 files changed

+42
-40
lines changed

11 files changed

+42
-40
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@ The default configuration in this repository may be used to create a cluster to
2525
- Persistent state backed by an OpenStack volume.
2626
- NFS-based shared file system backed by another OpenStack volume.
2727

28-
Note that the Open OnDemand portal and its remote apps are not usable with this default configuration.
29-
3028
It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud.
3129

3230
Before starting ensure that:

ansible/roles/cuda/README.md

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,10 @@
22

33
Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
44

5-
## Prerequisites
6-
7-
Requires OFED to be installed to provide required kernel-* packages.
8-
95
## Role Variables
106

117
- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture.
128
- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version.
13-
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-9']`.
9+
- `cuda_packages`: Optional. Default provides CUDA Toolkit and GPUDirect Storage (GDS).
1410
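- `cuda_package_version`: Optional. Version of the CUDA packages to install. Default is the release pinned in `defaults/main.yml` (currently `12.9.1-1`). Use `'none'` to skip installing CUDA. NB: `latest` (install the latest packages if not installed, without upgrading already-installed packages) was the previously documented default; confirm it is still supported with the default `cuda_packages`.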
1511
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.

ansible/roles/cuda/defaults/main.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
22
cuda_nvidia_driver_stream: '575-open'
3-
cuda_package_version: '12.9.0-1'
4-
cuda_version_short: '12.9'
3+
cuda_nvidia_driver_pkg: "nvidia-open-3:575.57.08-1.el{{ ansible_distribution_major_version }}"
4+
cuda_package_version: '12.9.1-1'
5+
cuda_version_short: "{{ (cuda_package_version | split('.'))[0:2] | join('.') }}" # major.minor
56
cuda_packages:
6-
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
7+
- "cuda-toolkit-{{ cuda_package_version }}"
78
- nvidia-gds
89
- cmake
9-
- cuda-toolkit-12-9
1010
cuda_samples_release_url: "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v{{ cuda_version_short }}.tar.gz"
1111
cuda_samples_path: "/var/lib/{{ ansible_user }}/cuda_samples"
1212
cuda_samples_programs:

ansible/roles/cuda/tasks/install.yml

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,5 @@
11

2-
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation
3-
4-
- name: Check for OFED/DOCA
5-
command:
6-
cmd: dnf list --installed rdma-core
7-
register: _dnf_rdma_core
8-
changed_when: false
9-
10-
- name: Assert OFED installed
11-
assert:
12-
that: "'mlnx' in _dnf_rdma_core.stdout"
13-
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?"
2+
# Based on https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/
143

154
- name: Install cuda repo
165
get_url:
@@ -29,23 +18,18 @@
2918
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
3019
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
3120

32-
- name: Check if nvidia driver module is installed
33-
ansible.builtin.command: dnf module list --installed nvidia-driver
34-
changed_when: false
35-
failed_when: false
36-
register: _cuda_driver_module_installed
37-
3821
- name: Install nvidia drivers
39-
ansible.builtin.command: dnf module install -y nvidia-driver
22+
ansible.builtin.dnf:
23+
name: "{{ cuda_nvidia_driver_pkg }}"
4024
register: _cuda_driver_install
41-
when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr"
42-
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
4325

4426
- name: Check kernel has not been modified
4527
assert:
4628
that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched
4729
fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}"
4830

31+
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
32+
4933
- name: Install cuda packages
5034
ansible.builtin.dnf:
5135
name: "{{ cuda_packages }}"

ansible/roles/tuned/tasks/configure.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,17 @@
1212
register: _tuned_profile_current
1313
changed_when: false
1414

15-
- name: Set tuned-adm profile
15+
- name: Set TuneD profile
1616
ansible.builtin.command:
1717
cmd: "tuned-adm profile {{ tuned_profile }}"
18-
when:
18+
when:
19+
- tuned_started | bool
20+
- tuned_profile not in _tuned_profile_current.stdout
21+
22+
- name: Verify TuneD profile
23+
ansible.builtin.command:
24+
cmd: tuned-adm verify
25+
changed_when: false
26+
when:
1927
- tuned_started | bool
2028
- tuned_profile not in _tuned_profile_current.stdout

ansible/roles/tuned/tasks/install.yml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,15 @@
22
- name: Install tuneD
33
ansible.builtin.dnf:
44
name: tuned
5-
state: present
5+
state: present
6+
7+
- name: Fix TuneD hpc-compute profile for hugepages
8+
# See https://github.com/redhat-performance/tuned/issues/752
9+
# This is done on install, not configure, so that it is available even
10+
# for compute-init nodes
11+
community.general.ini_file:
12+
path: /usr/lib/tuned/hpc-compute/tuned.conf
13+
section: sysctl
14+
option: vm.min_free_kbytes
15+
value: '>135168'
16+
no_extra_spaces: true

environments/.stackhpc/inventory/extra_groups

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ cluster
2929

3030
[tuned:children]
3131
# Install tuned into fat image
32+
# NB: builder has tuned_enabled and tuned_started false so does not configure it
3233
builder
34+
# Also test tuned during site playbook
35+
cluster
3336

3437
[squid:children]
3538
# Install squid into fat image
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Set profile which is not default (on VMs) for testing
2+
tuned_profile: hpc-compute
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"cluster_image": {
3-
"RL8": "openhpc-RL8-250514-1502-5a923b2c",
4-
"RL9": "openhpc-RL9-250514-1502-5a923b2c"
3+
"RL8": "openhpc-RL8-250610-1435-d0ef926e",
4+
"RL9": "openhpc-RL9-250610-1435-d0ef926e"
55
}
66
}

environments/common/inventory/group_vars/all/nfs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ nfs_configurations_extra: [] # site-specific nfs shares
3636

3737
nfs_configurations: >- # construct stackhpc.nfs variable
3838
{{
39-
nfs_configuration_home_volume if (cluster_home_volume | default(true)) else []
39+
(nfs_configuration_home_volume if (cluster_home_volume | default(true)) else [])
4040
+
4141
nfs_configuration_compute_nodes
4242
+

0 commit comments

Comments
 (0)