Skip to content

Commit 7f84fed

Browse files
committed
merge conflicts
2 parents 357f7e2 + fed2d6e commit 7f84fed

File tree

14 files changed

+77
-43
lines changed

14 files changed

+77
-43
lines changed

.github/workflows/doca.yml renamed to .github/workflows/extra.yml

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Test DOCA extra build
1+
name: Test extra build
22
on:
33
workflow_dispatch:
44
push:
@@ -7,16 +7,18 @@ on:
77
paths:
88
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
99
- 'ansible/roles/doca/**'
10-
- '.github/workflows/doca'
10+
- 'ansible/roles/cuda/**'
11+
- '.github/workflows/extra.yml'
1112
pull_request:
1213
paths:
1314
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
1415
- 'ansible/roles/doca/**'
15-
- '.github/workflows/doca'
16+
- 'ansible/roles/cuda/**'
17+
- '.github/workflows/extra.yml'
1618

1719
jobs:
1820
doca:
19-
name: doca-build
21+
name: extra-build
2022
concurrency:
2123
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
2224
cancel-in-progress: true
@@ -25,12 +27,14 @@ jobs:
2527
fail-fast: false # allow other matrix jobs to continue even if one fails
2628
matrix: # build RL8, RL9
2729
build:
28-
- image_name: openhpc-doca-RL8
30+
- image_name: openhpc-extra-RL8
2931
source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
30-
inventory_groups: doca
31-
- image_name: openhpc-doca-RL9
32+
inventory_groups: doca,cuda
33+
volume_size: 30 # needed for cuda
34+
- image_name: openhpc-extra-RL9
3235
source_image_name_key: RL9
33-
inventory_groups: doca
36+
inventory_groups: doca,cuda
37+
volume_size: 30 # needed for cuda
3438
env:
3539
ANSIBLE_FORCE_COLOR: True
3640
OS_CLOUD: openstack
@@ -95,6 +99,7 @@ jobs:
9599
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
96100
-var "image_name=${{ matrix.build.image_name }}" \
97101
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
102+
-var "volume_size=${{ matrix.build.volume_size }}" \
98103
openstack.pkr.hcl
99104
100105
- name: Get created image names from manifest

.github/workflows/fatimage.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@ jobs:
2323
matrix: # build RL8, RL9
2424
build:
2525
- image_name: openhpc-RL8
26-
source_image_name: rocky-latest-RL8
27-
inventory_groups: control,compute,login
26+
source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2
27+
inventory_groups: control,compute,login,update
2828
- image_name: openhpc-RL9
29-
source_image_name: rocky-latest-RL9
30-
inventory_groups: control,compute,login
29+
source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
30+
inventory_groups: control,compute,login,update
3131
env:
3232
ANSIBLE_FORCE_COLOR: True
3333
OS_CLOUD: openstack

ansible/cleanup.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,4 @@
6666
slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"
6767

6868
- name: Show image summary
69-
debug:
70-
var: image_info
69+
command: cat /var/lib/image/image.json

ansible/extras.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
gather_facts: yes
2525
tags: cuda
2626
tasks:
27-
- import_role:
27+
- include_role:
2828
name: cuda
29+
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
2930

3031
- name: Persist hostkeys across rebuilds
3132
# Must be after filesystems.yml (for storage)

ansible/fatimage.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@
2929

3030
- import_playbook: bootstrap.yml
3131

32+
- hosts: doca
33+
become: yes
34+
gather_facts: yes
35+
tasks:
36+
- name: Install NVIDIA DOCA
37+
import_role:
38+
name: doca
39+
3240
- name: Run post-bootstrap.yml hook
3341
vars:
3442
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
@@ -220,15 +228,15 @@
220228
import_role:
221229
name: doca
222230

223-
- import_playbook: disable-repos.yml
224-
225231
- name: Run post.yml hook
226232
vars:
227233
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
228234
hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
229235
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
230236
when: hook_path | exists
231237

238+
- import_playbook: disable-repos.yml
239+
232240
- hosts: builder
233241
become: yes
234242
gather_facts: yes

ansible/roles/cuda/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# cuda
22

3-
Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
3+
Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled.
44

55
## Prerequisites
66

77
Requires OFED to be installed to provide required kernel-* packages.
88

99
## Role Variables
1010

11-
- `cuda_distro`: Optional. Default `rhel8`.
12-
- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo`
13-
- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed.
11+
- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture.
12+
- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version.
1413
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`.
14+
- `cuda_package_version`: Optional. Version of the `cuda` package to install. Default `12.6.3-1`. Use `latest` to install the latest packages if not installed (already-installed packages are not upgraded). Use `'none'` to skip installing CUDA.
1515
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.

ansible/roles/cuda/defaults/main.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
2-
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
3-
cuda_driver_stream: default
4-
cuda_package_version: 'latest'
1+
cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
2+
cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages
3+
cuda_package_version: '12.6.3-1'
54
cuda_packages:
65
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
76
- nvidia-gds

ansible/roles/cuda/tasks/main.yml renamed to ansible/roles/cuda/tasks/install.yml

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
# Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation
33

4-
- name: Check for OFED
4+
- name: Check for OFED/DOCA
55
command:
66
cmd: dnf list --installed rdma-core
77
register: _dnf_rdma_core
@@ -10,41 +10,53 @@
1010
- name: Assert OFED installed
1111
assert:
1212
that: "'mlnx' in _dnf_rdma_core.stdout"
13-
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?"
13+
fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?"
1414

1515
- name: Install cuda repo
1616
get_url:
17-
dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo"
18-
url: "{{ cuda_repo }}"
17+
dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo"
18+
url: "{{ cuda_repo_url }}"
1919

2020
- name: Check if nvidia driver module is enabled
21-
shell:
22-
cmd: dnf module list --enabled nvidia-driver
21+
ansible.builtin.command: dnf module list --enabled nvidia-driver
2322
changed_when: false
2423
failed_when: false
2524
register: _cuda_driver_module_enabled
2625

2726
- name: Enable nvidia driver module
28-
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
27+
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}"
2928
register: _cuda_driver_module_enable
3029
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
3130
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
3231

32+
- name: Check if nvidia driver module is installed
33+
ansible.builtin.command: dnf module list --installed nvidia-driver
34+
changed_when: false
35+
failed_when: false
36+
register: _cuda_driver_module_installed
37+
3338
- name: Install nvidia drivers
3439
ansible.builtin.command: dnf module install -y nvidia-driver
3540
register: _cuda_driver_install
36-
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
41+
when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr"
3742
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
3843

44+
- name: Check kernel has not been modified
45+
assert:
46+
that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched
47+
fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}"
48+
3949
- name: Install cuda packages
4050
ansible.builtin.dnf:
4151
name: "{{ cuda_packages }}"
52+
when: cuda_package_version != 'none'
4253
register: cuda_package_install
4354

4455
- name: Add cuda binaries to path
4556
lineinfile:
4657
path: /etc/profile.d/sh.local
4758
line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin'
59+
when: cuda_package_version != 'none'
4860

4961
- name: Enable NVIDIA Persistence Daemon
5062
systemd:
@@ -60,3 +72,4 @@
6072
- name: Wait for hosts to be reachable
6173
wait_for_connection:
6274
sleep: 15
75+
when: cuda_package_install.changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
- name: Ensure NVIDIA Persistence Daemon state
2+
systemd:
3+
name: nvidia-persistenced
4+
enabled: true
5+
state: "{{ cuda_persistenced_state }}"

docs/experimental/pulp.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# Pulp Server
22

3-
In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark. Using a local Pulp can be enabled by adding `pulp` to the build groups and overriding `appliances_pulp_url` to point at the local Pulp's URL.
3+
In order to ensure reproducible builds, the appliance can build images using repository mirrors from StackHPC's "Ark" Pulp server. The appliance can sync relevant repositories to a local Pulp server which will then be used instead of Ark.
44

55
## Deploying/configuring Pulp Server
66

77
### Deploying a Pulp server
8-
A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml' playbook. This can be run with
8+
A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. This can be run with
99
`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server=<target_host>"`
1010
where `target_host` is any resolvable host. This will print a Pulp URL which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network.
1111

0 commit comments

Comments
 (0)