diff --git a/.github/workflows/doca.yml b/.github/workflows/extra.yml similarity index 89% rename from .github/workflows/doca.yml rename to .github/workflows/extra.yml index cfd3bb982..dece242ce 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/extra.yml @@ -1,4 +1,4 @@ -name: Test DOCA extra build +name: Test extra build on: workflow_dispatch: push: @@ -7,16 +7,18 @@ on: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - - '.github/workflows/doca' + - 'ansible/roles/cuda/**' + - '.github/workflows/extra.yml' pull_request: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - - '.github/workflows/doca' + - 'ansible/roles/cuda/**' + - '.github/workflows/extra.yml' jobs: doca: - name: doca-build + name: extra-build concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true @@ -25,12 +27,14 @@ jobs: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 build: - - image_name: openhpc-doca-RL8 + - image_name: openhpc-extra-RL8 source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json - inventory_groups: doca - - image_name: openhpc-doca-RL9 + inventory_groups: doca,cuda + volume_size: 30 # needed for cuda + - image_name: openhpc-extra-RL9 source_image_name_key: RL9 - inventory_groups: doca + inventory_groups: doca,cuda + volume_size: 30 # needed for cuda env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -95,6 +99,7 @@ jobs: -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ -var "image_name=${{ matrix.build.image_name }}" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ + -var "volume_size=${{ matrix.build.volume_size }}" \ openstack.pkr.hcl - name: Get created image names from manifest diff --git 
a/ansible/cleanup.yml b/ansible/cleanup.yml index 3f059d157..670a99b29 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -66,5 +66,4 @@ slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" - name: Show image summary - debug: - var: image_info + command: cat /var/lib/image/image.json diff --git a/ansible/extras.yml b/ansible/extras.yml index 107f85252..0a74541a5 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -24,8 +24,9 @@ gather_facts: yes tags: cuda tasks: - - import_role: + - include_role: name: cuda + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}" - name: Persist hostkeys across rebuilds # Must be after filesystems.yml (for storage) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 55e56e612..c35be5b64 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -29,6 +29,14 @@ - import_playbook: bootstrap.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - name: Run post-bootstrap.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -220,8 +228,6 @@ import_role: name: doca -- import_playbook: disable-repos.yml - - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -229,6 +235,8 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: disable-repos.yml + - hosts: builder become: yes gather_facts: yes diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md index 141e7b80d..be6439cd5 100644 --- a/ansible/roles/cuda/README.md +++ b/ansible/roles/cuda/README.md @@ -1,6 +1,6 @@ # cuda -Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. 
+Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. ## Prerequisites @@ -8,8 +8,8 @@ Requires OFED to be installed to provide required kernel-* packages. ## Role Variables -- `cuda_distro`: Optional. Default `rhel8`. -- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo` -- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed. +- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. +- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. Default `560-open`. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version. - `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`. +- `cuda_package_version`: Optional. Default `12.6.3-1`. Use `latest` to install the latest packages if not installed, without upgrading already-installed packages. Use `'none'` to skip installing CUDA. - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. 
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 33a25d9b4..05f1e093d 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,7 +1,6 @@ -cuda_distro: "rhel{{ ansible_distribution_major_version }}" -cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo" -cuda_driver_stream: default -cuda_package_version: 'latest' +cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" +cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages +cuda_package_version: '12.6.3-1' cuda_packages: - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/install.yml similarity index 60% rename from ansible/roles/cuda/tasks/main.yml rename to ansible/roles/cuda/tasks/install.yml index 22f8e9e8e..51c92a0d3 100644 --- a/ansible/roles/cuda/tasks/main.yml +++ b/ansible/roles/cuda/tasks/install.yml @@ -1,7 +1,7 @@ # Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation -- name: Check for OFED +- name: Check for OFED/DOCA command: cmd: dnf list --installed rdma-core register: _dnf_rdma_core @@ -10,41 +10,53 @@ - name: Assert OFED installed assert: that: "'mlnx' in _dnf_rdma_core.stdout" - fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?" + fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?" 
- name: Install cuda repo get_url: - dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo" - url: "{{ cuda_repo }}" + dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo" + url: "{{ cuda_repo_url }}" - name: Check if nvidia driver module is enabled - shell: - cmd: dnf module list --enabled nvidia-driver + ansible.builtin.command: dnf module list --enabled nvidia-driver changed_when: false failed_when: false register: _cuda_driver_module_enabled - name: Enable nvidia driver module - ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" + ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}" register: _cuda_driver_module_enable when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" +- name: Check if nvidia driver module is installed + ansible.builtin.command: dnf module list --installed nvidia-driver + changed_when: false + failed_when: false + register: _cuda_driver_module_installed + - name: Install nvidia drivers ansible.builtin.command: dnf module install -y nvidia-driver register: _cuda_driver_install - when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" + when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr" changed_when: "'Nothing to do' not in _cuda_driver_install.stdout" +- name: Check kernel has not been modified + assert: + that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. 
kernel-devel-matched + fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" + - name: Install cuda packages ansible.builtin.dnf: name: "{{ cuda_packages }}" + when: cuda_package_version != 'none' register: cuda_package_install - name: Add cuda binaries to path lineinfile: path: /etc/profile.d/sh.local line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin' + when: cuda_package_version != 'none' - name: Enable NVIDIA Persistence Daemon systemd: @@ -60,3 +72,4 @@ - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 + when: cuda_package_install.changed diff --git a/ansible/roles/cuda/tasks/runtime.yml b/ansible/roles/cuda/tasks/runtime.yml new file mode 100644 index 000000000..c16a48c6f --- /dev/null +++ b/ansible/roles/cuda/tasks/runtime.yml @@ -0,0 +1,5 @@ +- name: Ensure NVIDIA Persistence Daemon state + systemd: + name: nvidia-persistenced + enabled: true + state: "{{ cuda_persistenced_state }}" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index db25176e2..be2f156a3 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241218-1011-5effb3fa", - "RL9": "openhpc-RL9-241218-1011-5effb3fa" + "RL8": "openhpc-RL8-241218-1705-09ac4268", + "RL9": "openhpc-RL9-241218-1705-09ac4268" } }