diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md
index 1e74d07f3..532f53193 100644
--- a/ansible/roles/cuda/README.md
+++ b/ansible/roles/cuda/README.md
@@ -10,6 +10,7 @@ Requires OFED to be installed to provide required kernel-* packages.
 
 - `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture.
 - `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version.
+- `cuda_nvidia_driver_version`: Optional. Version of the `nvidia-driver` module packages to install, in `version-release` format. Default `570.133.20-1.el9`.
 - `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-8']`.
 - `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA.
 - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml
index fd4bf37c8..d397ecfc8 100644
--- a/ansible/roles/cuda/defaults/main.yml
+++ b/ansible/roles/cuda/defaults/main.yml
@@ -1,5 +1,6 @@
 cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
 cuda_nvidia_driver_stream: '570-open'
+cuda_nvidia_driver_version: '570.133.20-1.el9' # version-release format
 cuda_package_version: '12.8.1-1'
 cuda_version_short: '12.8'
 cuda_packages:
diff --git a/ansible/roles/cuda/library/dnf_module_info.py b/ansible/roles/cuda/library/dnf_module_info.py
new file mode 100644
index 000000000..067fbc899
--- /dev/null
+++ b/ansible/roles/cuda/library/dnf_module_info.py
@@ -0,0 +1,97 @@
+from ansible.module_utils.basic import AnsibleModule
+import subprocess
+
+DOCUMENTATION = r'''
+module: dnf_module_info
+description: Return a list of packages which would be installed by a dnf module
+options:
+  name:
+    description: The dnf module name
+    required: true
+    type: str
+  stream:
+    description: The stream to query
+    required: true
+    type: str
+'''
+
+RETURN = r'''
+info:
+  description: The version/context etc for the module
+  type: list
+  returned: always
+profiles:
+  description: A mapping, keyed by profile name, where values are a list
+    of packages this profile will install
+  type: dict
+  returned: always
+stdout:
+  description: Raw stdout from the dnf command
+  type: str
+  returned: always
+'''
+
+
+def _parse_module_info(stdout):
+    """Parse the output of `dnf module info --profile`.
+
+    Returns a tuple `(info, profiles)`: `info` is the colon-separated fields
+    of the last `Name` line (an empty list if there is none) and `profiles`
+    maps each profile name to the list of packages listed for it.
+
+    Raises ValueError if a package line appears before any profile name.
+    """
+    info = []
+    profiles = {}
+    curr_profile_name = ''
+    for line in stdout.splitlines():
+        # skip metadata expiration info and anything which isn't "key : value"
+        if line.startswith('Last') or ':' not in line:
+            continue
+        if line.startswith('Name'):
+            info = [v.strip() for v in line.split(':')]
+            # a new module block starts here, so its profiles start afresh
+            curr_profile_name = ''
+            continue
+        # split on the first colon only, so a value which itself contains a
+        # colon cannot break the two-way unpacking
+        profile, pkg = (v.strip() for v in line.split(':', 1))
+        if profile != '' and profile != curr_profile_name:
+            curr_profile_name = profile
+            profiles[profile] = []
+        if curr_profile_name == '':
+            raise ValueError(f'package listed before any profile name: {line!r}')
+        profiles[curr_profile_name].append(pkg)
+    return info, profiles
+
+
+def dnf_module_packages():
+    """Run `dnf module info --profile` and return the parsed result to Ansible."""
+    module_args = dict(
+        name=dict(type='str', required=True),
+        stream=dict(type='str', required=True)
+    )
+    module = AnsibleModule(
+        argument_spec=module_args,
+        supports_check_mode=True
+    )
+    module_spec = f"{module.params['name']}:{module.params['stream']}"
+
+    # first get the list of packages installed by the module
+    cmd = ['dnf', 'module', 'info', '--profile', module_spec]
+    dnf = subprocess.run(cmd, capture_output=True, text=True)
+    failure = dict(rc=dnf.returncode, stdout=dnf.stdout, stderr=dnf.stderr)
+    if dnf.returncode != 0:
+        module.fail_json(msg=f'dnf module info failed for {module_spec}', **failure)
+
+    try:
+        info, profiles = _parse_module_info(dnf.stdout)
+    except ValueError as err:
+        module.fail_json(msg=str(err), **failure)
+    if not info:
+        # without a 'Name' line there is nothing to report; fail cleanly
+        module.fail_json(msg=f'no module information found for {module_spec}', **failure)
+
+    module.exit_json(changed=False, info=info, profiles=profiles, stdout=dnf.stdout)
+
+
+if __name__ == '__main__':
+    dnf_module_packages()
diff --git a/ansible/roles/cuda/tasks/install.yml b/ansible/roles/cuda/tasks/install.yml
index 51c92a0d3..f199ba889 100644
--- a/ansible/roles/cuda/tasks/install.yml
+++ b/ansible/roles/cuda/tasks/install.yml
@@ -29,17 +29,26 @@
   when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
   changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
 
-- name: Check if nvidia driver module is installed
-  ansible.builtin.command: dnf module list --installed nvidia-driver
-  changed_when: false
-  failed_when: false
-  register: _cuda_driver_module_installed
+- name: Read module info for list of packages
+  # Even a single version of the module stream contains multiple package versions
+  # so need to find the ones we want.
+  dnf_module_info:
+    name: nvidia-driver
+    stream: "{{ cuda_nvidia_driver_stream }}"
+  register: _cuda_driver_module_info
 
-- name: Install nvidia drivers
-  ansible.builtin.command: dnf module install -y nvidia-driver
+- name: Install nvidia driver packages for specific version
+  # entries in package list from above are e.g. 'nvidia-driver', where the
+  # actual package name is 'nvidia-driver-3:570.133.20-1.el9.x86_64'
+  # The epoch (3) and arch (x86_64) can be omitted but the release (1.el9)
+  # cannot be, so need to generate e.g. 'nvidia-driver-570.133.20-1.el9'
+  ansible.builtin.dnf:
+    name: >-
+      {{
+        _cuda_driver_module_info.profiles.default |
+        map('regex_replace', '$', '-' ~ cuda_nvidia_driver_version)
+      }}
   register: _cuda_driver_install
-  when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr"
-  changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"
 
 - name: Check kernel has not been modified
   assert: