Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ansible/roles/cuda/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Requires OFED to be installed to provide required kernel-* packages.

- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture.
- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version.
- `cuda_nvidia_driver_version`: Optional. Version of `nvidia-driver` module to install, in `version-release` format, e.g. `570.133.20-1.el9`.
- `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds', 'cmake', 'cuda-toolkit-12-8']`.
- `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA.
- `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`.
1 change: 1 addition & 0 deletions ansible/roles/cuda/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo"
cuda_nvidia_driver_stream: '570-open'
cuda_nvidia_driver_version: '570.133.20-1.el9' # version-release format
cuda_package_version: '12.8.1-1'
cuda_version_short: '12.8'
cuda_packages:
Expand Down
79 changes: 79 additions & 0 deletions ansible/roles/cuda/library/dnf_module_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from ansible.module_utils.basic import AnsibleModule
import subprocess

# Module documentation text: names the module, gives a one-line description
# and declares its two required string options, `name` (the dnf module name)
# and `stream` (the stream to query).
DOCUMENTATION = r'''
module: dnf_module_info
description: Return a list of packages which would be installed by a dnf module
options:
name:
description: The dnf module name
required: true
type: str
stream:
description: The stream to query
required: true
type: str
'''

# Return-value documentation text: describes the three keys the module
# places in its result -- `info` (list), `profiles` (dict keyed by profile
# name) and `stdout` (raw dnf output).
RETURN = r'''
info:
description: The version/context etc for the module
type: list
returned: always
profiles:
description: A mapping, keyed by profile name, where values are a list
of packages this profile will install
type: dict
returned: always
stdout:
description: Raw stdout from the dnf command
type: str
returned: always
'''

def dnf_module_packages():
    """Ansible module entry point: report the packages in a dnf module stream.

    Runs ``dnf module info --profile <name>:<stream>`` and parses its output.
    The module never changes the system, so it always reports
    ``changed=False`` and is safe to run in check mode.

    Exits via ``module.exit_json`` with:
      * ``info``: the colon-separated fields of the ``Name`` line,
      * ``profiles``: mapping of profile name to its list of package names,
      * ``stdout``: the raw stdout of the dnf command.

    Exits via ``module.fail_json`` when dnf cannot be run, returns a
    non-zero exit status, or produces output that cannot be parsed.
    """
    module_args = dict(
        name=dict(type='str', required=True),
        stream=dict(type='str', required=True),
    )
    result = {'changed': False}

    module = AnsibleModule(
        argument_spec=module_args,
        supports_check_mode=True,
    )
    dnf_module_name = module.params['name']
    dnf_module_stream = module.params['stream']

    # first get the list of packages installed by the module
    cmd = ['dnf', 'module', 'info', '--profile', f'{dnf_module_name}:{dnf_module_stream}']
    try:
        dnf = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as err:
        # e.g. dnf is not installed on the target host
        module.fail_json(msg=f'Failed to run {cmd[0]}: {err}', cmd=cmd)

    if dnf.returncode != 0:
        # Report dnf's own error rather than failing later on missing output.
        module.fail_json(
            msg=f'dnf module info failed for {dnf_module_name}:{dnf_module_stream}',
            cmd=cmd,
            rc=dnf.returncode,
            stdout=dnf.stdout,
            stderr=dnf.stderr,
        )

    try:
        info, profiles = _parse_dnf_module_profiles(dnf.stdout)
    except ValueError as err:
        module.fail_json(
            msg=f'Could not parse dnf module info output line: {err}',
            cmd=cmd,
            stdout=dnf.stdout,
            stderr=dnf.stderr,
        )

    if not info:
        # rc was 0 but no 'Name' line was printed: nothing usable to return.
        module.fail_json(
            msg=f'No module information found for {dnf_module_name}:{dnf_module_stream}',
            cmd=cmd,
            stdout=dnf.stdout,
            stderr=dnf.stderr,
        )

    result['info'] = info
    result['profiles'] = profiles
    result['stdout'] = dnf.stdout
    module.exit_json(**result)


def _parse_dnf_module_profiles(stdout):
    """Parse the text printed by ``dnf module info --profile``.

    Returns a tuple ``(info, profiles)`` where ``info`` is the list of
    stripped colon-separated fields from the ``Name`` line (empty if there is
    none) and ``profiles`` maps each profile name to its list of packages.
    If the output holds several module blocks, the last one wins for both.

    Raises ``ValueError`` (carrying the offending line) when a package line
    appears before any profile name has been seen.
    """
    info = []
    profiles = {}
    curr_profile_name = None
    for line in stdout.splitlines():
        if line.startswith('Last'):  # metadata expiration info
            continue
        if ':' not in line:
            continue
        if line.startswith('Name'):
            info = [v.strip() for v in line.split(':')]
            # A new module block starts: its profiles replace earlier ones.
            curr_profile_name = None
            continue

        # Split on the first colon only so a value containing ':' is kept whole.
        profile, _, pkg = line.partition(':')
        profile = profile.strip()
        pkg = pkg.strip()

        if profile and profile != curr_profile_name:
            curr_profile_name = profile
            profiles[profile] = []
        if curr_profile_name is None:
            # Continuation line with no profile to attach it to.
            raise ValueError(line)
        if pkg:
            profiles[curr_profile_name].append(pkg)

    return info, profiles


# Run the module only when this file is executed as a script.
if __name__ == '__main__':
    dnf_module_packages()
27 changes: 18 additions & 9 deletions ansible/roles/cuda/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,26 @@
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"

- name: Check if nvidia driver module is installed
ansible.builtin.command: dnf module list --installed nvidia-driver
changed_when: false
failed_when: false
register: _cuda_driver_module_installed
- name: Read module info for list of packages
# Even a single version of the module stream contains multiple package versions,
# so we need to find the ones we want.
dnf_module_info:
name: nvidia-driver
stream: "{{ cuda_nvidia_driver_stream }}"
register: _cuda_driver_module_info

- name: Install nvidia drivers
ansible.builtin.command: dnf module install -y nvidia-driver
- name: Install nvidia driver packages for specific version
# Entries in the package list from above are bare names, e.g. 'nvidia-driver',
# whereas the full package spec is e.g. 'nvidia-driver-3:570.133.20-1.el9.x86_64'.
# The epoch (3) and arch (x86_64) can be omitted but the release (1.el9)
# cannot, so we need to generate e.g. 'nvidia-driver-570.133.20-1.el9'.
ansible.builtin.dnf:
name: >-
{{
_cuda_driver_module_info.profiles.default |
map('regex_replace', '$', '-' ~ cuda_nvidia_driver_version)
}}
register: _cuda_driver_install
when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr"
changed_when: "'Nothing to do' not in _cuda_driver_install.stdout"

- name: Check kernel has not been modified
assert:
Expand Down