---
# Prepares hosts for GPU passthrough: VFIO modules are added to the initramfs
# and loaded at boot, nouveau is blacklisted, KVM is told to ignore unsupported
# MSRs and the IOMMU / vfio-pci.ids kernel arguments are set.
- name: Enable GPU passthrough
  # Target every host that appears as a key of gpu_group_map. The default is
  # applied before .keys() so an undefined map yields an empty host list, and
  # "| list" converts the Python 3 dict view into a real list.
  hosts: "{{ (gpu_group_map | default({})).keys() | list }}"
  vars:
    # This playbook will execute after nodes are deployed
    # and before overcloud host configure - we can't assume
    # users and venvs exist.
    ansible_user: "{{ bootstrap_user }}"
    ansible_python_interpreter: "/usr/bin/python3"
    ansible_ssh_common_args: "-o StrictHostKeyChecking=no"
    # Comma-separated "vendor_id:product_id" pairs, one per unique GPU type
    # referenced by any value in gpu_group_map. The IDs are looked up in
    # stackhpc_gpu_data, which is defined outside this playbook.
    vfio_pci_ids: |-
      {% set output = [] %}
      {% for gpu_type in gpu_group_map | default({}) | dict2items | map(attribute='value') | flatten | unique %}
      {% set _ = output.append(stackhpc_gpu_data[gpu_type]['vendor_id'] + ':' + stackhpc_gpu_data[gpu_type]['product_id']) %}
      {% endfor %}
      {{ output | join(',') }}
  tasks:
| 18 | + - name: Template dracut config |
| 19 | + blockinfile: |
| 20 | + path: /etc/dracut.conf.d/gpu-vfio.conf |
| 21 | + block: | |
| 22 | + add_drivers+="vfio vfio_iommu_type1 vfio_pci vfio_virqfd" |
| 23 | + owner: root |
| 24 | + group: root |
| 25 | + mode: 0660 |
| 26 | + create: true |
| 27 | + become: true |
| 28 | + notify: |
| 29 | + - Regenerate initramfs |
| 30 | + - reboot |
| 31 | + |
| 32 | + - name: Add vfio to modules-load.d |
| 33 | + blockinfile: |
| 34 | + path: /etc/modules-load.d/vfio.conf |
| 35 | + block: | |
| 36 | + vfio |
| 37 | + vfio_iommu_type1 |
| 38 | + vfio_pci |
| 39 | + vfio_virqfd |
| 40 | + owner: root |
| 41 | + group: root |
| 42 | + mode: 0664 |
| 43 | + create: true |
| 44 | + become: true |
| 45 | + notify: reboot |
| 46 | + |
| 47 | + - name: Blacklist nouveau |
| 48 | + blockinfile: |
| 49 | + path: /etc/modprobe.d/blacklist-nouveau.conf |
| 50 | + block: | |
| 51 | + blacklist nouveau |
| 52 | + options nouveau modeset=0 |
| 53 | + mode: 0664 |
| 54 | + owner: root |
| 55 | + group: root |
| 56 | + create: true |
| 57 | + become: true |
| 58 | + notify: |
| 59 | + - reboot |
| 60 | + - Regenerate initramfs |
| 61 | + |
| 62 | + - name: Ignore unsupported model specific registers |
| 63 | + # Occasionally, applications running in the VM may crash unexpectedly, |
| 64 | + # whereas they would run normally on a physical machine. If, while |
| 65 | + # running dmesg -wH, you encounter an error mentioning MSR, the reason |
| 66 | + # for those crashes is that KVM injects a General protection fault (GPF) |
| 67 | + # when the guest tries to access unsupported Model-specific registers |
| 68 | + # (MSRs) - this often results in guest applications/OS crashing. A |
| 69 | + # number of those issues can be solved by passing the ignore_msrs=1 |
| 70 | + # option to the KVM module, which will ignore unimplemented MSRs. |
| 71 | + # source: https://wiki.archlinux.org/index.php/QEMU |
| 72 | + blockinfile: |
| 73 | + path: /etc/modprobe.d/kvm.conf |
| 74 | + block: | |
| 75 | + options kvm ignore_msrs=Y |
| 76 | + # This option is not available in centos 7 as the kernel is too old, |
| 77 | + # but it can help with dmesg spam in newer kernels (centos8?). Sample |
| 78 | + # dmesg log message: |
| 79 | + # [ +0.000002] kvm [8348]: vcpu0, guest rIP: 0xffffffffb0a767fa ignored rdmsr: 0x619 |
| 80 | + # options kvm report_ignored_msrs=N |
| 81 | + mode: 0664 |
| 82 | + owner: root |
| 83 | + group: root |
| 84 | + create: true |
| 85 | + become: true |
| 86 | + notify: reboot |
| 87 | + |
| 88 | + - name: Add vfio-pci.ids kernel args |
| 89 | + include_role: |
| 90 | + name: stackhpc.grubcmdline |
| 91 | + vars: |
| 92 | + kernel_cmdline: |
| 93 | + - intel_iommu=on |
| 94 | + - iommu=pt |
| 95 | + - "vfio-pci.ids={{ vfio_pci_ids }}" |
| 96 | + kernel_cmdline_remove: |
| 97 | + - iommu |
| 98 | + - intel_iommu |
| 99 | + - vfio-pci.ids |
| 100 | + |
| 101 | + handlers: |
| 102 | + - name: Regenerate initramfs |
| 103 | + command: /usr/sbin/update-initramfs -u |
| 104 | + become: true |
| 105 | + |
| 106 | + # TODO: Check if this works |
| 107 | + - name: reboot |
| 108 | + import_playbook: reboot.yml |
0 commit comments