Skip to content

Commit 115d514

Browse files
committed
Add defaults for GPU PCI passthrough configuration
1 parent 289470c commit 115d514

File tree

7 files changed

+245
-1
lines changed

7 files changed

+245
-1
lines changed
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
---
# Configure hosts in the groups named by gpu_group_map for GPU PCI
# passthrough: load the VFIO modules, blacklist nouveau, set KVM MSR
# handling and add the required kernel command line arguments.
- name: Enable GPU passthrough
  hosts: "{{ gpu_group_map.keys() | default([]) }}"
  vars:
    # This playbook will execute after nodes are deployed
    # and before overcloud host configure - we can't assume
    # users and venvs exist.
    ansible_user: "{{ bootstrap_user }}"
    ansible_python_interpreter: "/usr/bin/python3"
    ansible_ssh_common_args: "-o StrictHostKeyChecking=no"
    # Comma-separated "<vendor_id>:<product_id>" pairs for every GPU type
    # referenced in gpu_group_map, suitable for the vfio-pci.ids kernel arg.
    vfio_pci_ids: |-
      {% set output = [] %}
      {% for gpu_type in gpu_group_map | dict2items | map(attribute='value') | flatten | unique | default([]) %}
      {% set _ = output.append(stackhpc_gpu_data[gpu_type]['vendor_id'] + ':' + stackhpc_gpu_data[gpu_type]['product_id']) %}
      {% endfor %}
      {{ output | join(',') }}
  tasks:
    - name: Template dracut config
      blockinfile:
        path: /etc/dracut.conf.d/gpu-vfio.conf
        block: |
          add_drivers+="vfio vfio_iommu_type1 vfio_pci vfio_virqfd"
        owner: root
        group: root
        # Modes are quoted throughout so YAML does not reinterpret the
        # octal literal (unquoted 0660 parses as an integer).
        mode: "0660"
        create: true
      become: true
      notify:
        - Regenerate initramfs
        - reboot

    - name: Add vfio to modules-load.d
      blockinfile:
        path: /etc/modules-load.d/vfio.conf
        block: |
          vfio
          vfio_iommu_type1
          vfio_pci
          vfio_virqfd
        owner: root
        group: root
        mode: "0664"
        create: true
      become: true
      notify: reboot

    - name: Blacklist nouveau
      blockinfile:
        path: /etc/modprobe.d/blacklist-nouveau.conf
        block: |
          blacklist nouveau
          options nouveau modeset=0
        mode: "0664"
        owner: root
        group: root
        create: true
      become: true
      notify:
        - reboot
        - Regenerate initramfs

    - name: Ignore unsupported model specific registers
      # Occasionally, applications running in the VM may crash unexpectedly,
      # whereas they would run normally on a physical machine. If, while
      # running dmesg -wH, you encounter an error mentioning MSR, the reason
      # for those crashes is that KVM injects a General protection fault (GPF)
      # when the guest tries to access unsupported Model-specific registers
      # (MSRs) - this often results in guest applications/OS crashing. A
      # number of those issues can be solved by passing the ignore_msrs=1
      # option to the KVM module, which will ignore unimplemented MSRs.
      # source: https://wiki.archlinux.org/index.php/QEMU
      blockinfile:
        path: /etc/modprobe.d/kvm.conf
        block: |
          options kvm ignore_msrs=Y
          # This option is not available in centos 7 as the kernel is too old,
          # but it can help with dmesg spam in newer kernels (centos8?). Sample
          # dmesg log message:
          # [  +0.000002] kvm [8348]: vcpu0, guest rIP: 0xffffffffb0a767fa ignored rdmsr: 0x619
          # options kvm report_ignored_msrs=N
        mode: "0664"
        owner: root
        group: root
        create: true
      become: true
      notify: reboot

    - name: Add vfio-pci.ids kernel args
      include_role:
        name: stackhpc.grubcmdline
      vars:
        kernel_cmdline:
          - intel_iommu=on
          - iommu=pt
          - "vfio-pci.ids={{ vfio_pci_ids }}"
        kernel_cmdline_remove:
          - iommu
          - intel_iommu
          - vfio-pci.ids

  handlers:
    - name: Regenerate initramfs
      # NOTE(review): update-initramfs is Debian-family tooling, while the
      # dracut.conf.d task above targets dracut-based (Red Hat family)
      # hosts - confirm the target OS and use `dracut -f` if appropriate.
      command: /usr/sbin/update-initramfs -u
      become: true

    # import_playbook cannot be used as a handler (it is only valid at
    # playbook top level), so reboot the host directly with the reboot
    # module instead of importing reboot.yml.
    - name: reboot
      reboot:
      become: true
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../ansible/pci-passthrough.yml

etc/kayobe/kolla.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,12 @@ kolla_build_args: {}
499499
# List of names of additional host variables to pass through from kayobe hosts
500500
# to kolla-ansible hosts, if set. See also
501501
# kolla_overcloud_inventory_pass_through_host_vars_map.
502-
#kolla_overcloud_inventory_pass_through_host_vars_extra:
502+
# TODO: try and find a way of doing this without breaking existing usage of
503+
# the variable. Maybe override defaults instead? Less likely to conflict on
504+
# existing deployments
505+
kolla_overcloud_inventory_pass_through_host_vars_extra:
506+
- stackhpc_gpu_data
507+
- gpu_group_map
503508

504509
# List of names of host variables to pass through from kayobe hosts to
505510
# kolla-ansible hosts, if set. See also
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
{# Render a nova [pci] alias entry for every GPU type referenced in
   gpu_group_map. Nova parses each alias value as JSON, so every value -
   including hex vendor/product IDs - must be a quoted string. #}
[pci]
{% for item in gpu_group_map | dict2items | map(attribute='value') | flatten | unique | list %}
alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
{% endfor %}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
[pci]
{# The raw block defers evaluation to the second (per-host) templating
   pass, where group_names is available. Nova parses device_spec and
   alias values as JSON, so all values must be quoted strings. #}
{% raw %}
{% set gpu_list = [] %}
{% for gpu_type in gpu_group_map | dict2items | default([]) %}
{% if gpu_type.key in group_names %}
{% set _ = gpu_list.append(gpu_type.value) %}
{% endif %}
{% endfor %}
{% for item in gpu_list | flatten | unique %}
device_spec = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}" }
alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
{% endfor %}
{% endraw %}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
[filter_scheduler]
# Default list plus PciPassthroughFilter
# NOTE(Upgrade): defaults may change in each release. Default values can be
# checked here:
# https://docs.openstack.org/nova/latest/configuration/sample-config.html
enabled_filters = ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,ServerGroupAntiAffinityFilter,ServerGroupAffinityFilter,PciPassthroughFilter
# Required so the non-default PciPassthroughFilter above can be enabled.
available_filters = nova.scheduler.filters.all_filters
# A strong bias to prevent non-GPU workloads from scheduling onto the GPU node
# unless there is nothing else available within the system
pci_weight_multiplier = 100.0

etc/kayobe/stackhpc-compute.yml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
---
# StackHPC compute node configuration

# Map of inventory groups to GPU types.
# This is used to determine which GPU types each compute node should pass
# through to OpenStack.
# Keys are group names, values are a list of GPU types.
# Groups must be added to kolla_overcloud_inventory_top_level_group_map
# GPU types must be keys in stackhpc_gpu_data.
# Example GPU group map:
# gpu_group_map:
#   compute_a100:
#     - a100_80
#   compute_v100:
#     - v100_32
#   compute_multi_gpu:
#     - a100_80
#     - v100_32
gpu_group_map: {}

# Dict mapping GPUs to PCI data.
# Resource names are used to identify the device in placement, and can be
# edited to match deployment-specific naming conventions
# The default list covers many common GPUs, but can be extended as needed.
# All values are quoted strings: vendor/product IDs are hex and must not be
# parsed as numbers.
stackhpc_gpu_data:
  # Nvidia H100 SXM5 80GB
  h100_80_sxm:
    resource_name: "h100_80_sxm"
    vendor_id: "10de"
    product_id: "2330"
    device_type: "type-PF"
  # Nvidia A100 SXM4 80GB
  a100_80_sxm:
    resource_name: "a100_80_sxm"
    vendor_id: "10de"
    product_id: "20b2"
    device_type: "type-PF"
  # Nvidia A100 SXM4 40GB
  a100_40_sxm:
    resource_name: "a100_40_sxm"
    vendor_id: "10de"
    product_id: "20b0"
    device_type: "type-PF"
  # Nvidia A100 PCI 80GB
  a100_80:
    resource_name: "a100_80"
    vendor_id: "10de"
    product_id: "20b5"
    device_type: "type-PF"
  # Nvidia A100 PCI 40GB
  a100_40:
    resource_name: "a100_40"
    vendor_id: "10de"
    product_id: "20f1"
    device_type: "type-PF"
  # Nvidia V100 SXM3 32GB
  v100_32_sxm3:
    resource_name: "v100_32_sxm3"
    vendor_id: "10de"
    product_id: "1db8"
    device_type: "type-PCI"
  # Nvidia V100 SXM2 32GB
  v100_32_sxm2:
    resource_name: "v100_32_sxm2"
    vendor_id: "10de"
    product_id: "1db5"
    device_type: "type-PCI"
  # Nvidia V100 PCI 32GB
  v100_32:
    resource_name: "v100_32"
    vendor_id: "10de"
    product_id: "1db6"
    device_type: "type-PCI"
  # Nvidia RTX A6000
  a6000:
    resource_name: "a6000"
    vendor_id: "10de"
    product_id: "2230"
    device_type: "type-PCI"
  # Nvidia A40
  a40:
    resource_name: "a40"
    vendor_id: "10de"
    product_id: "2235"
    device_type: "type-PF"
  # Nvidia T4
  t4:
    resource_name: "t4"
    vendor_id: "10de"
    product_id: "1eb8"
    device_type: "type-PF"
  # Nvidia L40
  l40:
    resource_name: "l40"
    vendor_id: "10de"
    product_id: "26b5"
    device_type: "type-PF"
  # Nvidia L40s
  l40s:
    resource_name: "l40s"
    vendor_id: "10de"
    product_id: "26b9"
    device_type: "type-PF"

0 commit comments

Comments
 (0)