Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
398 changes: 134 additions & 264 deletions doc/source/operations/gpu-in-openstack.rst

Large diffs are not rendered by default.

142 changes: 142 additions & 0 deletions etc/kayobe/ansible/pci-passthrough.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
---
- name: Enable GPU passthrough
  hosts: "{{ (gpu_group_map | default({})).keys() }}"
  vars:
    # This playbook will execute after nodes are deployed
    # and before overcloud host configure - we can't assume
    # users and venvs exist.
    ansible_user: "{{ bootstrap_user }}"
    ansible_ssh_common_args: "-o StrictHostKeyChecking=no"
    ansible_python_interpreter: "/usr/bin/python3"
    # Comma-separated "<vendor_id>:<product_id>" pairs for every GPU type
    # mapped (via gpu_group_map) to a group this host belongs to. Consumed
    # by the vfio-pci.ids kernel argument below.
    # NOTE: default({}) must be applied before dict2items - an undefined
    # variable would otherwise fail inside dict2items before the default
    # could take effect.
    vfio_pci_ids: |-
      {% set gpu_list = [] %}
      {% set output = [] %}
      {% for gpu_group in gpu_group_map | default({}) | dict2items %}
      {% if gpu_group.key in group_names %}
      {% set _ = gpu_list.append(gpu_group.value) %}
      {% endif %}
      {% endfor %}
      {% for item in gpu_list | flatten | unique %}
      {% set _ = output.append(stackhpc_gpu_data[item]['vendor_id'] + ':' + stackhpc_gpu_data[item]['product_id']) %}
      {% endfor %}
      {{ output | join(',') }}
    reboot_timeout_s: "{{ 20 * 60 }}"
  tasks:
    - name: Template dracut config
      ansible.builtin.blockinfile:
        path: /etc/dracut.conf.d/gpu-vfio.conf
        block: |
          add_drivers+="vfio vfio_iommu_type1 vfio_pci vfio_virqfd"
        owner: root
        group: root
        # Octal modes are quoted so YAML does not reinterpret the literal
        # (unquoted 0660 parses as the integer 432 in YAML 1.1).
        mode: "0660"
        create: true
      become: true
      notify:
        - Regenerate initramfs
        - reboot

    - name: Add vfio to modules-load.d
      # NOTE(review): vfio_virqfd was folded into the core vfio module in
      # kernel 5.13+, so loading it may log a harmless error on newer
      # kernels - confirm against target distributions.
      ansible.builtin.blockinfile:
        path: /etc/modules-load.d/vfio.conf
        block: |
          vfio
          vfio_iommu_type1
          vfio_pci
          vfio_virqfd
        owner: root
        group: root
        mode: "0664"
        create: true
      become: true
      notify: reboot

    - name: Blacklist nouveau
      ansible.builtin.blockinfile:
        path: /etc/modprobe.d/blacklist-nouveau.conf
        block: |
          blacklist nouveau
          options nouveau modeset=0
        mode: "0664"
        owner: root
        group: root
        create: true
      become: true
      notify:
        - reboot
        - Regenerate initramfs

    - name: Ignore unsupported model specific registers
      # Occasionally, applications running in the VM may crash unexpectedly,
      # whereas they would run normally on a physical machine. If, while
      # running dmesg -wH, you encounter an error mentioning MSR, the reason
      # for those crashes is that KVM injects a General protection fault (GPF)
      # when the guest tries to access unsupported Model-specific registers
      # (MSRs) - this often results in guest applications/OS crashing. A
      # number of those issues can be solved by passing the ignore_msrs=1
      # option to the KVM module, which will ignore unimplemented MSRs.
      # source: https://wiki.archlinux.org/index.php/QEMU
      ansible.builtin.blockinfile:
        path: /etc/modprobe.d/kvm.conf
        block: |
          options kvm ignore_msrs=Y
          # This option is not available in centos 7 as the kernel is too old,
          # but it can help with dmesg spam in newer kernels (centos8?). Sample
          # dmesg log message:
          # [ +0.000002] kvm [8348]: vcpu0, guest rIP: 0xffffffffb0a767fa ignored rdmsr: 0x619
          # options kvm report_ignored_msrs=N
        mode: "0664"
        owner: root
        group: root
        create: true
      become: true
      notify: reboot

    - name: Add vfio-pci.ids kernel args
      ansible.builtin.include_role:
        name: stackhpc.linux.grubcmdline
      vars:
        kernel_cmdline:
          - intel_iommu=on
          - iommu=pt
          - "vfio-pci.ids={{ vfio_pci_ids }}"
        kernel_cmdline_remove:
          - iommu
          - intel_iommu
          - vfio-pci.ids

  handlers:
    - name: Regenerate initramfs (RedHat)
      listen: Regenerate initramfs
      ansible.builtin.shell: |-
        #!/bin/bash
        set -eux
        dracut -v -f /boot/initramfs-$(uname -r).img $(uname -r)
      become: true
      changed_when: true
      when: ansible_facts.os_family == 'RedHat'

    - name: Regenerate initramfs (Debian)
      listen: Regenerate initramfs
      ansible.builtin.shell: |-
        #!/bin/bash
        set -eux
        update-initramfs -u -k $(uname -r)
      become: true
      changed_when: true
      when: ansible_facts.os_family == 'Debian'

    - name: Reboot
      listen: reboot
      become: true
      ansible.builtin.reboot:
        reboot_timeout: "{{ reboot_timeout_s }}"
        search_paths:
          # Systems running molly-guard hang waiting for confirmation before rebooting without this.
          - /lib/molly-guard
          # Default list:
          - /sbin
          - /bin
          - /usr/sbin
          - /usr/bin
          - /usr/local/sbin
22 changes: 21 additions & 1 deletion etc/kayobe/kolla.yml
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,24 @@ kolla_build_args: {}
# * groups: A list of kayobe ansible groups to map to this kolla-ansible group.
# * vars: A dict mapping variable names to values for hosts in this
# kolla-ansible group.
# NOTE(Alex-Welsh): If you want to extend the map rather than replace it, you
# must include the Kayobe defaults in the mapping.
# Standard Kayobe defaults:
# compute:
# groups:
# - "compute"
# control:
# groups:
# - "controllers"
# monitoring:
# groups:
# - "controllers"
# network:
# groups:
# - "controllers"
# storage:
# groups:
# - "controllers"
#kolla_overcloud_inventory_top_level_group_map:

# List of names of top level kolla-ansible groups. Any of these groups which
Expand All @@ -499,7 +517,9 @@ kolla_build_args: {}
# List of names of additional host variables to pass through from kayobe hosts
# to kolla-ansible hosts, if set. See also
# kolla_overcloud_inventory_pass_through_host_vars_map.
#kolla_overcloud_inventory_pass_through_host_vars_extra:
kolla_overcloud_inventory_pass_through_host_vars_extra:
- stackhpc_gpu_data
- gpu_group_map

# List of names of host variables to pass through from kayobe hosts to
# kolla-ansible hosts, if set. See also
Expand Down
4 changes: 4 additions & 0 deletions etc/kayobe/kolla/config/nova/nova-api.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{# Define one PCI alias per GPU type referenced anywhere in gpu_group_map.
   nova-api needs the same alias definitions as nova-compute so that flavor
   extra specs referencing an alias can be resolved. Rendered by Kayobe, so
   gpu_group_map and stackhpc_gpu_data are resolved on the control host. #}
[pci]
{% for item in gpu_group_map | dict2items | map(attribute='value') | flatten | unique | list %}
alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
{% endfor %}
13 changes: 13 additions & 0 deletions etc/kayobe/kolla/config/nova/nova-compute.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{# The raw block defers the inner Jinja so it is rendered per-host by
   kolla-ansible (where group_names reflects the kolla inventory), using
   gpu_group_map and stackhpc_gpu_data passed through from Kayobe host
   vars. Emits a device_spec (whitelist) and matching alias for each GPU
   type mapped to a group this host belongs to. #}
[pci]
{% raw %}
{% set gpu_list = [] %}
{# default({}) must come before dict2items: an undefined gpu_group_map
   would fail inside dict2items before the default could apply. #}
{% for gpu_group in gpu_group_map | default({}) | dict2items %}
{% if gpu_group.key in group_names %}
{% set _ = gpu_list.append(gpu_group.value) %}
{% endif %}
{% endfor %}
{% for item in gpu_list | flatten | unique %}
device_spec = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}" }
alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
{% endfor %}
{% endraw %}
7 changes: 7 additions & 0 deletions etc/kayobe/kolla/config/nova/nova-scheduler.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[filter_scheduler]
# Default list plus PciPassthroughFilter
# PciPassthroughFilter is required so that instances requesting a PCI alias
# (e.g. GPU flavors) are only scheduled to hosts exposing matching devices.
# NOTE(Upgrade): defaults may change in each release. Default values can be
# checked here:
# https://docs.openstack.org/nova/latest/configuration/sample-config.html
enabled_filters = ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,ServerGroupAntiAffinityFilter,ServerGroupAffinityFilter,PciPassthroughFilter
available_filters = nova.scheduler.filters.all_filters
103 changes: 103 additions & 0 deletions etc/kayobe/stackhpc-compute.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
---
# StackHPC compute node configuration

# Map of inventory groups to GPU types.
# This is used to determine which GPU types each compute node should pass
# through to OpenStack.
# Keys are group names, values are a list of GPU types.
# Groups must be added to kolla_overcloud_inventory_top_level_group_map
# GPU types must be keys in stackhpc_gpu_data.
# Example GPU group map:
# gpu_group_map:
#   compute_a100:
#     - a100_80
#   compute_v100:
#     - v100_32
#   compute_multi_gpu:
#     - a100_80
#     - v100_32
gpu_group_map: {}

# Dict mapping GPUs to PCI data.
# Resource names are used to identify the device in placement, and can be
# edited to match deployment-specific naming conventions
# The default list covers many common GPUs, but can be extended as needed.
# Vendor/product IDs are quoted strings: they are hex values and must not be
# reinterpreted by YAML.
stackhpc_gpu_data:
  # Nvidia H100 SXM5 80GB
  h100_80_sxm:
    resource_name: "{{ h100_80_sxm_resource_name | default('h100_80_sxm') }}"
    vendor_id: "10de"
    product_id: "2330"
    device_type: "type-PF"
  # Nvidia A100 SXM4 80GB (SXM4, not SXM5 - SXM5 is the H100 form factor)
  a100_80_sxm:
    resource_name: "{{ a100_80_sxm_resource_name | default('a100_80_sxm') }}"
    vendor_id: "10de"
    product_id: "20b2"
    device_type: "type-PF"
  # Nvidia A100 SXM4 40GB
  a100_40_sxm:
    resource_name: "{{ a100_40_sxm_resource_name | default('a100_40_sxm') }}"
    vendor_id: "10de"
    product_id: "20b0"
    device_type: "type-PF"
  # Nvidia A100 PCI 80GB
  a100_80:
    resource_name: "{{ a100_80_resource_name | default('a100_80') }}"
    vendor_id: "10de"
    product_id: "20b5"
    device_type: "type-PF"
  # Nvidia A100 PCI 40GB
  a100_40:
    resource_name: "{{ a100_40_resource_name | default('a100_40') }}"
    vendor_id: "10de"
    product_id: "20f1"
    device_type: "type-PF"
  # Nvidia V100 SXM3 32GB
  v100_32_sxm3:
    resource_name: "{{ v100_32_sxm3_resource_name | default('v100_32_sxm3') }}"
    vendor_id: "10de"
    product_id: "1db8"
    device_type: "type-PCI"
  # Nvidia V100 SXM2 32GB
  v100_32_sxm2:
    resource_name: "{{ v100_32_sxm2_resource_name | default('v100_32_sxm2') }}"
    vendor_id: "10de"
    product_id: "1db5"
    device_type: "type-PCI"
  # Nvidia V100 PCI 32GB
  v100_32:
    resource_name: "{{ v100_32_resource_name | default('v100_32') }}"
    vendor_id: "10de"
    product_id: "1db6"
    device_type: "type-PCI"
  # Nvidia RTX A6000
  a6000:
    resource_name: "{{ a6000_resource_name | default('a6000') }}"
    vendor_id: "10de"
    product_id: "2230"
    device_type: "type-PCI"
  # Nvidia A40
  a40:
    resource_name: "{{ a40_resource_name | default('a40') }}"
    vendor_id: "10de"
    product_id: "2235"
    device_type: "type-PF"
  # Nvidia T4
  t4:
    resource_name: "{{ t4_resource_name | default('t4') }}"
    vendor_id: "10de"
    product_id: "1eb8"
    device_type: "type-PF"
  # Nvidia L40
  l40:
    resource_name: "{{ l40_resource_name | default('l40') }}"
    vendor_id: "10de"
    product_id: "26b5"
    device_type: "type-PF"
  # Nvidia L40s
  l40s:
    resource_name: "{{ l40s_resource_name | default('l40s') }}"
    vendor_id: "10de"
    product_id: "26b9"
    device_type: "type-PF"
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
features:
- |
Added templates and a playbook to simplify configuration of PCI passthrough
GPUs. GPU types can be mapped to inventory groups with the
``gpu_group_map`` variable, which will configure the host and Nova
automatically. A list of supported GPUs can be found in
``etc/kayobe/stackhpc-compute.yml`` under ``stackhpc_gpu_data``.
Loading