Skip to content

Commit 115d514

Browse files
committed
Add defaults for GPU PCI passthrough configuration
1 parent 289470c commit 115d514

File tree

7 files changed

+245
-1
lines changed

7 files changed

+245
-1
lines changed
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
---
# Configure hosts in the groups named by gpu_group_map for GPU PCI
# passthrough: load the VFIO modules, blacklist nouveau, set KVM MSR
# handling and add the required kernel command line arguments.
- name: Enable GPU passthrough
  hosts: "{{ gpu_group_map.keys() | default([]) }}"
  vars:
    # This playbook will execute after nodes are deployed
    # and before overcloud host configure - we can't assume
    # users and venvs exist.
    ansible_user: "{{ bootstrap_user }}"
    ansible_python_interpreter: "/usr/bin/python3"
    ansible_ssh_common_args: "-o StrictHostKeyChecking=no"
    # Comma-separated "<vendor_id>:<product_id>" pairs for every GPU type
    # referenced in gpu_group_map, suitable for the vfio-pci.ids kernel arg.
    vfio_pci_ids: |-
      {% set output = [] %}
      {% for gpu_type in gpu_group_map | dict2items | map(attribute='value') | flatten | unique | default([]) %}
      {% set _ = output.append(stackhpc_gpu_data[gpu_type]['vendor_id'] + ':' + stackhpc_gpu_data[gpu_type]['product_id']) %}
      {% endfor %}
      {{ output | join(',') }}
  tasks:
    - name: Template dracut config
      blockinfile:
        path: /etc/dracut.conf.d/gpu-vfio.conf
        block: |
          add_drivers+="vfio vfio_iommu_type1 vfio_pci vfio_virqfd"
        owner: root
        group: root
        # Modes are quoted throughout so YAML does not reinterpret the
        # octal literal (unquoted 0660 parses as an integer).
        mode: "0660"
        create: true
      become: true
      notify:
        - Regenerate initramfs
        - reboot

    - name: Add vfio to modules-load.d
      blockinfile:
        path: /etc/modules-load.d/vfio.conf
        block: |
          vfio
          vfio_iommu_type1
          vfio_pci
          vfio_virqfd
        owner: root
        group: root
        mode: "0664"
        create: true
      become: true
      notify: reboot

    - name: Blacklist nouveau
      blockinfile:
        path: /etc/modprobe.d/blacklist-nouveau.conf
        block: |
          blacklist nouveau
          options nouveau modeset=0
        mode: "0664"
        owner: root
        group: root
        create: true
      become: true
      notify:
        - reboot
        - Regenerate initramfs

    - name: Ignore unsupported model specific registers
      # Occasionally, applications running in the VM may crash unexpectedly,
      # whereas they would run normally on a physical machine. If, while
      # running dmesg -wH, you encounter an error mentioning MSR, the reason
      # for those crashes is that KVM injects a General protection fault (GPF)
      # when the guest tries to access unsupported Model-specific registers
      # (MSRs) - this often results in guest applications/OS crashing. A
      # number of those issues can be solved by passing the ignore_msrs=1
      # option to the KVM module, which will ignore unimplemented MSRs.
      # source: https://wiki.archlinux.org/index.php/QEMU
      blockinfile:
        path: /etc/modprobe.d/kvm.conf
        block: |
          options kvm ignore_msrs=Y
          # This option is not available in centos 7 as the kernel is too old,
          # but it can help with dmesg spam in newer kernels (centos8?). Sample
          # dmesg log message:
          # [  +0.000002] kvm [8348]: vcpu0, guest rIP: 0xffffffffb0a767fa ignored rdmsr: 0x619
          # options kvm report_ignored_msrs=N
        mode: "0664"
        owner: root
        group: root
        create: true
      become: true
      notify: reboot

    - name: Add vfio-pci.ids kernel args
      include_role:
        name: stackhpc.grubcmdline
      vars:
        kernel_cmdline:
          - intel_iommu=on
          - iommu=pt
          - "vfio-pci.ids={{ vfio_pci_ids }}"
        kernel_cmdline_remove:
          - iommu
          - intel_iommu
          - vfio-pci.ids

  handlers:
    - name: Regenerate initramfs
      # NOTE(review): update-initramfs is Debian-family tooling, while the
      # dracut.conf.d task above targets dracut-based (Red Hat family)
      # hosts - confirm the target OS and use `dracut -f` if appropriate.
      command: /usr/sbin/update-initramfs -u
      become: true

    # import_playbook cannot be used as a handler (it is only valid at
    # playbook top level), so reboot the host directly with the reboot
    # module instead of importing reboot.yml.
    - name: reboot
      reboot:
      become: true
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../ansible/pci-passthrough.yml

etc/kayobe/kolla.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,12 @@ kolla_build_args: {}
499499
# List of names of additional host variables to pass through from kayobe hosts
500500
# to kolla-ansible hosts, if set. See also
501501
# kolla_overcloud_inventory_pass_through_host_vars_map.
502-
#kolla_overcloud_inventory_pass_through_host_vars_extra:
502+
# TODO: try and find a way of doing this without breaking existing usage of
503+
# the variable. Maybe override defaults instead? Less likely to conflict on
504+
# existing deployments
505+
kolla_overcloud_inventory_pass_through_host_vars_extra:
506+
- stackhpc_gpu_data
507+
- gpu_group_map
503508

504509
# List of names of host variables to pass through from kayobe hosts to
505510
# kolla-ansible hosts, if set. See also
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
{# Render a nova [pci] alias entry for every GPU type referenced in
   gpu_group_map. Nova parses each alias value as JSON, so every value -
   including hex vendor/product IDs - must be a quoted string. #}
[pci]
{% for item in gpu_group_map | dict2items | map(attribute='value') | flatten | unique | list %}
alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
{% endfor %}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
[pci]
{# The raw block defers evaluation to the second (per-host) templating
   pass, where group_names is available. Nova parses device_spec and
   alias values as JSON, so all values must be quoted strings. #}
{% raw %}
{% set gpu_list = [] %}
{% for gpu_type in gpu_group_map | dict2items | default([]) %}
{% if gpu_type.key in group_names %}
{% set _ = gpu_list.append(gpu_type.value) %}
{% endif %}
{% endfor %}
{% for item in gpu_list | flatten | unique %}
device_spec = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}" }
alias = { "vendor_id":"{{ stackhpc_gpu_data[item].vendor_id }}", "product_id":"{{ stackhpc_gpu_data[item].product_id }}", "device_type":"{{ stackhpc_gpu_data[item].device_type }}", "name":"{{ stackhpc_gpu_data[item].resource_name }}" }
{% endfor %}
{% endraw %}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
[filter_scheduler]
# Default list plus PciPassthroughFilter
# NOTE(Upgrade): defaults may change in each release. Default values can be
# checked here:
# https://docs.openstack.org/nova/latest/configuration/sample-config.html
enabled_filters = ComputeFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,ServerGroupAntiAffinityFilter,ServerGroupAffinityFilter,PciPassthroughFilter
# Required so the non-default PciPassthroughFilter above can be enabled.
available_filters = nova.scheduler.filters.all_filters
# A strong bias to prevent non-GPU workloads from scheduling onto the GPU node
# unless there is nothing else available within the system
pci_weight_multiplier = 100.0

etc/kayobe/stackhpc-compute.yml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
---
# StackHPC compute node configuration

# Map of inventory groups to GPU types.
# This is used to determine which GPU types each compute node should pass
# through to OpenStack.
# Keys are group names, values are a list of GPU types.
# Groups must be added to kolla_overcloud_inventory_top_level_group_map
# GPU types must be keys in stackhpc_gpu_data.
# Example GPU group map:
# gpu_group_map:
#   compute_a100:
#     - a100_80
#   compute_v100:
#     - v100_32
#   compute_multi_gpu:
#     - a100_80
#     - v100_32
gpu_group_map: {}

# Dict mapping GPUs to PCI data.
# Resource names are used to identify the device in placement, and can be
# edited to match deployment-specific naming conventions
# The default list covers many common GPUs, but can be extended as needed.
# All values are quoted strings: vendor/product IDs are hex and must not be
# parsed as numbers.
stackhpc_gpu_data:
  # Nvidia H100 SXM5 80GB
  h100_80_sxm:
    resource_name: "h100_80_sxm"
    vendor_id: "10de"
    product_id: "2330"
    device_type: "type-PF"
  # Nvidia A100 SXM4 80GB
  a100_80_sxm:
    resource_name: "a100_80_sxm"
    vendor_id: "10de"
    product_id: "20b2"
    device_type: "type-PF"
  # Nvidia A100 SXM4 40GB
  a100_40_sxm:
    resource_name: "a100_40_sxm"
    vendor_id: "10de"
    product_id: "20b0"
    device_type: "type-PF"
  # Nvidia A100 PCI 80GB
  a100_80:
    resource_name: "a100_80"
    vendor_id: "10de"
    product_id: "20b5"
    device_type: "type-PF"
  # Nvidia A100 PCI 40GB
  a100_40:
    resource_name: "a100_40"
    vendor_id: "10de"
    product_id: "20f1"
    device_type: "type-PF"
  # Nvidia V100 SXM3 32GB
  v100_32_sxm3:
    resource_name: "v100_32_sxm3"
    vendor_id: "10de"
    product_id: "1db8"
    device_type: "type-PCI"
  # Nvidia V100 SXM2 32GB
  v100_32_sxm2:
    resource_name: "v100_32_sxm2"
    vendor_id: "10de"
    product_id: "1db5"
    device_type: "type-PCI"
  # Nvidia V100 PCI 32GB
  v100_32:
    resource_name: "v100_32"
    vendor_id: "10de"
    product_id: "1db6"
    device_type: "type-PCI"
  # Nvidia RTX A6000
  a6000:
    resource_name: "a6000"
    vendor_id: "10de"
    product_id: "2230"
    device_type: "type-PCI"
  # Nvidia A40
  a40:
    resource_name: "a40"
    vendor_id: "10de"
    product_id: "2235"
    device_type: "type-PF"
  # Nvidia T4
  t4:
    resource_name: "t4"
    vendor_id: "10de"
    product_id: "1eb8"
    device_type: "type-PF"
  # Nvidia L40
  l40:
    resource_name: "l40"
    vendor_id: "10de"
    product_id: "26b5"
    device_type: "type-PF"
  # Nvidia L40s
  l40s:
    resource_name: "l40s"
    vendor_id: "10de"
    product_id: "26b9"
    device_type: "type-PF"

0 commit comments

Comments
 (0)