diff --git a/assets/state-vfio-manager/0400_configmap.yaml b/assets/state-vfio-manager/0400_configmap.yaml deleted file mode 100644 index ffc1ab281..000000000 --- a/assets/state-vfio-manager/0400_configmap.yaml +++ /dev/null @@ -1,234 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: nvidia-vfio-manager - namespace: "FILLED BY THE OPERATOR" - labels: - app: nvidia-vfio-manager -data: - vfio-manage.sh: |- - #!/bin/sh - - set -eu - - usage() - { - cat >&2 < "$existing_driver/unbind" - echo > "/sys/bus/pci/devices/$gpu/driver_override" - } - - # unbind device from non vfio-pci driver - unbind_from_other_driver() { - gpu=$1 - - [ -e "/sys/bus/pci/devices/$gpu/driver" ] || return 0 - - existing_driver=$(readlink -f "/sys/bus/pci/devices/$gpu/driver") - existing_driver_name=$(basename "$existing_driver") - - # return if bound to vfio-pci - [ "$existing_driver_name" != "vfio-pci" ] || return 0 - echo "unbinding device $gpu from driver $existing_driver_name" - echo "$gpu" > "$existing_driver/unbind" - echo > "/sys/bus/pci/devices/$gpu/driver_override" - } - - is_nvidia_gpu_device() { - gpu=$1 - # make sure device class is for NVIDIA GPU - device_class_file=$(readlink -f "/sys/bus/pci/devices/$gpu/class") - device_class=$(cat "$device_class_file") - [ "$device_class" = "0x030200" ] || [ "$device_class" = "0x030000" ] || return 1 - return 0 - } - - is_bound_to_vfio() { - gpu=$1 - - # return if not bound to any driver - [ -e "/sys/bus/pci/devices/$gpu/driver" ] || return 1 - - existing_driver=$(readlink -f "/sys/bus/pci/devices/$gpu/driver") - existing_driver_name=$(basename "$existing_driver") - - echo "existing driver is $existing_driver_name" - # return if bound to other drivers(nvidia, nouveau) - [ "$existing_driver_name" = "vfio-pci" ] || return 1 - - # bound to vfio - return 0 - } - - unbind_device() { - gpu=$1 - - if ! 
is_nvidia_gpu_device $gpu; then - return 0 - fi - - echo "unbinding device $gpu" - unbind_from_driver "$gpu" - #for graphics mode, we need to unbind the auxiliary device as well - aux_dev=$(get_graphics_aux_dev "$gpu") - if [ "$aux_dev" != "NONE" ]; then - echo "gpu $gpu is in graphics mode aux_dev $aux_dev" - unbind_from_driver "$aux_dev" - fi - } - - unbind_all() { - for dev in /sys/bus/pci/devices/*; do - read -r vendor < "$dev/vendor" - if [ "$vendor" = "0x10de" ]; then - dev_id=$(basename "$dev") - unbind_device "$dev_id" - fi - done - } - - bind_pci_device() { - gpu=$1 - - if ! is_bound_to_vfio $gpu; then - unbind_from_other_driver $gpu - echo "binding device $gpu" - echo "vfio-pci" > "/sys/bus/pci/devices/$gpu/driver_override" - echo "$gpu" > /sys/bus/pci/drivers/vfio-pci/bind - else - echo "device $gpu already bound to vfio-pci" - fi - } - - get_graphics_aux_dev() { - gpu=$1 - device_class_file=$(readlink -f "/sys/bus/pci/devices/$gpu/class") - device_class=$(cat "$device_class_file") - if [ "$device_class" != "0x030000" ]; then - echo "NONE" - return - fi - - if ls "/sys/bus/pci/devices/$gpu" | grep consumer >/dev/null 2>&1; then - aux_dev=$(ls "/sys/bus/pci/devices/$gpu" | grep consumer | awk -Fconsumer:pci: '{print $2}') - if [ "$aux_dev" = "" ]; then - echo "NONE" - return - fi - - if ls "/sys/bus/pci/devices/$aux_dev/" >/dev/null 2>&1; then - echo "$aux_dev" - return - fi - fi - - echo "NONE" - } - - bind_device() { - gpu=$1 - - if ! is_nvidia_gpu_device "$gpu"; then - echo "device $gpu is not a gpu!" 
- return 0 - fi - - bind_pci_device "$gpu" - #for graphics mode, we need to bind the auxiliary device as well - aux_dev=$(get_graphics_aux_dev "$gpu") - if [ "$aux_dev" != "NONE" ]; then - echo "gpu $gpu is in graphics mode aux_dev $aux_dev" - bind_pci_device "$aux_dev" - fi - } - - bind_all() { - for dev in /sys/bus/pci/devices/*; do - read -r vendor < "$dev/vendor" - if [ "$vendor" = "0x10de" ]; then - dev_id=$(basename "$dev") - bind_device "$dev_id" - fi - done - } - - handle_bind() { - chroot /host modprobe vfio-pci - if [ "$DEVICE_ID" != "" ]; then - bind_device "$DEVICE_ID" - elif [ "$ALL_DEVICES" = "true" ]; then - bind_all - else - usage - fi - } - - handle_unbind() { - if [ "$DEVICE_ID" != "" ]; then - unbind_device "$DEVICE_ID" - elif [ "$ALL_DEVICES" = "true" ]; then - unbind_all - else - usage - fi - } - - if [ $# -eq 0 ]; then - usage - fi - - command=$1; shift - case "${command}" in - bind) options=$(getopt -o ad: --long all,device-id: -- "$@");; - unbind) options=$(getopt -o ad: --long all,device-id: -- "$@");; - help) options="" ;; - *) usage ;; - esac - if [ $? 
-ne 0 ]; then - usage - fi - - eval set -- "${options}" - - DEVICE_ID="" - for opt in ${options}; do - case "$opt" in - -a | --all) ALL_DEVICES=true; shift 1 ;; - -d | --device-id) DEVICE_ID=$2; shift 2 ;; - -h | --help) shift;; - --) shift; break ;; - esac - done - if [ $# -ne 0 ]; then - usage - fi - - if [ "$command" = "help" ]; then - usage - elif [ "$command" = "bind" ]; then - handle_bind - elif [ "$command" = "unbind" ]; then - handle_unbind - else - echo "Unknown function: $command" - exit 1 - fi diff --git a/assets/state-vfio-manager/0500_scc.openshift.yaml b/assets/state-vfio-manager/0400_scc.openshift.yaml similarity index 100% rename from assets/state-vfio-manager/0500_scc.openshift.yaml rename to assets/state-vfio-manager/0400_scc.openshift.yaml diff --git a/assets/state-vfio-manager/0600_daemonset.yaml b/assets/state-vfio-manager/0500_daemonset.yaml similarity index 87% rename from assets/state-vfio-manager/0600_daemonset.yaml rename to assets/state-vfio-manager/0500_daemonset.yaml index c726aea41..3ddd87e9e 100644 --- a/assets/state-vfio-manager/0600_daemonset.yaml +++ b/assets/state-vfio-manager/0500_daemonset.yaml @@ -62,7 +62,7 @@ spec: imagePullPolicy: IfNotPresent command: ["/bin/sh", "-c"] args: - - /bin/vfio-manage.sh bind --all && sleep inf + - vfio-manage bind --all && while true; do sleep 86400; done resources: limits: memory: 200Mi @@ -70,12 +70,11 @@ spec: cpu: 100m memory: 200Mi volumeMounts: - - name: nvidia-vfio-manager - readOnly: true - mountPath: /bin/vfio-manage.sh - subPath: vfio-manage.sh - name: host-sys mountPath: /sys + - name: host-lib-modules + mountPath: /lib/modules + readOnly: true - name: host-root mountPath: /host securityContext: @@ -85,17 +84,17 @@ spec: lifecycle: preStop: exec: - command: ["/bin/sh", "-c", "/bin/vfio-manage.sh unbind --all"] + command: ["vfio-manage", "unbind", "--all"] terminationGracePeriodSeconds: 30 volumes: - - name: nvidia-vfio-manager - configMap: - name: nvidia-vfio-manager - defaultMode: 448 - 
name: host-sys hostPath: path: /sys type: Directory + - name: host-lib-modules + hostPath: + path: /lib/modules + type: Directory - name: run-nvidia hostPath: path: /run/nvidia diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index ce6174488..9a64c5b1e 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -224,8 +224,9 @@ spec: image: ghcr.io/nvidia/gpu-operator:main-latest - name: k8s-driver-manager-image image: nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:c549346eb993fda62e9bf665aabaacc88abc06b0b24e69635427d4d71c2d5ed4 + # TODO: update to a k8s-driver-manager image from nvcr.io once the next version is published - name: vfio-manager-image - image: nvcr.io/nvidia/cuda@sha256:d19fe621624c4eb6ac931b8558daa3ecc0c3f07f1e2a52e0267e083d22dceade + image: ghcr.io/nvidia/k8s-driver-manager:0d30fa9c - name: sandbox-device-plugin-image image: nvcr.io/nvidia/kubevirt-gpu-device-plugin@sha256:119de9a331a47203858b99901f44d0c4a8052961b4e60327f4b100d0ab8c9df0 - name: vgpu-device-manager-image @@ -940,8 +941,9 @@ spec: value: "nvcr.io/nvidia/cloud-native/k8s-mig-manager@sha256:8e0803d2f29776cd4cc0501381a20a0b04b2da507a794d66a15894c57beaadb5" - name: "CUDA_BASE_IMAGE" value: "nvcr.io/nvidia/cuda@sha256:d19fe621624c4eb6ac931b8558daa3ecc0c3f07f1e2a52e0267e083d22dceade" + # TODO: update to a k8s-driver-manager image from nvcr.io once the next version is published - name: "VFIO_MANAGER_IMAGE" - value: "nvcr.io/nvidia/cuda@sha256:d19fe621624c4eb6ac931b8558daa3ecc0c3f07f1e2a52e0267e083d22dceade" + value: "ghcr.io/nvidia/k8s-driver-manager:0d30fa9c" - name: "SANDBOX_DEVICE_PLUGIN_IMAGE" value: "nvcr.io/nvidia/kubevirt-gpu-device-plugin@sha256:119de9a331a47203858b99901f44d0c4a8052961b4e60327f4b100d0ab8c9df0" - name: "VGPU_DEVICE_MANAGER_IMAGE" diff --git 
a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 9f292cccb..184697143 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -445,9 +445,10 @@ vgpuDeviceManager: vfioManager: enabled: true - repository: nvcr.io/nvidia - image: cuda - version: 13.0.1-base-ubi9 + # TODO: update to a k8s-driver-manager image from nvcr.io once the next version is published + repository: ghcr.io/nvidia + image: k8s-driver-manager + version: "0d30fa9c" imagePullPolicy: IfNotPresent imagePullSecrets: [] env: []