Skip to content

Commit 75139f4

Browse files
authored
tests/gpu: add AMD CDI test job (#654)
2 parents b754dfa + b186df4 commit 75139f4

File tree

2 files changed

+102
-1
lines changed

2 files changed

+102
-1
lines changed

.github/workflows/gpu-passthrough-tests.yml

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name: GPU passthrough
33
on:
44
pull_request:
55
paths:
6-
- '.github/workflows/nvidia-test.yml'
6+
- '.github/workflows/gpu-passthrough-tests.yml'
77
- '.github/workflows/testflinger/**'
88
schedule:
99
- cron: '38 6 */5 * *'
@@ -132,3 +132,35 @@ jobs:
132132
with:
133133
poll: true
134134
job-path: ${{ env.TESTFLINGER_DIR }}/nvidia-legacy-runtime-job.yml
135+
# AMD GPU passthrough (CDI) test job: for every snap track / Ubuntu release
# combination in the matrix, render the Testflinger job template and submit it.
amd-cdi:
  runs-on:
    - self-hosted
    - self-hosted-linux-amd64-jammy-private-endpoint-medium
  strategy:
    matrix:
      # Each axis falls back to a single-element default when the
      # corresponding workflow input is empty or unset.
      track: ${{ fromJSON(inputs.snap-track || '["latest/edge"]') }}
      os: ${{ fromJSON(inputs.ubuntu-releases || '["noble"]') }}
  env:
    TESTFLINGER_DIR: .github/workflows/testflinger
    SNAP_CHANNEL: ${{ matrix.track }}
  steps:
    # Surface the snap channel under test as a notice annotation.
    - name: Event data
      run: 'echo ::notice::Snap channel: $SNAP_CHANNEL'

    - name: Checkout code
      uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6.0.1

    # Fill in the job template in place. Only the three variables named in
    # the envsubst argument are substituted; any other `$...` text in the
    # template is left for the test script itself.
    - name: Create Testflinger job
      env:
        JOB_QUEUE: lxd-amd
        DISTRO: ${{ matrix.os }}
      run: |
        # Prepare job
        envsubst '$JOB_QUEUE $DISTRO $SNAP_CHANNEL' \
          < $TESTFLINGER_DIR/amd-cdi-job.yaml \
          > $TESTFLINGER_DIR/amd-cdi-job.temp
        mv $TESTFLINGER_DIR/amd-cdi-job.temp $TESTFLINGER_DIR/amd-cdi-job.yaml

    # Submit the rendered job and poll it (poll: true).
    - name: Run tests
      uses: canonical/testflinger/.github/actions/submit@main
      with:
        poll: true
        job-path: ${{ env.TESTFLINGER_DIR }}/amd-cdi-job.yaml
.github/workflows/testflinger/amd-cdi-job.yaml

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Testflinger job template for the AMD GPU (CDI) passthrough test.
# The JOB_QUEUE, DISTRO and SNAP_CHANNEL placeholders are filled in by the CI
# workflow (via envsubst) before the job is submitted; every other shell
# variable below is expanded when the test script runs.
job_queue: $JOB_QUEUE
global_timeout: 3600
output_timeout: 1800
provision_data:
  distro: $DISTRO

test_data:
  test_cmds: |
    #!/bin/bash

    # -e: abort on the first failing command; -u: unset variables are errors;
    # -x: trace commands; pipefail: a pipeline fails if any stage fails.
    set -xeuo pipefail

    # This is used, along with DEVICE_IP, by all scriptlets to access the device
    export DEVICE_USER="ubuntu"

    # retrieve the tools installer
    curl -Ls -o install_tools.sh https://raw.githubusercontent.com/canonical/hwcert-jenkins-tools/main/install_tools.sh

    # install the scriptlets and other tools on the agent and the device, as necessary
    # NOTE(review): helpers used below (_run, retry, wait_for_ssh,
    # wait_for_snap_changes, install_packages) are presumably provided by
    # this installer - confirm against hwcert-jenkins-tools.
    export TOOLS_PATH=tools
    source install_tools.sh $TOOLS_PATH

    # ensure device is available before continuing
    wait_for_ssh --allow-degraded

    # Wait for snapd to become available
    retry -- _run "timeout 5 sudo snap wait system seed.loaded"
    wait_for_snap_changes

    # Install AMD GPU drivers and utilities
    _run install_packages mesa-utils libdrm-amdgpu1 radeontop

    # Show kernel boot arguments for debugging
    _run cat /proc/cmdline

    # verify that AMD GPU devices are available
    # (with pipefail, a grep that matches nothing fails the script)
    _run lspci -k | grep -A 3 -i "vga\|display\|3d"
    _run ls -la /dev/dri/
    _run ls -la /dev/kfd

    # LXD working
    _run sudo snap install lxd --channel=$SNAP_CHANNEL --no-wait
    wait_for_snap_changes

    _run sudo lxd init --auto
    # -a appends to the supplementary groups; a bare -G would replace the
    # whole list and drop the user from every group not named here.
    _run sudo usermod -aG lxd ubuntu

    # Create the container in the background and retry attaching the GPU
    # (CDI id amd.com/gpu=0) once per second until it succeeds.
    # The bare `[ $i -lt 60 ]` is the timeout: under `set -e` it aborts the
    # script once the counter reaches 60 without a successful attach.
    _run lxc init ubuntu:noble c1 &
    for i in $( seq 1 60 ); do
        [ $i -lt 60 ]
        _run lxc config device add c1 gpu0 gpu id=amd.com/gpu=0 && break || /bin/true
        sleep 1
    done

    _run lxc start c1

    # print out all mounts we have so we can compare it later if anything goes wrong
    _run lxc exec c1 -- cat /proc/self/mountinfo

    # check that AMD devices are accessible in the container
    _run lxc exec c1 -- ls -la /dev/dri/
    _run lxc exec c1 -- ls -la /dev/kfd

    # Verify that card and renderD devices exist
    _run lxc exec c1 -- test -e /dev/dri/card0
    _run lxc exec c1 -- test -e /dev/dri/renderD128

    # Check that KFD device is accessible
    _run lxc exec c1 -- test -c /dev/kfd

0 commit comments

Comments
 (0)