Skip to content

Commit b864e36

Browse files
committed
Device Metrics Exporter CI AMDSMI integration
Signed-off-by: yalmusaf <Yazen.ALMusaffar@amd.com>
1 parent 24f443f commit b864e36

File tree

1 file changed

+351
-0
lines changed

1 file changed

+351
-0
lines changed
Lines changed: 351 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,351 @@
1+
# ──────────────────────────────────────────────────────────────
2+
# DME ↔ AMDSMI Integration CI
3+
# ──────────────────────────────────────────────────────────────
4+
# Architecture:
#   GPU-Agent (C++) ──gRPC──▶ DME (Go) ──▶ Prometheus /metrics
#       │                      │
#       └── libamd_smi.so ◀────┘   (built from projects/amdsmi)
8+
#
9+
# This workflow builds AMDSMI from the super-repo, compiles
# GPU-Agent (using that AMDSMI build) and Device Metrics Exporter,
# starts both, and checks that the exporter's Prometheus /metrics
# endpoint returns HTTP 200.
12+
# ──────────────────────────────────────────────────────────────
13+
14+
name: DME AMDSMI Integration CI

# Runs for pull requests and pushes targeting `develop` that touch the
# AMDSMI project or this workflow file, and on manual dispatch.
on:
  pull_request:
    branches:
      - develop
    paths:
      - 'projects/amdsmi/**'
      - '.github/workflows/dme-amdsmi-ci.yml'
  push:
    branches:
      - develop
    paths:
      - 'projects/amdsmi/**'
      - '.github/workflows/dme-amdsmi-ci.yml'
  # Manual runs may choose which DME / GPU Agent refs to build.
  workflow_dispatch:
    inputs:
      dme_branch:
        type: string
        default: 'main'
        description: 'DME branch/tag to test against'
      gpu_agent_branch:
        type: string
        default: 'main'
        description: 'GPU Agent branch/tag (fallback if submodule missing)'
37+
38+
# The workflow token only gets read access to repository contents.
permissions:
  contents: read

# One active run per git ref: a newer run in the same group cancels the
# one still in progress.
concurrency:
  cancel-in-progress: true
  group: dme-amdsmi-integration-${{ github.ref }}
44+
45+
# Workflow-wide environment shared by every step.
env:
  # Build settings
  DEBIAN_FRONTEND: 'noninteractive'
  BUILD_TYPE: 'Release'
  GO_VERSION: '1.25.5'

  # Upstream sources; the refs default to 'main' when no dispatch input
  # is provided.
  DME_REPO: 'https://github.com/ROCm/device-metrics-exporter.git'
  GPU_AGENT_REPO: 'https://github.com/ROCm/gpu-agent.git'
  DME_BRANCH: ${{ inputs.dme_branch || 'main' }}
  GPU_AGENT_BRANCH: ${{ inputs.gpu_agent_branch || 'main' }}

  # Runtime locations
  DME_METRICS_PORT: '5000'
  DME_DIR: '/tmp/dme'
  # GPU Agent Makefile hardcodes ABS_DIR to this path
  GPU_AGENT_WORKDIR: '/usr/src/github.com/ROCm/gpu-agent'
57+
58+
jobs:
  integration-test:
    name: DME Integration • ${{ matrix.os }}
    # Runner label and container image are read from repository variables.
    runs-on: ${{ vars.RUNNER_TYPE }}
    # A failure of this job does not fail the overall workflow run.
    continue-on-error: true
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        os:
          - Ubuntu22
    container:
      # Resolves to the variable named "<os>_DOCKER_IMAGE".
      image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
      # Passes the /dev/kfd and /dev/dri device nodes into the container.
      options: '--rm --privileged --device=/dev/kfd --device=/dev/dri --group-add video'
75+
76+
steps:
77+
# ────────────────────────────────────────────────────────
78+
# Setup
79+
# ────────────────────────────────────────────────────────
80+
# Check out the repository that triggered the run into the workspace.
- name: Checkout Super-repo
  uses: actions/checkout@v4
82+
83+
# Locate the AMDSMI project root and export it as AMDSMI_DIR for later steps.
- name: Set Project Directory
  run: |
    # First directory (at most 3 levels below the workspace) that holds a
    # CMakeLists.txt and lives under an "amdsmi" path component.
    PROJECT_DIR=$(find "$(pwd)" -maxdepth 3 -name "CMakeLists.txt" \
      -path "*/amdsmi/*" -exec dirname {} \; | head -1)

    # Fail here rather than exporting an empty AMDSMI_DIR, which would only
    # surface later as a confusing cd/cmake error in the build step.
    if [ -z "${PROJECT_DIR}" ]; then
      echo "::error::No amdsmi CMakeLists.txt found under $(pwd)"
      exit 1
    fi

    echo "AMDSMI_DIR=${PROJECT_DIR}" >> "$GITHUB_ENV"
    echo "AMDSMI project: ${PROJECT_DIR}"
89+
90+
# Install the toolchain and libraries used by the later build steps.
- name: Install System Dependencies
  run: |
    # Refresh the package index, trying up to three times with a 5 s pause
    # after each failed attempt. Installation proceeds either way.
    attempts_left=3
    while [ "${attempts_left}" -gt 0 ]; do
      if apt-get update; then
        break
      fi
      sleep 5
      attempts_left=$((attempts_left - 1))
    done

    apt-get install -y --no-install-recommends \
      build-essential \
      cmake \
      git \
      curl \
      wget \
      ca-certificates \
      pkg-config \
      libdrm-dev \
      libpci-dev \
      autoconf \
      automake \
      libtool \
      unzip
100+
101+
# Install the pinned Go toolchain under /usr/local/go and expose it on PATH.
- name: Install Go ${{ env.GO_VERSION }}
  run: |
    go_tarball=/tmp/go.tar.gz
    go_url="https://go.dev/dl/go${{ env.GO_VERSION }}.linux-amd64.tar.gz"

    wget -q "${go_url}" -O "${go_tarball}"

    # Replace any existing installation, then discard the archive.
    rm -rf /usr/local/go
    tar -C /usr/local -xzf "${go_tarball}"
    rm -f "${go_tarball}"

    # Make the toolchain and the user's Go bin directory available to
    # subsequent steps.
    printf '%s\n' "/usr/local/go/bin" "${HOME}/go/bin" >> "$GITHUB_PATH"

    /usr/local/go/bin/go version
111+
112+
# ────────────────────────────────────────────────────────
113+
# Phase 1 — Build AMDSMI from the super-repo
114+
# ────────────────────────────────────────────────────────
115+
# Configure, compile and install AMDSMI into /opt/rocm, then list the result.
- name: Build and Install AMDSMI
  run: |
    # Configure in a "build" directory inside the AMDSMI project.
    echo "::group::cmake configure"
    cd "${{ env.AMDSMI_DIR }}"
    mkdir -p build
    cd build
    cmake .. \
      -DBUILD_TESTS=ON \
      -DENABLE_ESMI_LIB=ON \
      -DCMAKE_INSTALL_PREFIX=/opt/rocm \
      -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }}
    echo "::endgroup::"

    # Compile with one job per available processor.
    echo "::group::make"
    parallel_jobs="$(nproc)"
    make -j "${parallel_jobs}"
    echo "::endgroup::"

    make install
    ldconfig

    # Listing the library fails the step if nothing was installed.
    echo "=== Verify AMDSMI installation ==="
    ls -la /opt/rocm/lib/libamd_smi.so*
    ls /opt/rocm/include/amd_smi/ | head -10
137+
138+
# ────────────────────────────────────────────────────────
139+
# Phase 2 — Clone and prepare build environment
140+
# ────────────────────────────────────────────────────────
141+
# Clone DME with its submodules; if the gpu-agent submodule did not produce
# gpuagent/sw, clone gpu-agent directly into that location instead.
- name: Clone Device Metrics Exporter
  run: |
    # The branch names may originate from workflow_dispatch inputs, so they
    # are read from the step environment (workflow-level env) instead of
    # being spliced into the script text by expression interpolation, where
    # shell metacharacters in the value would be executed as code.
    git clone --recurse-submodules --depth 1 \
      -b "${DME_BRANCH}" \
      "${DME_REPO}" "${DME_DIR}"

    # Verify gpu-agent submodule was cloned
    if [ ! -d "${DME_DIR}/gpuagent/sw" ]; then
      echo "GPU Agent submodule not populated — cloning separately"
      rm -rf "${DME_DIR}/gpuagent"
      git clone --recurse-submodules --depth 1 \
        -b "${GPU_AGENT_BRANCH}" \
        "${GPU_AGENT_REPO}" "${DME_DIR}/gpuagent"
    fi

    echo "=== DME directory ==="
    ls "${DME_DIR}/"
158+
159+
# Expose the gpu-agent sources at the path its Makefile expects and seed
# them with the AMDSMI library and headers installed under /opt/rocm.
- name: Prepare GPU Agent Build Environment
  run: |
    agent_src="${{ env.DME_DIR }}/gpuagent"
    agent_workdir="${{ env.GPU_AGENT_WORKDIR }}"

    # GPU Agent Makefile hardcodes ABS_DIR — symlink to match
    mkdir -p "$(dirname "${agent_workdir}")"
    ln -sfn "${agent_src}" "${agent_workdir}"

    # Copy locally-built AMDSMI to gpu-agent's expected paths
    smi_third_party="${agent_src}/sw/nic/third-party/rocm/amd_smi_lib"
    smi_lib_dir="${smi_third_party}/x86_64/lib"
    smi_inc_dir="${smi_third_party}/include"
    mkdir -p "${smi_inc_dir}" "${smi_lib_dir}"
    cp -v /opt/rocm/lib/libamd_smi.so* "${smi_lib_dir}/"
    cp -rv /opt/rocm/include/amd_smi/* "${smi_inc_dir}/"

    # Place libs in the runtime lib dir as well
    runtime_lib_dir="${agent_src}/sw/nic/build/x86_64/sim/lib"
    mkdir -p "${runtime_lib_dir}"
    cp -v /opt/rocm/lib/libamd_smi.so* "${runtime_lib_dir}/"

    echo "=== GPU Agent build environment ready ==="
179+
180+
# ────────────────────────────────────────────────────────
181+
# Phase 3 — Build GPU Agent (C++ binary)
182+
# ────────────────────────────────────────────────────────
183+
# Build the third-party C++ libraries that GPU Agent depends on.
- name: Build GPU Agent Third-party Libraries
  timeout-minutes: 60
  run: |
    export PATH="/usr/local/go/bin:${HOME}/go/bin:${PATH}"
    cd "${{ env.GPU_AGENT_WORKDIR }}/sw/nic/gpuagent"

    echo "Building third-party C++ libraries (protobuf, gRPC, abseil, …)"
    echo "::group::build-libs"
    # Capture make's own exit status. Piping straight into tail would make
    # the step report tail's status and hide a failed build. The full
    # output goes to a file and only its last 50 lines are shown.
    build_rc=0
    make build-libs > /tmp/build-libs.log 2>&1 || build_rc=$?
    tail -50 /tmp/build-libs.log
    echo "::endgroup::"

    if [ "${build_rc}" -ne 0 ]; then
      echo "::error::make build-libs failed with exit code ${build_rc}"
      exit "${build_rc}"
    fi
193+
194+
# Vendor Go dependencies, build the gpuagent target and list the binaries.
- name: Build GPU Agent Binary
  timeout-minutes: 30
  run: |
    export PATH="/usr/local/go/bin:${HOME}/go/bin:${PATH}"
    cd "${{ env.GPU_AGENT_WORKDIR }}/sw/nic/gpuagent"

    go mod vendor

    echo "::group::make gpuagent"
    # Capture make's own exit status. Piping straight into tail would make
    # the step report tail's status and hide a failed build. The full
    # output goes to a file and only its last 40 lines are shown.
    build_rc=0
    make gpuagent > /tmp/make-gpuagent.log 2>&1 || build_rc=$?
    tail -40 /tmp/make-gpuagent.log
    echo "::endgroup::"

    if [ "${build_rc}" -ne 0 ]; then
      echo "::error::make gpuagent failed with exit code ${build_rc}"
      exit "${build_rc}"
    fi

    BLD_BIN="${{ env.GPU_AGENT_WORKDIR }}/sw/nic/build/x86_64/sim/bin"
    echo "=== GPU Agent build artifacts ==="
    ls -la "${BLD_BIN}/"
209+
210+
# ────────────────────────────────────────────────────────
211+
# Phase 4 — Build Device Metrics Exporter (Go binary)
212+
# ────────────────────────────────────────────────────────
213+
# Install the Go protobuf and gRPC code-generator plugins (latest versions).
- name: Install Go Protobuf Tools
  run: |
    export PATH="/usr/local/go/bin:${HOME}/go/bin:${PATH}"

    for tool in \
      google.golang.org/protobuf/cmd/protoc-gen-go \
      google.golang.org/grpc/cmd/protoc-gen-go-grpc
    do
      go install "${tool}@latest"
    done
218+
219+
# Compile the exporter from cmd/exporter into <DME_DIR>/bin with cgo disabled.
- name: Build Device Metrics Exporter
  timeout-minutes: 15
  run: |
    export PATH="/usr/local/go/bin:${HOME}/go/bin:${PATH}"
    cd "${{ env.DME_DIR }}"

    # Use an absolute output path because `go build -C` changes directory
    # before building.
    exporter_bin="$(pwd)/bin/amd-metrics-exporter"
    mkdir -p bin
    CGO_ENABLED=0 go build -C cmd/exporter -o "${exporter_bin}"

    ls -la bin/amd-metrics-exporter
    echo "=== DME build complete ==="
231+
232+
# ────────────────────────────────────────────────────────
233+
# Phase 5 — Integration Test
234+
# ────────────────────────────────────────────────────────
235+
# Launch gpuagent in the background and confirm it is still alive after 5 s.
- name: Start GPU Agent
  run: |
    sim_root="${{ env.GPU_AGENT_WORKDIR }}/sw/nic/build/x86_64/sim"
    export LD_LIBRARY_PATH="${sim_root}/lib:/opt/rocm/lib:${LD_LIBRARY_PATH:-}"

    echo "=== Starting GPU Agent ==="
    nohup "${sim_root}/bin/gpuagent" > /tmp/gpuagent.log 2>&1 &
    agent_pid=$!
    # Record the PID so later steps can reference it.
    echo "GPUAGENT_PID=${agent_pid}" >> "$GITHUB_ENV"

    sleep 5

    # `kill -0` only tests that the process exists and sends no signal.
    if ! kill -0 "${agent_pid}" 2>/dev/null; then
      echo "::error::GPU Agent failed to start"
      cat /tmp/gpuagent.log
      exit 1
    fi
    echo "GPU Agent running (PID ${agent_pid})"
254+
255+
# Launch the exporter in the background and confirm it is still alive after 5 s.
- name: Start Device Metrics Exporter
  run: |
    export LD_LIBRARY_PATH="/opt/rocm/lib:${LD_LIBRARY_PATH:-}"

    echo "=== Starting DME ==="
    nohup "${{ env.DME_DIR }}/bin/amd-metrics-exporter" > /tmp/dme.log 2>&1 &
    exporter_pid=$!
    # Record the PID so later steps can reference it.
    echo "DME_PID=${exporter_pid}" >> "$GITHUB_ENV"

    sleep 5

    # `kill -0` only tests that the process exists and sends no signal.
    if ! kill -0 "${exporter_pid}" 2>/dev/null; then
      echo "::error::DME failed to start"
      cat /tmp/dme.log
      exit 1
    fi
    echo "DME running (PID ${exporter_pid})"
274+
275+
# Poll the exporter's /metrics endpoint until it answers HTTP 200, then show
# the start of the response. Missing HELP/TYPE lines only produce a warning.
# On exhaustion, print both service logs and fail the step.
- name: Verify Prometheus Metrics Endpoint
  run: |
    MAX_RETRIES=5
    RETRY_DELAY=3

    for i in $(seq 1 ${MAX_RETRIES}); do
      echo "Attempt ${i}/${MAX_RETRIES}: querying metrics …"
      # When curl fails it has already printed "000" through -w, so the
      # fallback must replace the value, not append a second "000" to it.
      # Any curl failure counts as "no response".
      HTTP_CODE=$(curl -s -o /tmp/metrics_output.txt \
        -w "%{http_code}" \
        "http://localhost:${{ env.DME_METRICS_PORT }}/metrics") \
        || HTTP_CODE="000"

      if [ "${HTTP_CODE}" = "200" ]; then
        echo "=== Metrics endpoint returned HTTP 200 ==="
        head -50 /tmp/metrics_output.txt
        echo "---"

        # Format check is advisory only; it never fails the step.
        if grep -q "^# HELP" /tmp/metrics_output.txt &&
           grep -q "^# TYPE" /tmp/metrics_output.txt; then
          echo "=== Prometheus format verified ==="
        else
          echo "::warning::Response may not be standard Prometheus format"
        fi
        exit 0
      fi

      echo "HTTP ${HTTP_CODE} — retrying in ${RETRY_DELAY}s …"
      sleep "${RETRY_DELAY}"
    done

    echo "::error::Metrics endpoint unreachable after ${MAX_RETRIES} attempts"
    echo "--- gpu-agent log ---"
    tail -100 /tmp/gpuagent.log 2>/dev/null || true
    echo "--- dme log ---"
    tail -100 /tmp/dme.log 2>/dev/null || true
    exit 1
311+
312+
# ────────────────────────────────────────────────────────
313+
# Cleanup & artifacts
314+
# ────────────────────────────────────────────────────────
315+
# Gather service logs, the last metrics response and basic system info into
# one directory. Runs even when earlier steps failed.
- name: Collect Logs
  if: always()
  run: |
    log_dir=/tmp/integration-logs
    mkdir -p "${log_dir}"

    # Best effort: any of these files may be missing if the run failed early.
    for src in /tmp/gpuagent.log /tmp/dme.log /tmp/metrics_output.txt; do
      cp "${src}" "${log_dir}/" 2>/dev/null || true
    done

    {
      echo "=== System ==="
      uname -a
      echo "=== GPU devices ==="
      ls -la /dev/kfd /dev/dri/ 2>&1 || true
      echo "=== AMDSMI libs ==="
      ldconfig -p | grep amd_smi || true
    } > "${log_dir}/system-info.txt" 2>&1
331+
332+
# Publish the collected log directory as a build artifact kept for 7 days.
# Runs even when earlier steps failed.
- name: Upload Integration Test Logs
  if: ${{ always() }}
  uses: actions/upload-artifact@v4
  with:
    path: /tmp/integration-logs/
    name: dme-integration-logs-${{ matrix.os }}
    retention-days: 7
339+
340+
# Stop the background services: send the default TERM signal first, wait 2 s,
# then send KILL to anything still running. Runs even after failures.
- name: Cleanup
  if: always()
  run: |
    for sig in TERM KILL; do
      for pid in "${{ env.DME_PID }}" "${{ env.GPUAGENT_PID }}"; do
        # A PID is empty when its service was never started.
        if [ -n "${pid}" ]; then
          kill -s "${sig}" "${pid}" 2>/dev/null || true
        fi
      done
      # Grace period between the TERM pass and the KILL pass only.
      if [ "${sig}" = "TERM" ]; then
        sleep 2
      fi
    done

0 commit comments

Comments
 (0)