Commit 1fc7c1d

Add extended diagnostics support to must-gather.sh using debug container for dmidecode/lspci collection
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent: 05090a5

File tree: 4 files changed, +130 −4 lines changed

.github/ISSUE_TEMPLATE/bug_report.md

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ Collecting full debug bundle (optional):
 curl -o must-gather.sh -L https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh
 chmod +x must-gather.sh
 ./must-gather.sh
+
+# For extended diagnostics (includes system/PCI info):
+ENABLE_EXTENDED_DIAGNOSTICS=true ./must-gather.sh
 ```
 
 **NOTE**: please refer to the [must-gather](https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh) script for debug data collected.
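In addition to ENABLE_EXTENDED_DIAGNOSTICS, the updated hack/must-gather.sh below also reads two optional overrides, DEBUG_CONTAINER_IMAGE and DEBUG_TIMEOUT_SECONDS. A minimal invocation sketch; the timeout value here is only illustrative, not a recommended setting:

    # Sketch: run bundle collection with extended diagnostics and optional overrides
    ENABLE_EXTENDED_DIAGNOSTICS=true \
    DEBUG_CONTAINER_IMAGE=ghcr.io/nvidia/gpu-operator-debug:latest \
    DEBUG_TIMEOUT_SECONDS=120 \
    ./must-gather.sh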

Makefile

Lines changed: 12 additions & 0 deletions
@@ -295,6 +295,18 @@ build-image:
 # This includes https://github.com/openshift-psap/ci-artifacts
 docker-image: OUT_IMAGE ?= $(IMAGE_NAME):$(IMAGE_TAG)
 
+##### Debug Container #####
+DEBUG_CONTAINER_IMAGE ?= ghcr.io/nvidia/gpu-operator-debug
+DEBUG_CONTAINER_TAG ?= latest
+
+.PHONY: build-debug-container push-debug-container
+
+build-debug-container:
+	$(DOCKER) build -t $(DEBUG_CONTAINER_IMAGE):$(DEBUG_CONTAINER_TAG) hack/debug-container/
+
+push-debug-container: build-debug-container
+	$(DOCKER) push $(DEBUG_CONTAINER_IMAGE):$(DEBUG_CONTAINER_TAG)
+
 install-tools:
 	@echo Installing tools from tools.go
 	export GOBIN=$(PROJECT_DIR)/bin && \
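With the defaults above, the new targets can be driven as in this sketch; push-debug-container depends on build-debug-container, and the image/tag variables are only overridden here for illustration:

    # Sketch: build the debug image locally, then push it to the configured registry
    make build-debug-container
    make push-debug-container DEBUG_CONTAINER_IMAGE=ghcr.io/nvidia/gpu-operator-debug DEBUG_CONTAINER_TAG=latest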

hack/debug-container/Dockerfile

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+FROM ubuntu:22.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    dmidecode \
+    pciutils \
+    && rm -rf /var/lib/apt/lists/*
+
+ENTRYPOINT ["/bin/sh"]
+
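One way to sanity-check the image before wiring it into must-gather is to run it directly. A hedged local smoke test (the dev tag is hypothetical; dmidecode typically needs elevated privileges to read SMBIOS tables, hence --privileged, and since the entrypoint is /bin/sh the trailing arguments become a shell command):

    # Sketch: confirm dmidecode and lspci are installed in the debug image
    docker build -t gpu-operator-debug:dev hack/debug-container/
    docker run --rm --privileged gpu-operator-debug:dev -c "lspci | head && dmidecode -t system | head"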

hack/must-gather.sh

Lines changed: 106 additions & 4 deletions
@@ -3,6 +3,57 @@
 set -o nounset
 set -x
 
+# Set ENABLE_EXTENDED_DIAGNOSTICS=true to use a debug container for complete nvidia-bug-report collection
+ENABLE_EXTENDED_DIAGNOSTICS=${ENABLE_EXTENDED_DIAGNOSTICS:-false}
+DEBUG_CONTAINER_IMAGE=${DEBUG_CONTAINER_IMAGE:-ghcr.io/nvidia/gpu-operator-debug:latest}
+DEBUG_TIMEOUT_SECONDS=${DEBUG_TIMEOUT_SECONDS:-60}
+
+# Noise patterns from kubectl debug output that should be filtered
+KUBECTL_NOISE_PATTERN="^Targeting\|^Defaulting\|^Unable\|^warning:\|^All commands\|^If you don"
+
+# Filter out kubectl informational messages from output
+filter_kubectl_noise() {
+    grep -v "${KUBECTL_NOISE_PATTERN}" || true
+}
+
+# Append a section header to the bug report
+append_section_header() {
+    local file="$1"
+    local title="$2"
+
+    {
+        echo ""
+        echo "____________________________________________"
+        echo ""
+        echo "${title}"
+        echo ""
+    } >> "${file}"
+}
+
+# Collect diagnostic output using debug container and append to bug report
+# Args: $1=pod_name, $2=node_name, $3=command, $4=command_args, $5=output_file
+collect_debug_diagnostic() {
+    local pod_name="$1"
+    local node_name="$2"
+    local cmd="$3"
+    local cmd_args="$4"
+    local output_file="$5"
+
+    append_section_header "${output_file}" "${cmd} ${cmd_args} output (via must-gather extended diagnostics)"
+
+    # Use -i to attach stdin (required to capture output)
+    if ! timeout "${DEBUG_TIMEOUT_SECONDS}" $K debug -n "${OPERATOR_NAMESPACE}" "${pod_name}" \
+        --image="${DEBUG_CONTAINER_IMAGE}" \
+        --target=nvidia-driver-ctr \
+        --profile=sysadmin \
+        -i \
+        -- ${cmd} ${cmd_args} 2>/dev/null | filter_kubectl_noise >> "${output_file}"; then
+        echo "Warning: Failed to collect ${cmd} from ${node_name} (timed out or failed)" >&2
+        echo "(collection failed or timed out after ${DEBUG_TIMEOUT_SECONDS}s)" >> "${output_file}"
+    fi
+}
+
+
 K=kubectl
 if ! $K version > /dev/null; then
     K=oc
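For reference, collect_debug_diagnostic above effectively expands to a kubectl debug invocation of roughly this shape per driver pod (a sketch; the namespace and pod name are placeholders, and $K resolves to kubectl or oc as detected by the script):

    # Sketch: one extended-diagnostics collection against a driver pod
    timeout 60 kubectl debug -n <operator-namespace> <driver-pod-name> \
        --image=ghcr.io/nvidia/gpu-operator-debug:latest \
        --target=nvidia-driver-ctr \
        --profile=sysadmin \
        -i \
        -- lspci -vvv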
@@ -262,18 +313,69 @@ echo "# nvidia-bug-report.sh"
 echo "#"
 echo ""
 
+if [[ "${ENABLE_EXTENDED_DIAGNOSTICS}" == "true" ]]; then
+    echo "==============================================================================="
+    echo "WARNING: Extended diagnostics enabled."
+    echo ""
+    echo "This will pull and run an external debug container (${DEBUG_CONTAINER_IMAGE})"
+    echo "with privileged access to collect system information (dmidecode, lspci)."
+    echo ""
+    echo "By enabling this option, you acknowledge:"
+    echo " - An external container image will be pulled and executed in your cluster"
+    echo " - The debug container requires privileged access (sysadmin profile)"
+    echo " - System hardware information will be collected and included in the bug report"
+    echo ""
+    echo "To disable, unset ENABLE_EXTENDED_DIAGNOSTICS or set it to false."
+    echo "==============================================================================="
+    echo ""
+fi
+
 for pod in $($K get pods -lopenshift.driver-toolkit -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-driver-daemonset -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n "${OPERATOR_NAMESPACE}");
 do
     pod_nodename=$($K get "${pod}" -ojsonpath={.spec.nodeName} -n "${OPERATOR_NAMESPACE}")
+    pod_name=$(basename "${pod}")
     echo "Saving nvidia-bug-report from ${pod_nodename} ..."
 
-    $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2 || \
-        (echo "Failed to collect nvidia-bug-report from ${pod_nodename}" && continue)
+    # Collect standard nvidia-bug-report from driver container
+    if ! $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2; then
+        echo "Failed to collect nvidia-bug-report from ${pod_nodename}"
+        continue
+    fi
+
+    # Clean up any existing temp file to avoid permission issues
+    rm -f /tmp/nvidia-bug-report.log.gz
+
+    if ! $K cp "${OPERATOR_NAMESPACE}"/"${pod_name}":/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz 2>/dev/null; then
+        echo "Failed to save nvidia-bug-report from ${pod_nodename}"
+        continue
+    fi
 
-    $K cp "${OPERATOR_NAMESPACE}"/$(basename "${pod}"):/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz || \
-        (echo "Failed to save nvidia-bug-report from ${pod_nodename}" && continue)
 
     mv /tmp/nvidia-bug-report.log.gz "${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log.gz"
+
+    if [[ "${ENABLE_EXTENDED_DIAGNOSTICS}" == "true" ]]; then
+        echo "Collecting extended diagnostics (dmidecode/lspci) from ${pod_nodename}..."
+
+        bug_report_file="${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log"
+
+        # Decompress the bug report to append data
+        if ! gunzip "${bug_report_file}.gz" 2>&1; then
+            echo "Warning: Failed to decompress bug report for ${pod_nodename}, skipping extended diagnostics"
+            continue
+        fi
+
+        append_section_header "${bug_report_file}" "*** EXTENDED DIAGNOSTICS (from debug container) ***"
+
+        collect_debug_diagnostic "${pod_name}" "${pod_nodename}" "dmidecode" "" "${bug_report_file}"
+        collect_debug_diagnostic "${pod_name}" "${pod_nodename}" "lspci" "-vvv" "${bug_report_file}"
+
+        # Recompress the bug report
+        if ! gzip "${bug_report_file}" 2>&1; then
+            echo "Warning: Failed to recompress bug report for ${pod_nodename}"
+        fi
+    else
+        echo "NOTE: For extended diagnostics (dmidecode/lspci), set ENABLE_EXTENDED_DIAGNOSTICS=true"
+    fi
 done
 
 echo ""
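After a run with extended diagnostics enabled, the collected sections are appended inside the per-node bug report archives written to ARTIFACT_DIR. A quick way to confirm they were captured (the node name is a placeholder):

    # Sketch: check that the extended diagnostics section made it into the artifact
    zcat "${ARTIFACT_DIR}/nvidia-bug-report_<node-name>.log.gz" | grep -A2 "EXTENDED DIAGNOSTICS"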
