|
3 | 3 | set -o nounset |
4 | 4 | set -x |
5 | 5 |
|
# Extended diagnostics are opt-in: set ENABLE_EXTENDED_DIAGNOSTICS=true to use a
# debug container for complete nvidia-bug-report collection.
# Each setting keeps a non-empty value inherited from the environment and
# otherwise falls back to the default given here.
: "${ENABLE_EXTENDED_DIAGNOSTICS:=false}"
: "${DEBUG_CONTAINER_IMAGE:=ghcr.io/nvidia/gpu-operator-debug:latest}"
: "${DEBUG_TIMEOUT_SECONDS:=60}"
| 10 | + |
# Line prefixes of informational messages in kubectl debug output that should
# not end up in the bug report (basic regex; alternatives are joined with \|).
KUBECTL_NOISE_PATTERN="^Targeting\|^Defaulting\|^Unable\|^warning:\|^All commands\|^If you don"

# Copy stdin to stdout, dropping every line that matches KUBECTL_NOISE_PATTERN.
# Always returns 0, including when grep selects no lines at all.
filter_kubectl_noise() {
    if ! grep -v "${KUBECTL_NOISE_PATTERN}"; then
        return 0
    fi
}
| 18 | + |
# Append a section header to the bug report.
# Args: $1=file to append to, $2=section title
# Output appended to the file: a blank line, a separator rule, a blank line,
# the title, and a trailing blank line.
append_section_header() {
    local file="$1"
    local title="$2"

    # printf instead of echo: a title such as "-n" or "-e" would be consumed by
    # echo as an option instead of being written to the report.
    printf '\n%s\n\n%s\n\n' \
        "____________________________________________" \
        "${title}" >> "${file}"
}
| 32 | + |
# Collect diagnostic output using a debug container and append it to the bug report.
# Args: $1=pod_name, $2=node_name, $3=command, $4=command_args, $5=output_file
# Globals read: K, OPERATOR_NAMESPACE, DEBUG_CONTAINER_IMAGE, DEBUG_TIMEOUT_SECONDS
# Best effort: on failure or timeout a warning is printed to stderr and a marker
# line is appended to the output file; any output produced before the failure
# is still recorded.
collect_debug_diagnostic() {
    local pod_name="$1"
    local node_name="$2"
    local cmd="$3"
    local cmd_args="$4"
    local output_file="$5"
    local debug_output
    local rc=0

    append_section_header "${output_file}" "${cmd} ${cmd_args} output (via must-gather extended diagnostics)"

    # Capture the output first so the exit status of the debug command itself is
    # observed. Piping straight into filter_kubectl_noise would report the
    # filter's status (always 0) and failures/timeouts would go unnoticed.
    # Use -i to attach stdin (required to capture output).
    # ${cmd} and ${cmd_args} are intentionally unquoted so that arguments are
    # word-split and an empty argument string expands to nothing.
    # shellcheck disable=SC2086
    debug_output=$(timeout "${DEBUG_TIMEOUT_SECONDS}" $K debug -n "${OPERATOR_NAMESPACE}" "${pod_name}" \
        --image="${DEBUG_CONTAINER_IMAGE}" \
        --target=nvidia-driver-ctr \
        --profile=sysadmin \
        -i \
        -- ${cmd} ${cmd_args} 2>/dev/null) || rc=$?

    if [[ -n "${debug_output}" ]]; then
        printf '%s\n' "${debug_output}" | filter_kubectl_noise >> "${output_file}"
    fi

    if (( rc != 0 )); then
        echo "Warning: Failed to collect ${cmd} from ${node_name} (timed out or failed)" >&2
        echo "(collection failed or timed out after ${DEBUG_TIMEOUT_SECONDS}s)" >> "${output_file}"
    fi
}
| 55 | + |
| 56 | + |
6 | 57 | K=kubectl |
7 | 58 | if ! $K version > /dev/null; then |
8 | 59 | K=oc |
@@ -262,18 +313,69 @@ echo "# nvidia-bug-report.sh" |
262 | 313 | echo "#" |
263 | 314 | echo "" |
264 | 315 |
|
# When extended diagnostics are enabled, tell the user up front what that
# implies before any collection starts.
if [[ "${ENABLE_EXTENDED_DIAGNOSTICS}" == "true" ]]; then
    # Unquoted here-document delimiter: ${DEBUG_CONTAINER_IMAGE} is expanded.
    cat <<EOF
===============================================================================
WARNING: Extended diagnostics enabled.

This will pull and run an external debug container (${DEBUG_CONTAINER_IMAGE})
with privileged access to collect system information (dmidecode, lspci).

By enabling this option, you acknowledge:
 - An external container image will be pulled and executed in your cluster
 - The debug container requires privileged access (sysadmin profile)
 - System hardware information will be collected and included in the bug report

To disable, unset ENABLE_EXTENDED_DIAGNOSTICS or set it to false.
===============================================================================

EOF
fi
| 332 | + |
# Run nvidia-bug-report.sh in every driver-toolkit, driver daemonset and vGPU
# manager pod and save one compressed report per node under ARTIFACT_DIR.
# With ENABLE_EXTENDED_DIAGNOSTICS=true, dmidecode and lspci output collected
# through a debug container is appended to each saved report.
#
# Reports are staged in a private mktemp directory rather than at a fixed,
# predictable path in /tmp, so a stale file owned by another user or a
# concurrent run cannot interfere with the copy.
if ! bug_report_staging_dir=$(mktemp -d); then
    echo "Failed to create a temporary directory for nvidia-bug-report collection"
else
    staged_bug_report="${bug_report_staging_dir}/nvidia-bug-report.log.gz"

    for pod in $($K get pods -lopenshift.driver-toolkit -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-driver-daemonset -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n "${OPERATOR_NAMESPACE}");
    do
        pod_nodename=$($K get "${pod}" -ojsonpath='{.spec.nodeName}' -n "${OPERATOR_NAMESPACE}")
        # "-oname" yields "pod/<name>"; kubectl cp and kubectl debug need the bare name.
        pod_name=$(basename "${pod}")
        echo "Saving nvidia-bug-report from ${pod_nodename} ..."

        # Collect standard nvidia-bug-report from driver container
        if ! $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2; then
            echo "Failed to collect nvidia-bug-report from ${pod_nodename}"
            continue
        fi

        # Drop any report staged for a previous pod before copying the next one
        rm -f -- "${staged_bug_report}"

        if ! $K cp "${OPERATOR_NAMESPACE}"/"${pod_name}":/tmp/nvidia-bug-report.log.gz "${staged_bug_report}" 2>/dev/null; then
            echo "Failed to save nvidia-bug-report from ${pod_nodename}"
            continue
        fi

        mv -- "${staged_bug_report}" "${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log.gz"

        if [[ "${ENABLE_EXTENDED_DIAGNOSTICS}" == "true" ]]; then
            echo "Collecting extended diagnostics (dmidecode/lspci) from ${pod_nodename}..."

            bug_report_file="${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log"

            # Decompress the bug report to append data.
            # -f: overwrite a decompressed leftover from an earlier run instead of failing.
            if ! gunzip -f "${bug_report_file}.gz" 2>&1; then
                echo "Warning: Failed to decompress bug report for ${pod_nodename}, skipping extended diagnostics"
                continue
            fi

            append_section_header "${bug_report_file}" "*** EXTENDED DIAGNOSTICS (from debug container) ***"

            collect_debug_diagnostic "${pod_name}" "${pod_nodename}" "dmidecode" "" "${bug_report_file}"
            collect_debug_diagnostic "${pod_name}" "${pod_nodename}" "lspci" "-vvv" "${bug_report_file}"

            # Recompress the bug report
            if ! gzip -f "${bug_report_file}" 2>&1; then
                echo "Warning: Failed to recompress bug report for ${pod_nodename}"
            fi
        else
            echo "NOTE: For extended diagnostics (dmidecode/lspci), set ENABLE_EXTENDED_DIAGNOSTICS=true"
        fi
    done

    # ":?" guards against ever expanding to an empty path here.
    rm -rf -- "${bug_report_staging_dir:?}"
fi
278 | 380 |
|
279 | 381 | echo "" |
|
0 commit comments