We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
2 parents 454f486 + d090aa1, commit d4e7e55 (Copy full SHA for d4e7e55)
helm/soperator-activechecks/scripts/extensive-check.sh
@@ -190,7 +190,7 @@ health_checker_runs=(
190
all_reduce_with_ib
191
all_reduce_without_ib
192
cuda_samples
193
- dcgmi_diag_r2
+# dcgmi_diag_r2
194
gpu_fryer
195
# ib_gpu_perf
196
mem_perf
helm/soperator-activechecks/values.yaml
@@ -55,12 +55,12 @@ checks:
55
runAfterCreation: true
56
drainReasonPrefix: "[node_problem]"
57
dcgmiDiagR2:
58
- suspend: false
59
- runAfterCreation: true
+ suspend: true
+ runAfterCreation: false
60
61
dcgmiDiagR3:
62
suspend: true
63
64
65
enrootCleanup:
66
suspend: false
0 commit comments