2
2
package badenvs
3
3
4
4
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"os"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/olekukonko/tablewriter"

	apiv1 "github.com/leptonai/gpud/api/v1"
	"github.com/leptonai/gpud/components"
	"github.com/leptonai/gpud/pkg/log"
	nvidianvml "github.com/leptonai/gpud/pkg/nvidia-query/nvml"
)
17
21
18
22
const Name = "accelerator-nvidia-bad-envs"
@@ -23,23 +27,26 @@ type component struct {
23
27
ctx context.Context
24
28
cancel context.CancelFunc
25
29
30
+ nvmlInstance nvidianvml.InstanceV2
31
+
26
32
// returns true if the specified environment variable is set
27
33
checkEnvFunc func (key string ) bool
28
34
29
35
lastMu sync.RWMutex
30
36
lastData * Data
31
37
}
32
38
33
- func New (ctx context. Context ) components.Component {
34
- cctx , ccancel := context .WithCancel (ctx )
35
- return & component {
36
- ctx : cctx ,
37
- cancel : ccancel ,
38
-
39
+ func New (gpudInstance * components. GPUdInstance ) ( components.Component , error ) {
40
+ cctx , ccancel := context .WithCancel (gpudInstance . RootCtx )
41
+ c := & component {
42
+ ctx : cctx ,
43
+ cancel : ccancel ,
44
+ nvmlInstance : gpudInstance . NVMLInstance ,
39
45
checkEnvFunc : func (key string ) bool {
40
46
return os .Getenv (key ) == "1"
41
47
},
42
48
}
49
+ return c , nil
43
50
}
44
51
45
52
// Name returns the component name, i.e. the package-level Name constant.
func (c *component) Name() string {
	return Name
}
@@ -48,8 +55,10 @@ func (c *component) Start() error {
48
55
go func () {
49
56
ticker := time .NewTicker (time .Minute )
50
57
defer ticker .Stop ()
58
+
51
59
for {
52
- c .CheckOnce ()
60
+ _ = c .Check ()
61
+
53
62
select {
54
63
case <- c .ctx .Done ():
55
64
return
@@ -60,11 +69,11 @@ func (c *component) Start() error {
60
69
return nil
61
70
}
62
71
63
- func (c * component ) HealthStates ( ctx context. Context ) ( apiv1.HealthStates , error ) {
72
+ func (c * component ) LastHealthStates () apiv1.HealthStates {
64
73
c .lastMu .RLock ()
65
74
lastData := c .lastData
66
75
c .lastMu .RUnlock ()
67
- return lastData .getHealthStates ()
76
+ return lastData .getLastHealthStates ()
68
77
}
69
78
70
79
func (c * component ) Events (ctx context.Context , since time.Time ) (apiv1.Events , error ) {
@@ -95,19 +104,29 @@ var BAD_CUDA_ENV_KEYS = map[string]string{
95
104
"OPENCL_PROFILE" : "Enables OpenCL profiling." ,
96
105
}
97
106
98
- // CheckOnce checks the current pods
99
- // run this periodically
100
- func (c * component ) CheckOnce () {
101
- log .Logger .Infow ("checking memory" )
102
- d := Data {
107
+ func (c * component ) Check () components.CheckResult {
108
+ log .Logger .Infow ("checking nvidia gpu bad env variables" )
109
+
110
+ d := & Data {
103
111
ts : time .Now ().UTC (),
104
112
}
105
113
defer func () {
106
114
c .lastMu .Lock ()
107
- c .lastData = & d
115
+ c .lastData = d
108
116
c .lastMu .Unlock ()
109
117
}()
110
118
119
+ if c .nvmlInstance == nil {
120
+ d .health = apiv1 .HealthStateTypeHealthy
121
+ d .reason = "NVIDIA NVML instance is nil"
122
+ return d
123
+ }
124
+ if ! c .nvmlInstance .NVMLExists () {
125
+ d .health = apiv1 .HealthStateTypeHealthy
126
+ d .reason = "NVIDIA NVML is not loaded"
127
+ return d
128
+ }
129
+
111
130
foundBadEnvsForCUDA := make (map [string ]string )
112
131
for k , desc := range BAD_CUDA_ENV_KEYS {
113
132
if c .checkEnvFunc (k ) {
@@ -118,8 +137,6 @@ func (c *component) CheckOnce() {
118
137
d .FoundBadEnvsForCUDA = foundBadEnvsForCUDA
119
138
}
120
139
121
- d .healthy = true
122
-
123
140
if len (foundBadEnvsForCUDA ) == 0 {
124
141
d .reason = "no bad envs found"
125
142
} else {
@@ -129,8 +146,13 @@ func (c *component) CheckOnce() {
129
146
}
130
147
d .reason = strings .Join (kvs , "; " )
131
148
}
149
+
150
+ d .health = apiv1 .HealthStateTypeHealthy
151
+ return d
132
152
}
133
153
154
+ var _ components.CheckResult = & Data {}
155
+
134
156
type Data struct {
135
157
// FoundBadEnvsForCUDA is a map of environment variables that are known to hurt CUDA.
136
158
// that is set globally for the host.
@@ -143,44 +165,74 @@ type Data struct {
143
165
err error
144
166
145
167
// tracks the healthy evaluation result of the last check
146
- healthy bool
168
+ health apiv1. HealthStateType
147
169
// tracks the reason of the last check
148
170
reason string
149
171
}
150
172
173
+ func (d * Data ) String () string {
174
+ if d == nil {
175
+ return ""
176
+ }
177
+ if len (d .FoundBadEnvsForCUDA ) == 0 {
178
+ return "no bad envs found"
179
+ }
180
+
181
+ buf := bytes .NewBuffer (nil )
182
+ table := tablewriter .NewWriter (buf )
183
+ table .SetAlignment (tablewriter .ALIGN_CENTER )
184
+ table .SetHeader ([]string {"Found Env Key" , "Description" })
185
+ for k , v := range d .FoundBadEnvsForCUDA {
186
+ table .Append ([]string {k , v })
187
+ }
188
+ table .Render ()
189
+
190
+ return buf .String ()
191
+ }
192
+
193
+ func (d * Data ) Summary () string {
194
+ if d == nil {
195
+ return ""
196
+ }
197
+ return d .reason
198
+ }
199
+
200
+ func (d * Data ) HealthState () apiv1.HealthStateType {
201
+ if d == nil {
202
+ return ""
203
+ }
204
+ return d .health
205
+ }
206
+
151
207
// getError returns the message of the stored error, or an empty string when
// the receiver is nil or no error was recorded.
func (d *Data) getError() string {
	if d != nil && d.err != nil {
		return d.err.Error()
	}
	return ""
}
157
213
158
- func (d * Data ) getHealthStates () ( apiv1.HealthStates , error ) {
214
+ func (d * Data ) getLastHealthStates () apiv1.HealthStates {
159
215
if d == nil {
160
- return [] apiv1.HealthState {
216
+ return apiv1.HealthStates {
161
217
{
162
218
Name : Name ,
163
- Health : apiv1 .StateTypeHealthy ,
219
+ Health : apiv1 .HealthStateTypeHealthy ,
164
220
Reason : "no data yet" ,
165
221
},
166
- }, nil
222
+ }
167
223
}
168
224
169
225
state := apiv1.HealthState {
170
226
Name : Name ,
171
227
Reason : d .reason ,
172
228
Error : d .getError (),
173
-
174
- Health : apiv1 .StateTypeHealthy ,
175
- }
176
- if ! d .healthy {
177
- state .Health = apiv1 .StateTypeUnhealthy
229
+ Health : d .health ,
178
230
}
179
231
180
232
b , _ := json .Marshal (d )
181
233
state .DeprecatedExtraInfo = map [string ]string {
182
234
"data" : string (b ),
183
235
"encoding" : "json" ,
184
236
}
185
- return [] apiv1.HealthState {state }, nil
237
+ return apiv1.HealthStates {state }
186
238
}
0 commit comments