@@ -32,8 +32,8 @@ type component struct {
32
32
// returns true if the specified environment variable is set
33
33
checkEnvFunc func (key string ) bool
34
34
35
- lastMu sync.RWMutex
36
- lastData * Data
35
+ lastMu sync.RWMutex
36
+ lastCheckResult * checkResult
37
37
}
38
38
39
39
func New (gpudInstance * components.GPUdInstance ) (components.Component , error ) {
@@ -71,7 +71,7 @@ func (c *component) Start() error {
71
71
72
72
func (c * component ) LastHealthStates () apiv1.HealthStates {
73
73
c .lastMu .RLock ()
74
- lastData := c .lastData
74
+ lastData := c .lastCheckResult
75
75
c .lastMu .RUnlock ()
76
76
return lastData .getLastHealthStates ()
77
77
}
@@ -107,24 +107,24 @@ var BAD_CUDA_ENV_KEYS = map[string]string{
107
107
func (c * component ) Check () components.CheckResult {
108
108
log .Logger .Infow ("checking nvidia gpu bad env variables" )
109
109
110
- d := & Data {
110
+ cr := & checkResult {
111
111
ts : time .Now ().UTC (),
112
112
}
113
113
defer func () {
114
114
c .lastMu .Lock ()
115
- c .lastData = d
115
+ c .lastCheckResult = cr
116
116
c .lastMu .Unlock ()
117
117
}()
118
118
119
119
if c .nvmlInstance == nil {
120
- d .health = apiv1 .HealthStateTypeHealthy
121
- d .reason = "NVIDIA NVML instance is nil"
122
- return d
120
+ cr .health = apiv1 .HealthStateTypeHealthy
121
+ cr .reason = "NVIDIA NVML instance is nil"
122
+ return cr
123
123
}
124
124
if ! c .nvmlInstance .NVMLExists () {
125
- d .health = apiv1 .HealthStateTypeHealthy
126
- d .reason = "NVIDIA NVML is not loaded"
127
- return d
125
+ cr .health = apiv1 .HealthStateTypeHealthy
126
+ cr .reason = "NVIDIA NVML is not loaded"
127
+ return cr
128
128
}
129
129
130
130
foundBadEnvsForCUDA := make (map [string ]string )
@@ -134,26 +134,26 @@ func (c *component) Check() components.CheckResult {
134
134
}
135
135
}
136
136
if len (foundBadEnvsForCUDA ) > 0 {
137
- d .FoundBadEnvsForCUDA = foundBadEnvsForCUDA
137
+ cr .FoundBadEnvsForCUDA = foundBadEnvsForCUDA
138
138
}
139
139
140
140
if len (foundBadEnvsForCUDA ) == 0 {
141
- d .reason = "no bad envs found"
141
+ cr .reason = "no bad envs found"
142
142
} else {
143
- kvs := make ([]string , 0 , len (d .FoundBadEnvsForCUDA ))
144
- for k , v := range d .FoundBadEnvsForCUDA {
143
+ kvs := make ([]string , 0 , len (cr .FoundBadEnvsForCUDA ))
144
+ for k , v := range cr .FoundBadEnvsForCUDA {
145
145
kvs = append (kvs , fmt .Sprintf ("%s: %s" , k , v ))
146
146
}
147
- d .reason = strings .Join (kvs , "; " )
147
+ cr .reason = strings .Join (kvs , "; " )
148
148
}
149
149
150
- d .health = apiv1 .HealthStateTypeHealthy
151
- return d
150
+ cr .health = apiv1 .HealthStateTypeHealthy
151
+ return cr
152
152
}
153
153
154
- var _ components.CheckResult = & Data {}
154
+ var _ components.CheckResult = & checkResult {}
155
155
156
- type Data struct {
156
+ type checkResult struct {
157
157
// FoundBadEnvsForCUDA is a map of environment variables that are known to hurt CUDA.
158
158
// that is set globally for the host.
159
159
// This implements "DCGM_FR_BAD_CUDA_ENV" logic in DCGM.
@@ -170,49 +170,49 @@ type Data struct {
170
170
reason string
171
171
}
172
172
173
- func (d * Data ) String () string {
174
- if d == nil {
173
+ func (cr * checkResult ) String () string {
174
+ if cr == nil {
175
175
return ""
176
176
}
177
- if len (d .FoundBadEnvsForCUDA ) == 0 {
177
+ if len (cr .FoundBadEnvsForCUDA ) == 0 {
178
178
return "no bad envs found"
179
179
}
180
180
181
181
buf := bytes .NewBuffer (nil )
182
182
table := tablewriter .NewWriter (buf )
183
183
table .SetAlignment (tablewriter .ALIGN_CENTER )
184
184
table .SetHeader ([]string {"Found Env Key" , "Description" })
185
- for k , v := range d .FoundBadEnvsForCUDA {
185
+ for k , v := range cr .FoundBadEnvsForCUDA {
186
186
table .Append ([]string {k , v })
187
187
}
188
188
table .Render ()
189
189
190
190
return buf .String ()
191
191
}
192
192
193
- func (d * Data ) Summary () string {
194
- if d == nil {
193
+ func (cr * checkResult ) Summary () string {
194
+ if cr == nil {
195
195
return ""
196
196
}
197
- return d .reason
197
+ return cr .reason
198
198
}
199
199
200
- func (d * Data ) HealthState () apiv1.HealthStateType {
201
- if d == nil {
200
+ func (cr * checkResult ) HealthState () apiv1.HealthStateType {
201
+ if cr == nil {
202
202
return ""
203
203
}
204
- return d .health
204
+ return cr .health
205
205
}
206
206
207
- func (d * Data ) getError () string {
208
- if d == nil || d .err == nil {
207
+ func (cr * checkResult ) getError () string {
208
+ if cr == nil || cr .err == nil {
209
209
return ""
210
210
}
211
- return d .err .Error ()
211
+ return cr .err .Error ()
212
212
}
213
213
214
- func (d * Data ) getLastHealthStates () apiv1.HealthStates {
215
- if d == nil {
214
+ func (cr * checkResult ) getLastHealthStates () apiv1.HealthStates {
215
+ if cr == nil {
216
216
return apiv1.HealthStates {
217
217
{
218
218
Name : Name ,
@@ -224,12 +224,12 @@ func (d *Data) getLastHealthStates() apiv1.HealthStates {
224
224
225
225
state := apiv1.HealthState {
226
226
Name : Name ,
227
- Reason : d .reason ,
228
- Error : d .getError (),
229
- Health : d .health ,
227
+ Reason : cr .reason ,
228
+ Error : cr .getError (),
229
+ Health : cr .health ,
230
230
}
231
231
232
- b , _ := json .Marshal (d )
232
+ b , _ := json .Marshal (cr )
233
233
state .DeprecatedExtraInfo = map [string ]string {
234
234
"data" : string (b ),
235
235
"encoding" : "json" ,
0 commit comments