Skip to content

Commit 8d1d3b5

Browse files
authored
feat(nvidia/bad-envs): rename Data to checkResult, add more unit tests (#671)
Signed-off-by: Gyuho Lee <[email protected]> Signed-off-by: Gyuho Lee <[email protected]>
1 parent d6944a5 commit 8d1d3b5

File tree

2 files changed

+212
-54
lines changed

2 files changed

+212
-54
lines changed

components/accelerator/nvidia/bad-envs/component.go

Lines changed: 39 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -32,8 +32,8 @@ type component struct {
3232
// returns true if the specified environment variable is set
3333
checkEnvFunc func(key string) bool
3434

35-
lastMu sync.RWMutex
36-
lastData *Data
35+
lastMu sync.RWMutex
36+
lastCheckResult *checkResult
3737
}
3838

3939
func New(gpudInstance *components.GPUdInstance) (components.Component, error) {
@@ -71,7 +71,7 @@ func (c *component) Start() error {
7171

7272
func (c *component) LastHealthStates() apiv1.HealthStates {
7373
c.lastMu.RLock()
74-
lastData := c.lastData
74+
lastData := c.lastCheckResult
7575
c.lastMu.RUnlock()
7676
return lastData.getLastHealthStates()
7777
}
@@ -107,24 +107,24 @@ var BAD_CUDA_ENV_KEYS = map[string]string{
107107
func (c *component) Check() components.CheckResult {
108108
log.Logger.Infow("checking nvidia gpu bad env variables")
109109

110-
d := &Data{
110+
cr := &checkResult{
111111
ts: time.Now().UTC(),
112112
}
113113
defer func() {
114114
c.lastMu.Lock()
115-
c.lastData = d
115+
c.lastCheckResult = cr
116116
c.lastMu.Unlock()
117117
}()
118118

119119
if c.nvmlInstance == nil {
120-
d.health = apiv1.HealthStateTypeHealthy
121-
d.reason = "NVIDIA NVML instance is nil"
122-
return d
120+
cr.health = apiv1.HealthStateTypeHealthy
121+
cr.reason = "NVIDIA NVML instance is nil"
122+
return cr
123123
}
124124
if !c.nvmlInstance.NVMLExists() {
125-
d.health = apiv1.HealthStateTypeHealthy
126-
d.reason = "NVIDIA NVML is not loaded"
127-
return d
125+
cr.health = apiv1.HealthStateTypeHealthy
126+
cr.reason = "NVIDIA NVML is not loaded"
127+
return cr
128128
}
129129

130130
foundBadEnvsForCUDA := make(map[string]string)
@@ -134,26 +134,26 @@ func (c *component) Check() components.CheckResult {
134134
}
135135
}
136136
if len(foundBadEnvsForCUDA) > 0 {
137-
d.FoundBadEnvsForCUDA = foundBadEnvsForCUDA
137+
cr.FoundBadEnvsForCUDA = foundBadEnvsForCUDA
138138
}
139139

140140
if len(foundBadEnvsForCUDA) == 0 {
141-
d.reason = "no bad envs found"
141+
cr.reason = "no bad envs found"
142142
} else {
143-
kvs := make([]string, 0, len(d.FoundBadEnvsForCUDA))
144-
for k, v := range d.FoundBadEnvsForCUDA {
143+
kvs := make([]string, 0, len(cr.FoundBadEnvsForCUDA))
144+
for k, v := range cr.FoundBadEnvsForCUDA {
145145
kvs = append(kvs, fmt.Sprintf("%s: %s", k, v))
146146
}
147-
d.reason = strings.Join(kvs, "; ")
147+
cr.reason = strings.Join(kvs, "; ")
148148
}
149149

150-
d.health = apiv1.HealthStateTypeHealthy
151-
return d
150+
cr.health = apiv1.HealthStateTypeHealthy
151+
return cr
152152
}
153153

154-
var _ components.CheckResult = &Data{}
154+
var _ components.CheckResult = &checkResult{}
155155

156-
type Data struct {
156+
type checkResult struct {
157157
// FoundBadEnvsForCUDA is a map of environment variables that are known to hurt CUDA
158158
// and that are set globally for the host.
159159
// This implements "DCGM_FR_BAD_CUDA_ENV" logic in DCGM.
@@ -170,49 +170,49 @@ type Data struct {
170170
reason string
171171
}
172172

173-
func (d *Data) String() string {
174-
if d == nil {
173+
func (cr *checkResult) String() string {
174+
if cr == nil {
175175
return ""
176176
}
177-
if len(d.FoundBadEnvsForCUDA) == 0 {
177+
if len(cr.FoundBadEnvsForCUDA) == 0 {
178178
return "no bad envs found"
179179
}
180180

181181
buf := bytes.NewBuffer(nil)
182182
table := tablewriter.NewWriter(buf)
183183
table.SetAlignment(tablewriter.ALIGN_CENTER)
184184
table.SetHeader([]string{"Found Env Key", "Description"})
185-
for k, v := range d.FoundBadEnvsForCUDA {
185+
for k, v := range cr.FoundBadEnvsForCUDA {
186186
table.Append([]string{k, v})
187187
}
188188
table.Render()
189189

190190
return buf.String()
191191
}
192192

193-
func (d *Data) Summary() string {
194-
if d == nil {
193+
func (cr *checkResult) Summary() string {
194+
if cr == nil {
195195
return ""
196196
}
197-
return d.reason
197+
return cr.reason
198198
}
199199

200-
func (d *Data) HealthState() apiv1.HealthStateType {
201-
if d == nil {
200+
func (cr *checkResult) HealthState() apiv1.HealthStateType {
201+
if cr == nil {
202202
return ""
203203
}
204-
return d.health
204+
return cr.health
205205
}
206206

207-
func (d *Data) getError() string {
208-
if d == nil || d.err == nil {
207+
func (cr *checkResult) getError() string {
208+
if cr == nil || cr.err == nil {
209209
return ""
210210
}
211-
return d.err.Error()
211+
return cr.err.Error()
212212
}
213213

214-
func (d *Data) getLastHealthStates() apiv1.HealthStates {
215-
if d == nil {
214+
func (cr *checkResult) getLastHealthStates() apiv1.HealthStates {
215+
if cr == nil {
216216
return apiv1.HealthStates{
217217
{
218218
Name: Name,
@@ -224,12 +224,12 @@ func (d *Data) getLastHealthStates() apiv1.HealthStates {
224224

225225
state := apiv1.HealthState{
226226
Name: Name,
227-
Reason: d.reason,
228-
Error: d.getError(),
229-
Health: d.health,
227+
Reason: cr.reason,
228+
Error: cr.getError(),
229+
Health: cr.health,
230230
}
231231

232-
b, _ := json.Marshal(d)
232+
b, _ := json.Marshal(cr)
233233
state.DeprecatedExtraInfo = map[string]string{
234234
"data": string(b),
235235
"encoding": "json",

0 commit comments

Comments
 (0)