Skip to content

Commit ea9d3bb

Browse files
authored
feat(*): populate health state Component field, rename Data to checkResult (#674)
Similar to #671.

Signed-off-by: Gyuho Lee <[email protected]>
1 parent ad00cc1 commit ea9d3bb

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

73 files changed

+3152
-3088
lines changed

components/accelerator/nvidia/bad-envs/component.go

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -71,9 +71,9 @@ func (c *component) Start() error {
7171

7272
func (c *component) LastHealthStates() apiv1.HealthStates {
7373
c.lastMu.RLock()
74-
lastData := c.lastCheckResult
74+
lastCheckResult := c.lastCheckResult
7575
c.lastMu.RUnlock()
76-
return lastData.getLastHealthStates()
76+
return lastCheckResult.getLastHealthStates()
7777
}
7878

7979
func (c *component) Events(ctx context.Context, since time.Time) (apiv1.Events, error) {
@@ -215,18 +215,20 @@ func (cr *checkResult) getLastHealthStates() apiv1.HealthStates {
215215
if cr == nil {
216216
return apiv1.HealthStates{
217217
{
218-
Name: Name,
219-
Health: apiv1.HealthStateTypeHealthy,
220-
Reason: "no data yet",
218+
Component: Name,
219+
Name: Name,
220+
Health: apiv1.HealthStateTypeHealthy,
221+
Reason: "no data yet",
221222
},
222223
}
223224
}
224225

225226
state := apiv1.HealthState{
226-
Name: Name,
227-
Reason: cr.reason,
228-
Error: cr.getError(),
229-
Health: cr.health,
227+
Component: Name,
228+
Name: Name,
229+
Reason: cr.reason,
230+
Error: cr.getError(),
231+
Health: cr.health,
230232
}
231233

232234
b, _ := json.Marshal(cr)

components/accelerator/nvidia/bad-envs/component_test.go

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -238,16 +238,16 @@ func TestClose(t *testing.T) {
238238

239239
func TestDataGetError(t *testing.T) {
240240
// Test with nil checkResult
241-
var d *checkResult
242-
assert.Empty(t, d.getError())
241+
var cr *checkResult
242+
assert.Empty(t, cr.getError())
243243

244244
// Test with nil error
245-
d = &checkResult{}
246-
assert.Empty(t, d.getError())
245+
cr = &checkResult{}
246+
assert.Empty(t, cr.getError())
247247

248248
// Test with actual error
249-
d = &checkResult{err: assert.AnError}
250-
assert.Equal(t, assert.AnError.Error(), d.getError())
249+
cr = &checkResult{err: assert.AnError}
250+
assert.Equal(t, assert.AnError.Error(), cr.getError())
251251
}
252252

253253
func TestPeriodicCheck(t *testing.T) {
@@ -281,7 +281,7 @@ func TestPeriodicCheck(t *testing.T) {
281281
func TestDataWithMultipleBadEnvs(t *testing.T) {
282282
// Create data with multiple bad environments and set a valid reason
283283
reason := "CUDA_PROFILE: Enables CUDA profiling.; COMPUTE_PROFILE: Enables compute profiling."
284-
d := &checkResult{
284+
cr := &checkResult{
285285
FoundBadEnvsForCUDA: map[string]string{
286286
"CUDA_PROFILE": "Enables CUDA profiling.",
287287
"COMPUTE_PROFILE": "Enables compute profiling.",
@@ -292,7 +292,7 @@ func TestDataWithMultipleBadEnvs(t *testing.T) {
292292
}
293293

294294
// Check the reason string contains both env vars
295-
states := d.getLastHealthStates()
295+
states := cr.getLastHealthStates()
296296
assert.Len(t, states, 1)
297297
assert.Contains(t, states[0].Reason, "CUDA_PROFILE")
298298
assert.Contains(t, states[0].Reason, "COMPUTE_PROFILE")

components/accelerator/nvidia/clock-speed/component.go

Lines changed: 47 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,8 @@ type component struct {
3131
nvmlInstance nvidianvml.InstanceV2
3232
getClockSpeedFunc func(uuid string, dev device.Device) (nvidianvml.ClockSpeed, error)
3333

34-
lastMu sync.RWMutex
35-
lastData *Data
34+
lastMu sync.RWMutex
35+
lastCheckResult *checkResult
3636
}
3737

3838
func New(gpudInstance *components.GPUdInstance) (components.Component, error) {
@@ -68,9 +68,9 @@ func (c *component) Start() error {
6868

6969
func (c *component) LastHealthStates() apiv1.HealthStates {
7070
c.lastMu.RLock()
71-
lastData := c.lastData
71+
lastCheckResult := c.lastCheckResult
7272
c.lastMu.RUnlock()
73-
return lastData.getLastHealthStates()
73+
return lastCheckResult.getLastHealthStates()
7474
}
7575

7676
func (c *component) Events(ctx context.Context, since time.Time) (apiv1.Events, error) {
@@ -88,24 +88,24 @@ func (c *component) Close() error {
8888
func (c *component) Check() components.CheckResult {
8989
log.Logger.Infow("checking nvidia gpu clock speed")
9090

91-
d := &Data{
91+
cr := &checkResult{
9292
ts: time.Now().UTC(),
9393
}
9494
defer func() {
9595
c.lastMu.Lock()
96-
c.lastData = d
96+
c.lastCheckResult = cr
9797
c.lastMu.Unlock()
9898
}()
9999

100100
if c.nvmlInstance == nil {
101-
d.health = apiv1.HealthStateTypeHealthy
102-
d.reason = "NVIDIA NVML instance is nil"
103-
return d
101+
cr.health = apiv1.HealthStateTypeHealthy
102+
cr.reason = "NVIDIA NVML instance is nil"
103+
return cr
104104
}
105105
if !c.nvmlInstance.NVMLExists() {
106-
d.health = apiv1.HealthStateTypeHealthy
107-
d.reason = "NVIDIA NVML is not loaded"
108-
return d
106+
cr.health = apiv1.HealthStateTypeHealthy
107+
cr.reason = "NVIDIA NVML is not loaded"
108+
return cr
109109
}
110110

111111
devs := c.nvmlInstance.Devices()
@@ -114,26 +114,26 @@ func (c *component) Check() components.CheckResult {
114114
if err != nil {
115115
log.Logger.Errorw("error getting clock speed for device", "uuid", uuid, "error", err)
116116

117-
d.err = err
118-
d.health = apiv1.HealthStateTypeUnhealthy
119-
d.reason = fmt.Sprintf("error getting clock speed for device %s", uuid)
120-
return d
117+
cr.err = err
118+
cr.health = apiv1.HealthStateTypeUnhealthy
119+
cr.reason = fmt.Sprintf("error getting clock speed for device %s", uuid)
120+
return cr
121121
}
122-
d.ClockSpeeds = append(d.ClockSpeeds, clockSpeed)
122+
cr.ClockSpeeds = append(cr.ClockSpeeds, clockSpeed)
123123

124124
metricGraphicsMHz.With(prometheus.Labels{pkgmetrics.MetricLabelKey: uuid}).Set(float64(clockSpeed.GraphicsMHz))
125125
metricMemoryMHz.With(prometheus.Labels{pkgmetrics.MetricLabelKey: uuid}).Set(float64(clockSpeed.MemoryMHz))
126126
}
127127

128-
d.health = apiv1.HealthStateTypeHealthy
129-
d.reason = fmt.Sprintf("all %d GPU(s) were checked, no clock speed issue found", len(devs))
128+
cr.health = apiv1.HealthStateTypeHealthy
129+
cr.reason = fmt.Sprintf("all %d GPU(s) were checked, no clock speed issue found", len(devs))
130130

131-
return d
131+
return cr
132132
}
133133

134-
var _ components.CheckResult = &Data{}
134+
var _ components.CheckResult = &checkResult{}
135135

136-
type Data struct {
136+
type checkResult struct {
137137
ClockSpeeds []nvidianvml.ClockSpeed `json:"clock_speeds,omitempty"`
138138

139139
// timestamp of the last check
@@ -147,11 +147,11 @@ type Data struct {
147147
reason string
148148
}
149149

150-
func (d *Data) String() string {
151-
if d == nil {
150+
func (cr *checkResult) String() string {
151+
if cr == nil {
152152
return ""
153153
}
154-
if len(d.ClockSpeeds) == 0 {
154+
if len(cr.ClockSpeeds) == 0 {
155155
return "no data"
156156
}
157157

@@ -160,7 +160,7 @@ func (d *Data) String() string {
160160
table.SetAlignment(tablewriter.ALIGN_CENTER)
161161

162162
table.SetHeader([]string{"GPU UUID", "Graphics MHz", "Memory MHz", "Graphics Supported", "Memory Supported"})
163-
for _, clockSpeed := range d.ClockSpeeds {
163+
for _, clockSpeed := range cr.ClockSpeeds {
164164
table.Append([]string{
165165
clockSpeed.UUID,
166166
fmt.Sprintf("%d MHz", clockSpeed.GraphicsMHz),
@@ -175,46 +175,48 @@ func (d *Data) String() string {
175175
return buf.String()
176176
}
177177

178-
func (d *Data) Summary() string {
179-
if d == nil {
178+
func (cr *checkResult) Summary() string {
179+
if cr == nil {
180180
return ""
181181
}
182-
return d.reason
182+
return cr.reason
183183
}
184184

185-
func (d *Data) HealthState() apiv1.HealthStateType {
186-
if d == nil {
185+
func (cr *checkResult) HealthState() apiv1.HealthStateType {
186+
if cr == nil {
187187
return ""
188188
}
189-
return d.health
189+
return cr.health
190190
}
191191

192-
func (d *Data) getError() string {
193-
if d == nil || d.err == nil {
192+
func (cr *checkResult) getError() string {
193+
if cr == nil || cr.err == nil {
194194
return ""
195195
}
196-
return d.err.Error()
196+
return cr.err.Error()
197197
}
198198

199-
func (d *Data) getLastHealthStates() apiv1.HealthStates {
200-
if d == nil {
199+
func (cr *checkResult) getLastHealthStates() apiv1.HealthStates {
200+
if cr == nil {
201201
return apiv1.HealthStates{
202202
{
203-
Name: Name,
204-
Health: apiv1.HealthStateTypeHealthy,
205-
Reason: "no data yet",
203+
Component: Name,
204+
Name: Name,
205+
Health: apiv1.HealthStateTypeHealthy,
206+
Reason: "no data yet",
206207
},
207208
}
208209
}
209210

210211
state := apiv1.HealthState{
211-
Name: Name,
212-
Reason: d.reason,
213-
Error: d.getError(),
214-
Health: d.health,
212+
Component: Name,
213+
Name: Name,
214+
Reason: cr.reason,
215+
Error: cr.getError(),
216+
Health: cr.health,
215217
}
216218

217-
b, _ := json.Marshal(d)
219+
b, _ := json.Marshal(cr)
218220
state.DeprecatedExtraInfo = map[string]string{
219221
"data": string(b),
220222
"encoding": "json",

0 commit comments

Comments
 (0)