@@ -31,8 +31,8 @@ type component struct {
31
31
nvmlInstance nvidianvml.InstanceV2
32
32
getClockSpeedFunc func (uuid string , dev device.Device ) (nvidianvml.ClockSpeed , error )
33
33
34
- lastMu sync.RWMutex
35
- lastData * Data
34
+ lastMu sync.RWMutex
35
+ lastCheckResult * checkResult
36
36
}
37
37
38
38
func New (gpudInstance * components.GPUdInstance ) (components.Component , error ) {
@@ -68,9 +68,9 @@ func (c *component) Start() error {
68
68
69
69
func (c * component ) LastHealthStates () apiv1.HealthStates {
70
70
c .lastMu .RLock ()
71
- lastData := c .lastData
71
+ lastCheckResult := c .lastCheckResult
72
72
c .lastMu .RUnlock ()
73
- return lastData .getLastHealthStates ()
73
+ return lastCheckResult .getLastHealthStates ()
74
74
}
75
75
76
76
func (c * component ) Events (ctx context.Context , since time.Time ) (apiv1.Events , error ) {
@@ -88,24 +88,24 @@ func (c *component) Close() error {
88
88
func (c * component ) Check () components.CheckResult {
89
89
log .Logger .Infow ("checking nvidia gpu clock speed" )
90
90
91
- d := & Data {
91
+ cr := & checkResult {
92
92
ts : time .Now ().UTC (),
93
93
}
94
94
defer func () {
95
95
c .lastMu .Lock ()
96
- c .lastData = d
96
+ c .lastCheckResult = cr
97
97
c .lastMu .Unlock ()
98
98
}()
99
99
100
100
if c .nvmlInstance == nil {
101
- d .health = apiv1 .HealthStateTypeHealthy
102
- d .reason = "NVIDIA NVML instance is nil"
103
- return d
101
+ cr .health = apiv1 .HealthStateTypeHealthy
102
+ cr .reason = "NVIDIA NVML instance is nil"
103
+ return cr
104
104
}
105
105
if ! c .nvmlInstance .NVMLExists () {
106
- d .health = apiv1 .HealthStateTypeHealthy
107
- d .reason = "NVIDIA NVML is not loaded"
108
- return d
106
+ cr .health = apiv1 .HealthStateTypeHealthy
107
+ cr .reason = "NVIDIA NVML is not loaded"
108
+ return cr
109
109
}
110
110
111
111
devs := c .nvmlInstance .Devices ()
@@ -114,26 +114,26 @@ func (c *component) Check() components.CheckResult {
114
114
if err != nil {
115
115
log .Logger .Errorw ("error getting clock speed for device" , "uuid" , uuid , "error" , err )
116
116
117
- d .err = err
118
- d .health = apiv1 .HealthStateTypeUnhealthy
119
- d .reason = fmt .Sprintf ("error getting clock speed for device %s" , uuid )
120
- return d
117
+ cr .err = err
118
+ cr .health = apiv1 .HealthStateTypeUnhealthy
119
+ cr .reason = fmt .Sprintf ("error getting clock speed for device %s" , uuid )
120
+ return cr
121
121
}
122
- d .ClockSpeeds = append (d .ClockSpeeds , clockSpeed )
122
+ cr .ClockSpeeds = append (cr .ClockSpeeds , clockSpeed )
123
123
124
124
metricGraphicsMHz .With (prometheus.Labels {pkgmetrics .MetricLabelKey : uuid }).Set (float64 (clockSpeed .GraphicsMHz ))
125
125
metricMemoryMHz .With (prometheus.Labels {pkgmetrics .MetricLabelKey : uuid }).Set (float64 (clockSpeed .MemoryMHz ))
126
126
}
127
127
128
- d .health = apiv1 .HealthStateTypeHealthy
129
- d .reason = fmt .Sprintf ("all %d GPU(s) were checked, no clock speed issue found" , len (devs ))
128
+ cr .health = apiv1 .HealthStateTypeHealthy
129
+ cr .reason = fmt .Sprintf ("all %d GPU(s) were checked, no clock speed issue found" , len (devs ))
130
130
131
- return d
131
+ return cr
132
132
}
133
133
134
- var _ components.CheckResult = & Data {}
134
+ var _ components.CheckResult = & checkResult {}
135
135
136
- type Data struct {
136
+ type checkResult struct {
137
137
ClockSpeeds []nvidianvml.ClockSpeed `json:"clock_speeds,omitempty"`
138
138
139
139
// timestamp of the last check
@@ -147,11 +147,11 @@ type Data struct {
147
147
reason string
148
148
}
149
149
150
- func (d * Data ) String () string {
151
- if d == nil {
150
+ func (cr * checkResult ) String () string {
151
+ if cr == nil {
152
152
return ""
153
153
}
154
- if len (d .ClockSpeeds ) == 0 {
154
+ if len (cr .ClockSpeeds ) == 0 {
155
155
return "no data"
156
156
}
157
157
@@ -160,7 +160,7 @@ func (d *Data) String() string {
160
160
table .SetAlignment (tablewriter .ALIGN_CENTER )
161
161
162
162
table .SetHeader ([]string {"GPU UUID" , "Graphics MHz" , "Memory MHz" , "Graphics Supported" , "Memory Supported" })
163
- for _ , clockSpeed := range d .ClockSpeeds {
163
+ for _ , clockSpeed := range cr .ClockSpeeds {
164
164
table .Append ([]string {
165
165
clockSpeed .UUID ,
166
166
fmt .Sprintf ("%d MHz" , clockSpeed .GraphicsMHz ),
@@ -175,46 +175,48 @@ func (d *Data) String() string {
175
175
return buf .String ()
176
176
}
177
177
178
- func (d * Data ) Summary () string {
179
- if d == nil {
178
+ func (cr * checkResult ) Summary () string {
179
+ if cr == nil {
180
180
return ""
181
181
}
182
- return d .reason
182
+ return cr .reason
183
183
}
184
184
185
- func (d * Data ) HealthState () apiv1.HealthStateType {
186
- if d == nil {
185
+ func (cr * checkResult ) HealthState () apiv1.HealthStateType {
186
+ if cr == nil {
187
187
return ""
188
188
}
189
- return d .health
189
+ return cr .health
190
190
}
191
191
192
- func (d * Data ) getError () string {
193
- if d == nil || d .err == nil {
192
+ func (cr * checkResult ) getError () string {
193
+ if cr == nil || cr .err == nil {
194
194
return ""
195
195
}
196
- return d .err .Error ()
196
+ return cr .err .Error ()
197
197
}
198
198
199
- func (d * Data ) getLastHealthStates () apiv1.HealthStates {
200
- if d == nil {
199
+ func (cr * checkResult ) getLastHealthStates () apiv1.HealthStates {
200
+ if cr == nil {
201
201
return apiv1.HealthStates {
202
202
{
203
- Name : Name ,
204
- Health : apiv1 .HealthStateTypeHealthy ,
205
- Reason : "no data yet" ,
203
+ Component : Name ,
204
+ Name : Name ,
205
+ Health : apiv1 .HealthStateTypeHealthy ,
206
+ Reason : "no data yet" ,
206
207
},
207
208
}
208
209
}
209
210
210
211
state := apiv1.HealthState {
211
- Name : Name ,
212
- Reason : d .reason ,
213
- Error : d .getError (),
214
- Health : d .health ,
212
+ Component : Name ,
213
+ Name : Name ,
214
+ Reason : cr .reason ,
215
+ Error : cr .getError (),
216
+ Health : cr .health ,
215
217
}
216
218
217
- b , _ := json .Marshal (d )
219
+ b , _ := json .Marshal (cr )
218
220
state .DeprecatedExtraInfo = map [string ]string {
219
221
"data" : string (b ),
220
222
"encoding" : "json" ,
0 commit comments