Skip to content

Commit 945dc05

Browse files
gyuho and eahydra authored
feat(components/*): refactor to add "Check" method for one-time operations, simplify Component interfaces (#651)
Will come in handy to deduplicate the code base for `gpud scan` and gossip messages:

- **feat(components): add registry All method**
- **feat(components): rename HealthStateCheckResult, add "Check" to interface**
- **feat(os): refactor to use "Check" method**

Address #511. Also, address #472.

---------

Signed-off-by: Gyuho Lee <[email protected]>
Co-authored-by: Joseph <[email protected]>
1 parent dce0cf5 commit 945dc05

File tree

104 files changed

+15191
-5978
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

104 files changed

+15191
-5978
lines changed

api/v1/types.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ import (
1010
type HealthStateType string
1111

1212
const (
13-
StateTypeHealthy HealthStateType = "Healthy"
14-
StateTypeUnhealthy HealthStateType = "Unhealthy"
15-
StateTypeDegraded HealthStateType = "Degraded"
16-
StateTypeInitializing HealthStateType = "Initializing"
13+
HealthStateTypeHealthy HealthStateType = "Healthy"
14+
HealthStateTypeUnhealthy HealthStateType = "Unhealthy"
15+
HealthStateTypeDegraded HealthStateType = "Degraded"
16+
HealthStateTypeInitializing HealthStateType = "Initializing"
1717
)
1818

1919
// HealthState represents the health state of a component.

cmd/gpud/command/is-nvidia.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import (
55
"fmt"
66
"time"
77

8-
nvidia_query "github.com/leptonai/gpud/pkg/nvidia-query"
8+
nvidiaquery "github.com/leptonai/gpud/pkg/nvidia-query"
99

1010
"github.com/urfave/cli"
1111
)
@@ -14,7 +14,7 @@ func cmdIsNvidia(cliContext *cli.Context) error {
1414
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
1515
defer cancel()
1616

17-
nvidiaInstalled, err := nvidia_query.GPUsInstalled(ctx)
17+
nvidiaInstalled, err := nvidiaquery.GPUsInstalled(ctx)
1818
if err != nil {
1919
return err
2020
}

cmd/gpud/command/run.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import (
1818
gpud_manager "github.com/leptonai/gpud/pkg/gpud-manager"
1919
gpudstate "github.com/leptonai/gpud/pkg/gpud-state"
2020
"github.com/leptonai/gpud/pkg/log"
21-
lepServer "github.com/leptonai/gpud/pkg/server"
21+
gpudserver "github.com/leptonai/gpud/pkg/server"
2222
"github.com/leptonai/gpud/pkg/sqlite"
2323
pkd_systemd "github.com/leptonai/gpud/pkg/systemd"
2424
"github.com/leptonai/gpud/version"
@@ -86,7 +86,7 @@ func cmdRun(cliContext *cli.Context) error {
8686
start := time.Now()
8787

8888
signals := make(chan os.Signal, 2048)
89-
serverC := make(chan *lepServer.Server, 1)
89+
serverC := make(chan *gpudserver.Server, 1)
9090

9191
log.Logger.Infof("starting gpud %v", version.Version)
9292

@@ -128,7 +128,7 @@ func cmdRun(cliContext *cli.Context) error {
128128
log.Logger.Warnw("machine ID not found, running in local mode not connected to any control plane")
129129
}
130130

131-
server, err := lepServer.New(rootCtx, cfg, cliContext.String("endpoint"), uid, m)
131+
server, err := gpudserver.New(rootCtx, cfg, cliContext.String("endpoint"), uid, m)
132132
if err != nil {
133133
return err
134134
}
@@ -144,5 +144,6 @@ func cmdRun(cliContext *cli.Context) error {
144144

145145
log.Logger.Infow("successfully booted", "tookSeconds", time.Since(start).Seconds())
146146
<-done
147+
147148
return nil
148149
}

components/accelerator/nvidia/bad-envs/component.go

Lines changed: 80 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
package badenvs
33

44
import (
5+
"bytes"
56
"context"
67
"encoding/json"
78
"fmt"
@@ -10,9 +11,12 @@ import (
1011
"sync"
1112
"time"
1213

14+
"github.com/olekukonko/tablewriter"
15+
1316
apiv1 "github.com/leptonai/gpud/api/v1"
1417
"github.com/leptonai/gpud/components"
1518
"github.com/leptonai/gpud/pkg/log"
19+
nvidianvml "github.com/leptonai/gpud/pkg/nvidia-query/nvml"
1620
)
1721

1822
const Name = "accelerator-nvidia-bad-envs"
@@ -23,23 +27,26 @@ type component struct {
2327
ctx context.Context
2428
cancel context.CancelFunc
2529

30+
nvmlInstance nvidianvml.InstanceV2
31+
2632
// returns true if the specified environment variable is set
2733
checkEnvFunc func(key string) bool
2834

2935
lastMu sync.RWMutex
3036
lastData *Data
3137
}
3238

33-
func New(ctx context.Context) components.Component {
34-
cctx, ccancel := context.WithCancel(ctx)
35-
return &component{
36-
ctx: cctx,
37-
cancel: ccancel,
38-
39+
func New(gpudInstance *components.GPUdInstance) (components.Component, error) {
40+
cctx, ccancel := context.WithCancel(gpudInstance.RootCtx)
41+
c := &component{
42+
ctx: cctx,
43+
cancel: ccancel,
44+
nvmlInstance: gpudInstance.NVMLInstance,
3945
checkEnvFunc: func(key string) bool {
4046
return os.Getenv(key) == "1"
4147
},
4248
}
49+
return c, nil
4350
}
4451

4552
func (c *component) Name() string { return Name }
@@ -48,8 +55,10 @@ func (c *component) Start() error {
4855
go func() {
4956
ticker := time.NewTicker(time.Minute)
5057
defer ticker.Stop()
58+
5159
for {
52-
c.CheckOnce()
60+
_ = c.Check()
61+
5362
select {
5463
case <-c.ctx.Done():
5564
return
@@ -60,11 +69,11 @@ func (c *component) Start() error {
6069
return nil
6170
}
6271

63-
func (c *component) HealthStates(ctx context.Context) (apiv1.HealthStates, error) {
72+
func (c *component) LastHealthStates() apiv1.HealthStates {
6473
c.lastMu.RLock()
6574
lastData := c.lastData
6675
c.lastMu.RUnlock()
67-
return lastData.getHealthStates()
76+
return lastData.getLastHealthStates()
6877
}
6978

7079
func (c *component) Events(ctx context.Context, since time.Time) (apiv1.Events, error) {
@@ -95,19 +104,29 @@ var BAD_CUDA_ENV_KEYS = map[string]string{
95104
"OPENCL_PROFILE": "Enables OpenCL profiling.",
96105
}
97106

98-
// CheckOnce checks the current pods
99-
// run this periodically
100-
func (c *component) CheckOnce() {
101-
log.Logger.Infow("checking memory")
102-
d := Data{
107+
func (c *component) Check() components.CheckResult {
108+
log.Logger.Infow("checking nvidia gpu bad env variables")
109+
110+
d := &Data{
103111
ts: time.Now().UTC(),
104112
}
105113
defer func() {
106114
c.lastMu.Lock()
107-
c.lastData = &d
115+
c.lastData = d
108116
c.lastMu.Unlock()
109117
}()
110118

119+
if c.nvmlInstance == nil {
120+
d.health = apiv1.HealthStateTypeHealthy
121+
d.reason = "NVIDIA NVML instance is nil"
122+
return d
123+
}
124+
if !c.nvmlInstance.NVMLExists() {
125+
d.health = apiv1.HealthStateTypeHealthy
126+
d.reason = "NVIDIA NVML is not loaded"
127+
return d
128+
}
129+
111130
foundBadEnvsForCUDA := make(map[string]string)
112131
for k, desc := range BAD_CUDA_ENV_KEYS {
113132
if c.checkEnvFunc(k) {
@@ -118,8 +137,6 @@ func (c *component) CheckOnce() {
118137
d.FoundBadEnvsForCUDA = foundBadEnvsForCUDA
119138
}
120139

121-
d.healthy = true
122-
123140
if len(foundBadEnvsForCUDA) == 0 {
124141
d.reason = "no bad envs found"
125142
} else {
@@ -129,8 +146,13 @@ func (c *component) CheckOnce() {
129146
}
130147
d.reason = strings.Join(kvs, "; ")
131148
}
149+
150+
d.health = apiv1.HealthStateTypeHealthy
151+
return d
132152
}
133153

154+
var _ components.CheckResult = &Data{}
155+
134156
type Data struct {
135157
// FoundBadEnvsForCUDA is a map of environment variables that are known to hurt CUDA.
136158
// that is set globally for the host.
@@ -143,44 +165,74 @@ type Data struct {
143165
err error
144166

145167
// tracks the healthy evaluation result of the last check
146-
healthy bool
168+
health apiv1.HealthStateType
147169
// tracks the reason of the last check
148170
reason string
149171
}
150172

173+
func (d *Data) String() string {
174+
if d == nil {
175+
return ""
176+
}
177+
if len(d.FoundBadEnvsForCUDA) == 0 {
178+
return "no bad envs found"
179+
}
180+
181+
buf := bytes.NewBuffer(nil)
182+
table := tablewriter.NewWriter(buf)
183+
table.SetAlignment(tablewriter.ALIGN_CENTER)
184+
table.SetHeader([]string{"Found Env Key", "Description"})
185+
for k, v := range d.FoundBadEnvsForCUDA {
186+
table.Append([]string{k, v})
187+
}
188+
table.Render()
189+
190+
return buf.String()
191+
}
192+
193+
func (d *Data) Summary() string {
194+
if d == nil {
195+
return ""
196+
}
197+
return d.reason
198+
}
199+
200+
func (d *Data) HealthState() apiv1.HealthStateType {
201+
if d == nil {
202+
return ""
203+
}
204+
return d.health
205+
}
206+
151207
func (d *Data) getError() string {
152208
if d == nil || d.err == nil {
153209
return ""
154210
}
155211
return d.err.Error()
156212
}
157213

158-
func (d *Data) getHealthStates() (apiv1.HealthStates, error) {
214+
func (d *Data) getLastHealthStates() apiv1.HealthStates {
159215
if d == nil {
160-
return []apiv1.HealthState{
216+
return apiv1.HealthStates{
161217
{
162218
Name: Name,
163-
Health: apiv1.StateTypeHealthy,
219+
Health: apiv1.HealthStateTypeHealthy,
164220
Reason: "no data yet",
165221
},
166-
}, nil
222+
}
167223
}
168224

169225
state := apiv1.HealthState{
170226
Name: Name,
171227
Reason: d.reason,
172228
Error: d.getError(),
173-
174-
Health: apiv1.StateTypeHealthy,
175-
}
176-
if !d.healthy {
177-
state.Health = apiv1.StateTypeUnhealthy
229+
Health: d.health,
178230
}
179231

180232
b, _ := json.Marshal(d)
181233
state.DeprecatedExtraInfo = map[string]string{
182234
"data": string(b),
183235
"encoding": "json",
184236
}
185-
return []apiv1.HealthState{state}, nil
237+
return apiv1.HealthStates{state}
186238
}

0 commit comments

Comments
 (0)