Skip to content

Commit d6944a5

Browse files
authored
feat(login): populate resource spec cpu/memory + GPU counts nvidia.com/gpu (#668)
Signed-off-by: Gyuho Lee <[email protected]> --------- Signed-off-by: Gyuho Lee <[email protected]>
1 parent fd94416 commit d6944a5

File tree

11 files changed

+199
-9
lines changed

11 files changed

+199
-9
lines changed

api/v1/login.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@ package v1
22

33
// LoginRequest is the request for the login request.
44
type LoginRequest struct {
5-
Token string `json:"token"`
6-
Network MachineNetworkSpec `json:"network"`
7-
Location MachineLocation `json:"location"`
8-
CPUInfo MachineCPUInfo `json:"cpuInfo"`
9-
GPUInfo MachineGPUInfo `json:"gpuInfo"`
10-
Provider string `json:"provider"`
11-
MachineInfo MachineInfo `json:"machineInfo"`
5+
Token string `json:"token"`
6+
Network MachineNetworkSpec `json:"network"`
7+
Location MachineLocation `json:"location"`
8+
CPUInfo MachineCPUInfo `json:"cpuInfo"`
9+
GPUInfo MachineGPUInfo `json:"gpuInfo"`
10+
Provider string `json:"provider"`
11+
MachineInfo MachineInfo `json:"machineInfo"`
12+
ResourceSpec map[string]string `json:"resourceSpec"`
1213
}
1314

1415
type MachineNetworkSpec struct {

cmd/gpud/command/login.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,16 @@ import (
66
"time"
77

88
"github.com/urfave/cli"
9+
corev1 "k8s.io/api/core/v1"
910

1011
apiv1 "github.com/leptonai/gpud/api/v1"
1112
client "github.com/leptonai/gpud/client/v1"
1213
"github.com/leptonai/gpud/pkg/config"
14+
pkgcpu "github.com/leptonai/gpud/pkg/cpu"
1315
gpudstate "github.com/leptonai/gpud/pkg/gpud-state"
1416
"github.com/leptonai/gpud/pkg/login"
17+
pkgmemory "github.com/leptonai/gpud/pkg/memory"
18+
nvidiaquery "github.com/leptonai/gpud/pkg/nvidia-query"
1519
"github.com/leptonai/gpud/pkg/server"
1620
"github.com/leptonai/gpud/pkg/sqlite"
1721
)
@@ -59,9 +63,32 @@ func cmdLogin(cliContext *cli.Context) error {
5963

6064
endpoint := cliContext.String("endpoint")
6165

66+
req := apiv1.LoginRequest{
67+
Token: token,
68+
ResourceSpec: map[string]string{},
69+
}
70+
71+
cpu, err := pkgcpu.GetSystemResourceLogicalCores()
72+
if err != nil {
73+
return fmt.Errorf("failed to get system resource logical cores: %w", err)
74+
}
75+
req.ResourceSpec[string(corev1.ResourceCPU)] = cpu
76+
77+
memory, err := pkgmemory.GetSystemResourceMemoryTotal()
78+
if err != nil {
79+
return fmt.Errorf("failed to get system resource memory total: %w", err)
80+
}
81+
req.ResourceSpec[string(corev1.ResourceMemory)] = memory
82+
83+
gpuCnt, err := nvidiaquery.GetSystemResourceGPUCount()
84+
if err != nil {
85+
return fmt.Errorf("failed to get system resource gpu count: %w", err)
86+
}
87+
req.ResourceSpec["nvidia.com/gpu"] = gpuCnt
88+
6289
// machine ID has not been assigned yet
6390
// thus request one and blocks until the login request is processed
64-
loginResp, err := login.SendRequest(rootCtx, endpoint, apiv1.LoginRequest{Token: token})
91+
loginResp, err := login.SendRequest(rootCtx, endpoint, req)
6592
if err != nil {
6693
return err
6794
}

pkg/cpu/doc.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
// Package cpu provides utilities for CPU usage.
2+
package cpu

pkg/cpu/utils.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package cpu
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"time"
7+
8+
"github.com/shirou/gopsutil/v4/cpu"
9+
"k8s.io/apimachinery/pkg/api/resource"
10+
)
11+
12+
// GetSystemResourceLogicalCores returns the system CPU resource of the machine
13+
// with the logical core counts, using the type defined in "corev1.ResourceName"
14+
// in https://pkg.go.dev/k8s.io/api/core/v1#ResourceName.
15+
// It represents the CPU, in cores (500m = .5 cores).
16+
// Must be parsed using the "resource.ParseQuantity" function in https://pkg.go.dev/k8s.io/apimachinery/pkg/api/resource.
17+
func GetSystemResourceLogicalCores() (string, error) {
18+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
19+
defer cancel()
20+
21+
counts, err := cpu.CountsWithContext(ctx, true)
22+
if err != nil {
23+
return "", fmt.Errorf("failed to get CPU cores count: %w", err)
24+
}
25+
26+
qty := resource.NewQuantity(int64(counts), resource.DecimalSI)
27+
return qty.String(), nil
28+
}

pkg/cpu/utils_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package cpu
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
"k8s.io/apimachinery/pkg/api/resource"
8+
)
9+
10+
func TestGetSystemResource(t *testing.T) {
11+
cpu, err := GetSystemResourceLogicalCores()
12+
assert.NoError(t, err)
13+
14+
cpuQty, err := resource.ParseQuantity(cpu)
15+
assert.NoError(t, err)
16+
t.Logf("cpu: %s", cpuQty.String())
17+
}

pkg/memory/utils.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package memory
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"time"
7+
8+
"github.com/shirou/gopsutil/v4/mem"
9+
"k8s.io/apimachinery/pkg/api/resource"
10+
)
11+
12+
// GetSystemResourceMemoryTotal returns the system memory resource of the machine
13+
// for the total memory size, using the type defined in "corev1.ResourceName"
14+
// in https://pkg.go.dev/k8s.io/api/core/v1#ResourceName.
15+
// It represents the Memory, in bytes (500Gi = 500GiB = 500 * 1024 * 1024 * 1024).
16+
// Must be parsed using the "resource.ParseQuantity" function in https://pkg.go.dev/k8s.io/apimachinery/pkg/api/resource.
17+
func GetSystemResourceMemoryTotal() (string, error) {
18+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
19+
defer cancel()
20+
21+
vm, err := mem.VirtualMemoryWithContext(ctx)
22+
if err != nil {
23+
return "", fmt.Errorf("failed to get memory: %w", err)
24+
}
25+
26+
qty := resource.NewQuantity(int64(vm.Total), resource.DecimalSI)
27+
return qty.String(), nil
28+
}

pkg/memory/utils_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package memory
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
"k8s.io/apimachinery/pkg/api/resource"
8+
)
9+
10+
func TestGetSystemResource(t *testing.T) {
11+
mem, err := GetSystemResourceMemoryTotal()
12+
assert.NoError(t, err)
13+
14+
memQty, err := resource.ParseQuantity(mem)
15+
assert.NoError(t, err)
16+
t.Logf("mem: %s", memQty.String())
17+
}

pkg/nvidia-query/device_count.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
package query
22

33
import (
4+
"errors"
45
"os"
56
"path/filepath"
67
"regexp"
78
)
89

910
func CountAllDevicesFromDevDir() (int, error) {
10-
return countAllDevicesFromDir("/dev")
11+
dir := "/dev"
12+
if _, err := os.Stat(dir); errors.Is(err, os.ErrNotExist) {
13+
return 0, nil
14+
}
15+
return countAllDevicesFromDir(dir)
1116
}
1217

1318
// "checkPermissions" in "nvvs/plugin_src/software/Software.cpp"

pkg/nvidia-query/device_count_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,11 @@ func TestCountAllDevicesFromDir(t *testing.T) {
8484
t.Errorf("expected %d devices, but got %d", devCnt, count)
8585
}
8686
}
87+
88+
func TestCountAllDevicesFromDevDir(t *testing.T) {
89+
devCnt, err := CountAllDevicesFromDevDir()
90+
if err != nil {
91+
t.Fatalf("CountAllDevicesFromDevDir returned an error: %v", err)
92+
}
93+
t.Logf("CountAllDevicesFromDevDir: %d", devCnt)
94+
}

pkg/nvidia-query/utils.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package query
2+
3+
import (
4+
"github.com/leptonai/gpud/pkg/log"
5+
nvidianvml "github.com/leptonai/gpud/pkg/nvidia-query/nvml"
6+
7+
"k8s.io/apimachinery/pkg/api/resource"
8+
)
9+
10+
// GetSystemResourceLogicalCores returns the system GPU resource of the machine
11+
// with the GPU count, using the type defined in "corev1.ResourceName"
12+
// in https://pkg.go.dev/k8s.io/api/core/v1#ResourceName.
13+
// It represents the GPU count with the key "nvidia.com/gpu" or "nvidia.com/gpu.count".
14+
// Must be parsed using the "resource.ParseQuantity" function in https://pkg.go.dev/k8s.io/apimachinery/pkg/api/resource.
15+
//
16+
// This is different than the device count in DCGM.
17+
// ref. "CountDevEntry" in "nvvs/plugin_src/software/Software.cpp"
18+
// ref. https://github.com/NVIDIA/DCGM/blob/903d745504f50153be8293f8566346f9de3b3c93/nvvs/plugin_src/software/Software.cpp#L220-L249
19+
func GetSystemResourceGPUCount() (string, error) {
20+
nvmlInstance, err := nvidianvml.NewInstanceV2()
21+
if err != nil {
22+
return "", err
23+
}
24+
defer func() {
25+
if err := nvmlInstance.Shutdown(); err != nil {
26+
log.Logger.Warnw("failed to shutdown nvml instance", "error", err)
27+
}
28+
}()
29+
30+
deviceCount := len(nvmlInstance.Devices())
31+
32+
qty := resource.NewQuantity(int64(deviceCount), resource.DecimalSI)
33+
return qty.String(), nil
34+
}

0 commit comments

Comments
 (0)