Skip to content

Commit 3be84e3

Browse files
authored
detect node label to disable cgpu (#32)
1 parent 6ca7aa8 commit 3be84e3

File tree

4 files changed

+50
-28
lines changed

4 files changed

+50
-28
lines changed

pkg/gpu/nvidia/allocate.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
121121
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
122122
},
123123
}
124+
if m.disableCGPUIsolation {
125+
response.Envs["CGPU_DISABLE"] = "true"
126+
}
124127
responses.ContainerResponses = append(responses.ContainerResponses, &response)
125128
}
126129

pkg/gpu/nvidia/const.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@ const (
2121
containerLogPathLabelKey = "io.kubernetes.container.logpath"
2222
sandboxIDLabelKey = "io.kubernetes.sandbox.id"
2323

24-
envNVGPU = "NVIDIA_VISIBLE_DEVICES"
25-
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
26-
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
27-
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
28-
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
29-
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
30-
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
31-
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
24+
envNVGPU = "NVIDIA_VISIBLE_DEVICES"
25+
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
26+
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
27+
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
28+
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
29+
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
30+
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
31+
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
32+
EnvNodeLabelForDisableCGPU = "cgpu.disable.isolation"
3233

3334
GiBPrefix = MemoryUnit("GiB")
3435
MiBPrefix = MemoryUnit("MiB")

pkg/gpu/nvidia/podmanager.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,21 @@ func kubeInit() {
5757

5858
}
5959

60+
// disableCGPUIsolationOrNot reports whether cGPU isolation should be disabled
// on this node, based on a node label.
//
// It fetches the Node object named nodeName through the package-level
// clientset and looks up the label keyed by EnvNodeLabelForDisableCGPU.
// It returns true only when that label is present and its value is exactly
// the string "true"; a missing label or any other value yields false.
//
// If the Node lookup fails, it returns false together with the lookup error.
func disableCGPUIsolationOrNot() (bool, error) {
61+
disable := false
62+
node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
63+
if err != nil {
64+
return disable, err
65+
}
66+
labels := node.ObjectMeta.Labels
67+
value, ok := labels[EnvNodeLabelForDisableCGPU]
68+
// Only the literal value "true" disables isolation; the comparison is
// case-sensitive and no other truthy spellings are accepted.
if ok && value == "true" {
69+
log.Infof("enable gpusharing mode and disable cgpu mode")
70+
disable = true
71+
}
72+
return disable, nil
73+
}
74+
6075
func patchGPUCount(gpuCount int) error {
6176
node, err := clientset.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
6277
if err != nil {

pkg/gpu/nvidia/server.go

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,16 @@ import (
1616

1717
// NvidiaDevicePlugin implements the Kubernetes device plugin API
1818
type NvidiaDevicePlugin struct {
19-
devs []*pluginapi.Device
20-
realDevNames []string
21-
devNameMap map[string]uint
22-
devIndxMap map[uint]string
23-
socket string
24-
mps bool
25-
healthCheck bool
26-
27-
stop chan struct{}
28-
health chan *pluginapi.Device
19+
devs []*pluginapi.Device
20+
realDevNames []string
21+
devNameMap map[string]uint
22+
devIndxMap map[uint]string
23+
socket string
24+
mps bool
25+
healthCheck bool
26+
disableCGPUIsolation bool
27+
stop chan struct{}
28+
health chan *pluginapi.Device
2929

3030
server *grpc.Server
3131
sync.RWMutex
@@ -47,17 +47,20 @@ func NewNvidiaDevicePlugin(mps, healthCheck bool) (*NvidiaDevicePlugin, error) {
4747
if err != nil {
4848
return nil, err
4949
}
50-
50+
disableCGPUIsolation, err := disableCGPUIsolationOrNot()
51+
if err != nil {
52+
return nil, err
53+
}
5154
return &NvidiaDevicePlugin{
52-
devs: devs,
53-
realDevNames: devList,
54-
devNameMap: devNameMap,
55-
socket: serverSock,
56-
mps: mps,
57-
healthCheck: healthCheck,
58-
59-
stop: make(chan struct{}),
60-
health: make(chan *pluginapi.Device),
55+
devs: devs,
56+
realDevNames: devList,
57+
devNameMap: devNameMap,
58+
socket: serverSock,
59+
mps: mps,
60+
healthCheck: healthCheck,
61+
disableCGPUIsolation: disableCGPUIsolation,
62+
stop: make(chan struct{}),
63+
health: make(chan *pluginapi.Device),
6164
}, nil
6265
}
6366

0 commit comments

Comments
 (0)