Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions pkg/scheduler/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,26 @@ import (

"github.com/Project-HAMi/HAMi/pkg/device"
"github.com/Project-HAMi/HAMi/pkg/scheduler/policy"

"github.com/Project-HAMi/HAMi/pkg/device/cambricon"
"github.com/Project-HAMi/HAMi/pkg/device/enflame"
"github.com/Project-HAMi/HAMi/pkg/device/hygon"
"github.com/Project-HAMi/HAMi/pkg/device/iluvatar"
"github.com/Project-HAMi/HAMi/pkg/device/metax"
"github.com/Project-HAMi/HAMi/pkg/device/mthreads"
"github.com/Project-HAMi/HAMi/pkg/device/nvidia"
)

// vendorNoUseAnnoKeyMap maps a vendor's "no use UUID" node annotation key to the
// device vendor name(s) that the annotation applies to. rmDeviceByNodeAnnotation
// looks up each key in a node's annotations and, when present, treats the value
// as a comma-separated list of device UUIDs to exclude for the listed vendors.
// A single annotation key may cover several device vendor names (see Metax).
var vendorNoUseAnnoKeyMap = map[string][]string{
nvidia.GPUNoUseUUID: {nvidia.NvidiaGPUDevice},
cambricon.MLUNoUseUUID: {cambricon.CambriconMLUDevice},
hygon.DCUNoUseUUID: {hygon.HygonDCUDevice},
iluvatar.IluvatarNoUseUUID: {iluvatar.IluvatarGPUDevice},
enflame.EnflameNoUseUUID: {enflame.EnflameGPUDevice},
mthreads.MthreadsNoUseUUID: {mthreads.MthreadsGPUDevice},
metax.MetaxNoUseUUID: {metax.MetaxGPUDevice, metax.MetaxSGPUDevice},
}

type NodeUsage struct {
Node *corev1.Node
Devices policy.DeviceUsageList
Expand Down Expand Up @@ -73,6 +91,50 @@ func (m *nodeManager) addNode(nodeID string, nodeInfo *device.NodeInfo) {
} else {
m.nodes[nodeID] = nodeInfo
}
m.nodes[nodeID].Devices = rmDeviceByNodeAnnotation(m.nodes[nodeID])
}

// rmDeviceByNodeAnnotation returns the devices of nodeInfo that have not been
// disabled through a vendor "no use UUID" annotation on the node.
//
// For every annotation key of vendorNoUseAnnoKeyMap that is present on the
// node, the annotation value is parsed as a comma-separated list of device
// UUIDs. Surrounding whitespace of each entry is ignored and empty entries are
// skipped. A device is dropped when its ID appears in the list registered for
// its DeviceVendor.
//
// When nothing is disabled (no node, no annotations, or no matching non-empty
// entry) nodeInfo.Devices is returned as is; otherwise a newly allocated slice
// holding the remaining devices is returned. nodeInfo itself is not modified.
func rmDeviceByNodeAnnotation(nodeInfo *device.NodeInfo) []device.DeviceInfo {
	// vendor name -> set of device UUIDs disabled for that vendor.
	vendorWithDisableGPUUUIDMap := make(map[string]map[string]bool)
	if nodeInfo.Node != nil && nodeInfo.Node.Annotations != nil {
		for annoKey, vendors := range vendorNoUseAnnoKeyMap {
			klog.V(5).Infof("Current annokey is %s, and vendor is %v", annoKey, vendors)
			value, ok := nodeInfo.Node.Annotations[annoKey]
			if !ok {
				continue
			}
			disableGPUUUIDList := strings.Split(value, ",")
			klog.V(5).Infof("Disable gpu uuid list is: %v", disableGPUUUIDList)
			for _, disableGPUUUID := range disableGPUUUIDList {
				id := strings.TrimSpace(disableGPUUUID)
				if id == "" {
					continue
				}
				for _, vendor := range vendors {
					if vendorWithDisableGPUUUIDMap[vendor] == nil {
						vendorWithDisableGPUUUIDMap[vendor] = make(map[string]bool)
					}
					// Key on the trimmed UUID: values such as "a, b" must
					// still match the device ID "b" exactly.
					vendorWithDisableGPUUUIDMap[vendor][id] = true
				}
			}
		}
	}
	if len(vendorWithDisableGPUUUIDMap) == 0 {
		return nodeInfo.Devices
	}

	tmp := make([]device.DeviceInfo, 0, len(nodeInfo.Devices))
	for _, d := range nodeInfo.Devices {
		// Indexing a missing vendor yields a nil map, and reading from a nil
		// map yields false, so no separate presence check is needed.
		if vendorWithDisableGPUUUIDMap[d.DeviceVendor][d.ID] {
			klog.V(5).Infof("Disable gpu uuid is : %s", d.ID)
			continue
		}
		tmp = append(tmp, d)
	}
	return tmp
}

func (m *nodeManager) rmNodeDevices(nodeID string, deviceVendor string) {
Expand Down
102 changes: 99 additions & 3 deletions pkg/scheduler/nodes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,18 @@ package scheduler

import (
"fmt"
"reflect"
"strings"
"testing"

"github.com/Project-HAMi/HAMi/pkg/device"
"github.com/Project-HAMi/HAMi/pkg/scheduler/config"

"gotest.tools/v3/assert"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/Project-HAMi/HAMi/pkg/device"
"github.com/Project-HAMi/HAMi/pkg/device/metax"
"github.com/Project-HAMi/HAMi/pkg/device/nvidia"
"github.com/Project-HAMi/HAMi/pkg/scheduler/config"
)

func Test_addNode_ListNodes(t *testing.T) {
Expand Down Expand Up @@ -322,3 +327,94 @@ func Test_rmNodeDevices(t *testing.T) {
})
}
}

func Test_rmDeviceByNodeAnnotation(t *testing.T) {
id1 := "60151478-4709-4242-a8c1-a944252d194b"
id2 := "33c00a52-72ab-4b61-a7ce-43107588835b"
type args struct {
nodeInfo *device.NodeInfo
}
tests := []struct {
name string
args args
want []device.DeviceInfo
}{
{
name: "Test remove one device",
args: args{
nodeInfo: &device.NodeInfo{
Node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: id1}}},
Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
},
},
want: []device.DeviceInfo{},
},
{
name: "Test remove two devices",
args: args{
nodeInfo: &device.NodeInfo{
Node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: strings.Join([]string{id1, id2}, ",")}}},
Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}, {DeviceVendor: nvidia.NvidiaGPUDevice, ID: id2}},
},
},
want: []device.DeviceInfo{},
},
{
name: "Test remove one device and keep one device",
args: args{
nodeInfo: &device.NodeInfo{
Node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: strings.Join([]string{id2}, ",")}}},
Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}, {DeviceVendor: nvidia.NvidiaGPUDevice, ID: id2}},
},
},
want: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
},
{
name: "Test no removing device, case1",
args: args{
nodeInfo: &device.NodeInfo{
Node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{"test-key": ""}}},
Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
},
},
want: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
},
{
name: "Test no removing device, case2",
args: args{
nodeInfo: &device.NodeInfo{
Node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: id2}}},
Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
},
},
want: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
},
{
name: "Test removing metax device, case1",
args: args{
nodeInfo: &device.NodeInfo{
Node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{metax.MetaxNoUseUUID: id1}}},
Devices: []device.DeviceInfo{{DeviceVendor: metax.MetaxGPUDevice, ID: id1}},
},
},
want: []device.DeviceInfo{},
},
{
name: "Test removing metax device, case2",
args: args{
nodeInfo: &device.NodeInfo{
Node: &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{metax.MetaxNoUseUUID: id1}}},
Devices: []device.DeviceInfo{{DeviceVendor: metax.MetaxSGPUDevice, ID: id1}},
},
},
want: []device.DeviceInfo{},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := rmDeviceByNodeAnnotation(tt.args.nodeInfo); !reflect.DeepEqual(got, tt.want) {
t.Errorf("rmDeviceByNodeAnnotation() = %v, want %v", got, tt.want)
}
})
}
}