diff --git a/pkg/scheduler/nodes.go b/pkg/scheduler/nodes.go
index 9ff4e8739..69149e0e2 100644
--- a/pkg/scheduler/nodes.go
+++ b/pkg/scheduler/nodes.go
@@ -26,8 +26,28 @@ import (
 
 	"github.com/Project-HAMi/HAMi/pkg/device"
 	"github.com/Project-HAMi/HAMi/pkg/scheduler/policy"
+
+	"github.com/Project-HAMi/HAMi/pkg/device/cambricon"
+	"github.com/Project-HAMi/HAMi/pkg/device/enflame"
+	"github.com/Project-HAMi/HAMi/pkg/device/hygon"
+	"github.com/Project-HAMi/HAMi/pkg/device/iluvatar"
+	"github.com/Project-HAMi/HAMi/pkg/device/metax"
+	"github.com/Project-HAMi/HAMi/pkg/device/mthreads"
+	"github.com/Project-HAMi/HAMi/pkg/device/nvidia"
 )
 
+// vendorNoUseAnnoKeyMap maps each node annotation key that carries a list of
+// device UUIDs to drop onto the DeviceVendor values that list applies to.
+var vendorNoUseAnnoKeyMap = map[string][]string{
+	nvidia.GPUNoUseUUID:         {nvidia.NvidiaGPUDevice},
+	cambricon.MLUNoUseUUID:      {cambricon.CambriconMLUDevice},
+	hygon.DCUNoUseUUID:          {hygon.HygonDCUDevice},
+	iluvatar.IluvatarNoUseUUID:  {iluvatar.IluvatarGPUDevice},
+	enflame.EnflameNoUseUUID:    {enflame.EnflameGPUDevice},
+	mthreads.MthreadsNoUseUUID:  {mthreads.MthreadsGPUDevice},
+	metax.MetaxNoUseUUID:        {metax.MetaxGPUDevice, metax.MetaxSGPUDevice},
+}
+
 type NodeUsage struct {
 	Node    *corev1.Node
 	Devices policy.DeviceUsageList
@@ -73,6 +93,57 @@ func (m *nodeManager) addNode(nodeID string, nodeInfo *device.NodeInfo) {
 	} else {
 		m.nodes[nodeID] = nodeInfo
 	}
+	m.nodes[nodeID].Devices = rmDeviceByNodeAnnotation(m.nodes[nodeID])
+}
+
+// rmDeviceByNodeAnnotation returns the devices of nodeInfo that are not
+// disabled through one of the node annotations in vendorNoUseAnnoKeyMap.
+//
+// Each matching annotation value is split on commas into device UUIDs;
+// surrounding whitespace is trimmed and empty entries are skipped. A device is
+// dropped when its ID is listed for its own DeviceVendor. If no UUID is
+// disabled, nodeInfo.Devices is returned as is; otherwise a new slice is
+// returned and nodeInfo.Devices is left untouched.
+func rmDeviceByNodeAnnotation(nodeInfo *device.NodeInfo) []device.DeviceInfo {
+	vendorWithDisableGPUUUIDMap := make(map[string]map[string]bool)
+	if nodeInfo.Node != nil && nodeInfo.Node.Annotations != nil {
+		for annoKey, vendors := range vendorNoUseAnnoKeyMap {
+			klog.V(5).Infof("Current annokey is %s, and vendor is %v", annoKey, vendors)
+			value, ok := nodeInfo.Node.Annotations[annoKey]
+			if !ok {
+				continue
+			}
+			disableGPUUUIDList := strings.Split(value, ",")
+			klog.V(5).Infof("Disable gpu uuid list is: %v", disableGPUUUIDList)
+			for _, disableGPUUUID := range disableGPUUUIDList {
+				// Store the trimmed ID, not the raw list entry, so a value such as
+				// "uuid-a, uuid-b" still matches the device IDs exactly.
+				id := strings.TrimSpace(disableGPUUUID)
+				if id == "" {
+					continue
+				}
+				for _, vendor := range vendors {
+					if vendorWithDisableGPUUUIDMap[vendor] == nil {
+						vendorWithDisableGPUUUIDMap[vendor] = make(map[string]bool)
+					}
+					vendorWithDisableGPUUUIDMap[vendor][id] = true
+				}
+			}
+		}
+	}
+	if len(vendorWithDisableGPUUUIDMap) == 0 {
+		return nodeInfo.Devices
+	}
+	tmp := make([]device.DeviceInfo, 0, len(nodeInfo.Devices))
+	for _, d := range nodeInfo.Devices {
+		// A vendor without entries yields a nil inner map, which reads as false.
+		if vendorWithDisableGPUUUIDMap[d.DeviceVendor][d.ID] {
+			klog.V(5).Infof("Disable gpu uuid is : %s", d.ID)
+			continue
+		}
+		tmp = append(tmp, d)
+	}
+	return tmp
 }
 
 func (m *nodeManager) rmNodeDevices(nodeID string, deviceVendor string) {
diff --git a/pkg/scheduler/nodes_test.go b/pkg/scheduler/nodes_test.go
index f8bd5880d..b5f38474b 100644
--- a/pkg/scheduler/nodes_test.go
+++ b/pkg/scheduler/nodes_test.go
@@ -18,13 +18,18 @@ package scheduler
 
 import (
 	"fmt"
+	"reflect"
+	"strings"
 	"testing"
 
-	"github.com/Project-HAMi/HAMi/pkg/device"
-	"github.com/Project-HAMi/HAMi/pkg/scheduler/config"
-
 	"gotest.tools/v3/assert"
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/Project-HAMi/HAMi/pkg/device"
+	"github.com/Project-HAMi/HAMi/pkg/device/metax"
+	"github.com/Project-HAMi/HAMi/pkg/device/nvidia"
+	"github.com/Project-HAMi/HAMi/pkg/scheduler/config"
 )
 
 func Test_addNode_ListNodes(t *testing.T) {
@@ -322,3 +327,104 @@ func Test_rmNodeDevices(t *testing.T) {
 		})
 	}
 }
+
+func Test_rmDeviceByNodeAnnotation(t *testing.T) {
+	id1 := "60151478-4709-4242-a8c1-a944252d194b"
+	id2 := "33c00a52-72ab-4b61-a7ce-43107588835b"
+	type args struct {
+		nodeInfo *device.NodeInfo
+	}
+	tests := []struct {
+		name string
+		args args
+		want []device.DeviceInfo
+	}{
+		{
+			name: "Test remove one device",
+			args: args{
+				nodeInfo: &device.NodeInfo{
+					Node:    &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: id1}}},
+					Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
+				},
+			},
+			want: []device.DeviceInfo{},
+		},
+		{
+			name: "Test remove two devices",
+			args: args{
+				nodeInfo: &device.NodeInfo{
+					Node:    &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: strings.Join([]string{id1, id2}, ",")}}},
+					Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}, {DeviceVendor: nvidia.NvidiaGPUDevice, ID: id2}},
+				},
+			},
+			want: []device.DeviceInfo{},
+		},
+		{
+			name: "Test remove one device and keep one device",
+			args: args{
+				nodeInfo: &device.NodeInfo{
+					Node:    &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: strings.Join([]string{id2}, ",")}}},
+					Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}, {DeviceVendor: nvidia.NvidiaGPUDevice, ID: id2}},
+				},
+			},
+			want: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
+		},
+		{
+			name: "Test remove devices listed with surrounding whitespace",
+			args: args{
+				nodeInfo: &device.NodeInfo{
+					Node:    &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: " " + id1 + " , " + id2}}},
+					Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}, {DeviceVendor: nvidia.NvidiaGPUDevice, ID: id2}},
+				},
+			},
+			want: []device.DeviceInfo{},
+		},
+		{
+			name: "Test no removing device, case1",
+			args: args{
+				nodeInfo: &device.NodeInfo{
+					Node:    &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{"test-key": ""}}},
+					Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
+				},
+			},
+			want: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
+		},
+		{
+			name: "Test no removing device, case2",
+			args: args{
+				nodeInfo: &device.NodeInfo{
+					Node:    &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{nvidia.GPUNoUseUUID: id2}}},
+					Devices: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
+				},
+			},
+			want: []device.DeviceInfo{{DeviceVendor: nvidia.NvidiaGPUDevice, ID: id1}},
+		},
+		{
+			name: "Test removing metax device, case1",
+			args: args{
+				nodeInfo: &device.NodeInfo{
+					Node:    &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{metax.MetaxNoUseUUID: id1}}},
+					Devices: []device.DeviceInfo{{DeviceVendor: metax.MetaxGPUDevice, ID: id1}},
+				},
+			},
+			want: []device.DeviceInfo{},
+		},
+		{
+			name: "Test removing metax device, case2",
+			args: args{
+				nodeInfo: &device.NodeInfo{
+					Node:    &corev1.Node{ObjectMeta: metav1.ObjectMeta{Annotations: map[string]string{metax.MetaxNoUseUUID: id1}}},
+					Devices: []device.DeviceInfo{{DeviceVendor: metax.MetaxSGPUDevice, ID: id1}},
+				},
+			},
+			want: []device.DeviceInfo{},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := rmDeviceByNodeAnnotation(tt.args.nodeInfo); !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("rmDeviceByNodeAnnotation() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}