@@ -43,7 +43,6 @@ import (
 	cputopology "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
 	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
-	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
 	"k8s.io/kubernetes/pkg/kubelet/config"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 	"k8s.io/kubernetes/pkg/kubelet/metrics"
@@ -658,49 +657,107 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 		return nil, nil
 	}
 	klog.V(3).Infof("Needs to allocate %d %q for pod %q container %q", needed, resource, podUID, contName)
-	// Needs to allocate additional devices.
+	// Check if resource registered with devicemanager
 	if _, ok := m.healthyDevices[resource]; !ok {
 		return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
 	}
-	devices = sets.NewString()
-	// Allocates from reusableDevices list first.
-	for device := range reusableDevices {
-		devices.Insert(device)
-		needed--
-		if needed == 0 {
-			return devices, nil
+
+	// Declare the list of allocated devices.
+	// This will be populated and returned below.
+	allocated := sets.NewString()
+
+	// Create a closure to help with device allocation
+	// Returns 'true' once no more devices need to be allocated.
+	allocateRemainingFrom := func(devices sets.String) bool {
+		for device := range devices.Difference(allocated) {
+			m.allocatedDevices[resource].Insert(device)
+			allocated.Insert(device)
+			needed--
+			if needed == 0 {
+				return true
+			}
 		}
+		return false
 	}
+
+	// Allocates from reusableDevices list first.
+	if allocateRemainingFrom(reusableDevices) {
+		return allocated, nil
+	}
+
 	// Needs to allocate additional devices.
 	if m.allocatedDevices[resource] == nil {
 		m.allocatedDevices[resource] = sets.NewString()
 	}
+
 	// Gets Devices in use.
 	devicesInUse := m.allocatedDevices[resource]
-	// Gets a list of available devices.
+	// Gets Available devices.
 	available := m.healthyDevices[resource].Difference(devicesInUse)
 	if available.Len() < needed {
 		return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
 	}
-	// By default, pull devices from the unsorted list of available devices.
-	allocated := available.UnsortedList()[:needed]
-	// If topology alignment is desired, update allocated to the set of devices
-	// with the best alignment.
-	hint := m.topologyAffinityStore.GetAffinity(podUID, contName)
-	if m.deviceHasTopologyAlignment(resource) && hint.NUMANodeAffinity != nil {
-		allocated = m.takeByTopology(resource, available, hint.NUMANodeAffinity, needed)
+
+	// Filters available Devices based on NUMA affinity.
+	aligned, unaligned, noAffinity := m.filterByAffinity(podUID, contName, resource, available)
+
+	// If we can allocate all remaining devices from the set of aligned ones, then
+	// give the plugin the chance to influence which ones to allocate from that set.
+	if needed < aligned.Len() {
+		// First allocate from the preferred devices list (if available).
+		preferred, err := m.callGetPreferredAllocationIfAvailable(podUID, contName, resource, aligned.Union(allocated), allocated, required)
+		if err != nil {
+			return nil, err
+		}
+		if allocateRemainingFrom(preferred.Intersection(aligned.Union(allocated))) {
+			return allocated, nil
+		}
+		// Then fallback to allocate from the aligned set if no preferred list
+		// is returned (or not enough devices are returned in that list).
+		if allocateRemainingFrom(aligned) {
+			return allocated, nil
+		}
+
+		return nil, fmt.Errorf("unexpectedly allocated less resources than required. Requested: %d, Got: %d", required, required-needed)
+	}
+
+	// If we can't allocate all remaining devices from the set of aligned ones,
+	// then start by first allocating all of the aligned devices (to ensure
+	// that the alignment guaranteed by the TopologyManager is honored).
+	if allocateRemainingFrom(aligned) {
+		return allocated, nil
+	}
+
+	// Then give the plugin the chance to influence the decision on any
+	// remaining devices to allocate.
+	preferred, err := m.callGetPreferredAllocationIfAvailable(podUID, contName, resource, available.Union(allocated), allocated, required)
+	if err != nil {
+		return nil, err
+	}
+	if allocateRemainingFrom(preferred.Intersection(available.Union(allocated))) {
+		return allocated, nil
 	}
-	// Updates m.allocatedDevices with allocated devices to prevent them
-	// from being allocated to other pods/containers, given that we are
-	// not holding lock during the rpc call.
-	for _, device := range allocated {
-		m.allocatedDevices[resource].Insert(device)
-		devices.Insert(device)
+
+	// Finally, if the plugin did not return a preferred allocation (or didn't
+	// return a large enough one), then fall back to allocating the remaining
+	// devices from the 'unaligned' and 'noAffinity' sets.
+	if allocateRemainingFrom(unaligned) {
+		return allocated, nil
+	}
+	if allocateRemainingFrom(noAffinity) {
+		return allocated, nil
 	}
-	return devices, nil
+
+	return nil, fmt.Errorf("unexpectedly allocated less resources than required. Requested: %d, Got: %d", required, required-needed)
 }
 
-func (m *ManagerImpl) takeByTopology(resource string, available sets.String, affinity bitmask.BitMask, request int) []string {
+func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, available sets.String) (sets.String, sets.String, sets.String) {
+	// If alignment information is not available, just pass the available list back.
+	hint := m.topologyAffinityStore.GetAffinity(podUID, contName)
+	if !m.deviceHasTopologyAlignment(resource) || hint.NUMANodeAffinity == nil {
+		return sets.NewString(), sets.NewString(), available
+	}
+
 	// Build a map of NUMA Nodes to the devices associated with them. A
 	// device may be associated to multiple NUMA nodes at the same time. If an
 	// available device does not have any NUMA Nodes associated with it, add it
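
To see the allocation strategy above in isolation: the sketch below is not the kubelet code; `allocate` and the pool names are hypothetical. It drains candidate pools in priority order with the same closure pattern, assuming only `k8s.io/apimachinery/pkg/util/sets`.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// allocate drains up to 'required' devices from the candidate pools in
// priority order, returning an error if all pools together fall short.
func allocate(required int, pools ...sets.String) (sets.String, error) {
	needed := required
	allocated := sets.NewString()
	allocateRemainingFrom := func(devices sets.String) bool {
		// Skip devices already taken from a higher-priority pool.
		for device := range devices.Difference(allocated) {
			allocated.Insert(device)
			needed--
			if needed == 0 {
				return true
			}
		}
		return false
	}
	for _, pool := range pools {
		if allocateRemainingFrom(pool) {
			return allocated, nil
		}
	}
	return nil, fmt.Errorf("requested: %d, got: %d", required, required-needed)
}

func main() {
	reusable := sets.NewString("dev0")
	aligned := sets.NewString("dev1", "dev2")
	unaligned := sets.NewString("dev3")
	devices, err := allocate(3, reusable, aligned, unaligned)
	fmt.Println(devices.List(), err) // [dev0 dev1 dev2] <nil>
}
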
@@ -754,7 +811,7 @@ func (m *ManagerImpl) takeByTopology(resource string, available sets.String, aff
 			if perNodeDevices[n].Has(d) {
 				if n == nodeWithoutTopology {
 					withoutTopology = append(withoutTopology, d)
-				} else if affinity.IsSet(n) {
+				} else if hint.NUMANodeAffinity.IsSet(n) {
 					fromAffinity = append(fromAffinity, d)
 				} else {
 					notFromAffinity = append(notFromAffinity, d)
@@ -764,8 +821,8 @@ func (m *ManagerImpl) takeByTopology(resource string, available sets.String, aff
 		}
 	}
 
-	// Concatenate the lists above return the first 'request' devices from it.
-	return append(append(fromAffinity, notFromAffinity...), withoutTopology...)[:request]
+	// Return all three lists containing the full set of devices across them.
+	return sets.NewString(fromAffinity...), sets.NewString(notFromAffinity...), sets.NewString(withoutTopology...)
 }
 
 // allocateContainerResources attempts to allocate all of required device
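
The three-way split that `filterByAffinity` now returns can be pictured with a toy partitioner; `partitionByAffinity`, `deviceNUMA`, and the device names below are hypothetical stand-ins, not the kubelet's actual types.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// partitionByAffinity splits 'available' into devices on NUMA nodes the
// affinity hint covers, devices on other nodes, and devices that report
// no topology at all (mirroring filterByAffinity's three return values).
func partitionByAffinity(available sets.String, deviceNUMA map[string][]int, affinity sets.Int) (aligned, unaligned, noAffinity sets.String) {
	aligned, unaligned, noAffinity = sets.NewString(), sets.NewString(), sets.NewString()
	for device := range available {
		nodes := deviceNUMA[device]
		if len(nodes) == 0 {
			noAffinity.Insert(device)
			continue
		}
		matched := false
		for _, n := range nodes {
			if affinity.Has(n) {
				matched = true
				break
			}
		}
		if matched {
			aligned.Insert(device)
		} else {
			unaligned.Insert(device)
		}
	}
	return aligned, unaligned, noAffinity
}

func main() {
	available := sets.NewString("gpu0", "gpu1", "gpu2")
	deviceNUMA := map[string][]int{"gpu0": {0}, "gpu1": {1}} // gpu2 reports no NUMA node
	a, u, n := partitionByAffinity(available, deviceNUMA, sets.NewInt(0))
	fmt.Println(a.List(), u.List(), n.List()) // [gpu0] [gpu1] [gpu2]
}
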
@@ -920,6 +977,30 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s
 	return nil
 }
 
+// callGetPreferredAllocationIfAvailable issues GetPreferredAllocation grpc
+// call for device plugin resource with GetPreferredAllocationAvailable option set.
+func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, resource string, available, mustInclude sets.String, size int) (sets.String, error) {
+	eI, ok := m.endpoints[resource]
+	if !ok {
+		return nil, fmt.Errorf("endpoint not found in cache for a registered resource: %s", resource)
+	}
+
+	if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable {
+		klog.V(4).Infof("Plugin options indicate to skip GetPreferredAllocation for resource: %s", resource)
+		return nil, nil
+	}
+
+	m.mutex.Unlock()
+	klog.V(4).Infof("Issuing a GetPreferredAllocation call for container, %s, of pod %s", contName, podUID)
+	resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size)
+	m.mutex.Lock()
+	if err != nil {
+		return nil, fmt.Errorf("device plugin GetPreferredAllocation rpc failed with err: %v", err)
+	}
+	// TODO: Add metrics support for init RPC
+	return sets.NewString(resp.ContainerResponses[0].DeviceIDs...), nil
+}
+
 // sanitizeNodeAllocatable scans through allocatedDevices in the device manager
 // and if necessary, updates allocatableResource in nodeInfo to at least equal to
 // the allocated capacity. This allows pods that have already been scheduled on
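
On the other side of this RPC, a device plugin opts in by setting GetPreferredAllocationAvailable in its DevicePluginOptions and implementing the handler. A minimal sketch follows, assuming the v1beta1 device plugin API types; the handler type and its "lowest IDs first" preference policy are purely illustrative.

package plugin

import (
	"context"
	"sort"

	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

type stubPlugin struct{}

// GetPreferredAllocation honors MustIncludeDeviceIDs, then tops the set up
// from AvailableDeviceIDs in sorted order until AllocationSize is reached.
func (p *stubPlugin) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
	response := &pluginapi.PreferredAllocationResponse{}
	for _, req := range r.ContainerRequests {
		ids := append([]string{}, req.MustIncludeDeviceIDs...)
		taken := map[string]bool{}
		for _, id := range ids {
			taken[id] = true
		}
		candidates := []string{}
		for _, id := range req.AvailableDeviceIDs {
			if !taken[id] {
				candidates = append(candidates, id)
			}
		}
		sort.Strings(candidates)
		for _, id := range candidates {
			if int32(len(ids)) >= req.AllocationSize {
				break
			}
			ids = append(ids, id)
		}
		response.ContainerResponses = append(response.ContainerResponses,
			&pluginapi.ContainerPreferredAllocationResponse{DeviceIDs: ids})
	}
	return response, nil
}

Note that the kubelet treats the returned list strictly as a hint: as the diff above shows, it intersects the response with the aligned or available set and falls back to its own allocation order if the plugin returns too few devices.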