Skip to content

Commit 0834feb

Browse files
authored
Merge pull request #752 from ffromani/nrt-hostlevel-redo
[noderesourcetopology] rewrite accounting of numa-affine resources with scope=container
2 parents df958ed + 13e9bc7 commit 0834feb

File tree

10 files changed

+685
-225
lines changed

10 files changed

+685
-225
lines changed

pkg/noderesourcetopology/filter.go

Lines changed: 19 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ import (
2020
"context"
2121

2222
v1 "k8s.io/api/core/v1"
23-
"k8s.io/apimachinery/pkg/api/resource"
2423
"k8s.io/klog/v2"
2524
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
2625
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
@@ -55,32 +54,38 @@ func singleNUMAContainerLevelHandler(lh logr.Logger, pod *v1.Pod, zones topology
5554
// https://kubernetes.io/docs/concepts/workloads/pods/init-containers/#understanding-init-containers
5655
// therefore, we don't need to accumulate their resources together
5756
for _, initContainer := range pod.Spec.InitContainers {
58-
lh.V(6).Info("init container desired resources", stringify.ResourceListToLoggable(initContainer.Resources.Requests)...)
57+
// TODO: handle sidecar explicitly (new kind)
58+
clh := lh.WithValues(logging.KeyContainer, initContainer.Name, logging.KeyContainerKind, logging.KindContainerInit)
59+
clh.V(6).Info("desired resources", stringify.ResourceListToLoggable(initContainer.Resources.Requests)...)
5960

60-
_, match := resourcesAvailableInAnyNUMANodes(lh, nodes, initContainer.Resources.Requests, qos, nodeInfo)
61+
_, match := resourcesAvailableInAnyNUMANodes(clh, nodes, initContainer.Resources.Requests, qos, nodeInfo)
6162
if !match {
6263
// we can't align init container, so definitely we can't align a pod
63-
lh.V(2).Info("cannot align container", "name", initContainer.Name, "kind", "init")
64+
clh.V(2).Info("cannot align container")
6465
return framework.NewStatus(framework.Unschedulable, "cannot align init container")
6566
}
6667
}
6768

6869
for _, container := range pod.Spec.Containers {
69-
// TODO: add containerName
70-
lh.V(6).Info("app container resources", stringify.ResourceListToLoggable(container.Resources.Requests)...)
70+
clh := lh.WithValues(logging.KeyContainer, container.Name, logging.KeyContainerKind, logging.KindContainerApp)
71+
clh.V(6).Info("container requests", stringify.ResourceListToLoggable(container.Resources.Requests)...)
7172

72-
numaID, match := resourcesAvailableInAnyNUMANodes(lh, nodes, container.Resources.Requests, qos, nodeInfo)
73+
numaID, match := resourcesAvailableInAnyNUMANodes(clh, nodes, container.Resources.Requests, qos, nodeInfo)
7374
if !match {
7475
// we can't align container, so definitely we can't align a pod
75-
lh.V(2).Info("cannot align container", "name", container.Name, "kind", "app")
76+
clh.V(2).Info("cannot align container")
7677
return framework.NewStatus(framework.Unschedulable, "cannot align container")
7778
}
7879

7980
// subtract the resources requested by the container from the given NUMA.
8081
// this is necessary, so we won't allocate the same resources for the upcoming containers
81-
subtractFromNUMA(lh, nodes, numaID, container)
82+
err := subtractResourcesFromNUMANodeList(clh, nodes, numaID, qos, container.Resources.Requests)
83+
if err != nil {
84+
// this is an internal error which should never happen
85+
return framework.NewStatus(framework.Error, "inconsistent resource accounting", err.Error())
86+
}
87+
clh.V(4).Info("container aligned", "numaCell", numaID)
8288
}
83-
lh.V(2).Info("can align all containers")
8489
return nil
8590
}
8691

@@ -150,17 +155,10 @@ func resourcesAvailableInAnyNUMANodes(lh logr.Logger, numaNodes NUMANodeList, re
150155

151156
// at least one NUMA node is available
152157
ret := !bitmask.IsEmpty()
153-
lh.V(2).Info("final verdict", "suitable", ret)
158+
lh.V(2).Info("final verdict", "suitable", ret, "numaCell", numaID)
154159
return numaID, ret
155160
}
156161

157-
func isResourceSetSuitable(qos v1.PodQOSClass, resource v1.ResourceName, quantity, numaQuantity resource.Quantity) bool {
158-
if qos != v1.PodQOSGuaranteed && isNUMAAffineResource(resource) {
159-
return true
160-
}
161-
return numaQuantity.Cmp(quantity) >= 0
162-
}
163-
164162
func singleNUMAPodLevelHandler(lh logr.Logger, pod *v1.Pod, zones topologyv1alpha2.ZoneList, nodeInfo *framework.NodeInfo) *framework.Status {
165163
lh.V(5).Info("pod level single NUMA node handler")
166164

@@ -172,11 +170,12 @@ func singleNUMAPodLevelHandler(lh logr.Logger, pod *v1.Pod, zones topologyv1alph
172170
logNumaNodes(lh, "pod handler NUMA resources", nodeInfo.Node().Name, nodes)
173171
lh.V(6).Info("pod desired resources", stringify.ResourceListToLoggable(resources)...)
174172

175-
if _, match := resourcesAvailableInAnyNUMANodes(lh, createNUMANodeList(lh, zones), resources, v1qos.GetPodQOS(pod), nodeInfo); !match {
173+
numaID, match := resourcesAvailableInAnyNUMANodes(lh, createNUMANodeList(lh, zones), resources, v1qos.GetPodQOS(pod), nodeInfo)
174+
if !match {
176175
lh.V(2).Info("cannot align pod", "name", pod.Name)
177176
return framework.NewStatus(framework.Unschedulable, "cannot align pod")
178177
}
179-
lh.V(2).Info("can align pod")
178+
lh.V(4).Info("all container placed", "numaCell", numaID)
180179
return nil
181180
}
182181

@@ -218,28 +217,6 @@ func (tm *TopologyMatch) Filter(ctx context.Context, cycleState *framework.Cycle
218217
return status
219218
}
220219

221-
// subtractFromNUMA finds the correct NUMA ID's resources and subtract them from `nodes`.
222-
func subtractFromNUMA(lh logr.Logger, nodes NUMANodeList, numaID int, container v1.Container) {
223-
for i := 0; i < len(nodes); i++ {
224-
if nodes[i].NUMAID != numaID {
225-
continue
226-
}
227-
228-
nRes := nodes[i].Resources
229-
for resName, quan := range container.Resources.Requests {
230-
nodeResQuan := nRes[resName]
231-
nodeResQuan.Sub(quan)
232-
// we do not expect a negative value here, since this function only called
233-
// when resourcesAvailableInAnyNUMANodes function is passed
234-
// but let's log here if such unlikely case will occur
235-
if nodeResQuan.Sign() == -1 {
236-
lh.V(4).Info("resource quantity should not be a negative value", "resource", resName, "quantity", nodeResQuan.String())
237-
}
238-
nRes[resName] = nodeResQuan
239-
}
240-
}
241-
}
242-
243220
func filterHandlerFromTopologyManagerConfig(conf TopologyManagerConfig) filterFn {
244221
if conf.Policy != kubeletconfig.SingleNumaNodeTopologyManagerPolicy {
245222
return nil

pkg/noderesourcetopology/logging/logging.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,13 @@ import (
2626

2727
// well-known structured log keys
2828
const (
29-
KeyLogID string = "logID"
30-
KeyPod string = "pod"
31-
KeyPodUID string = "podUID"
32-
KeyNode string = "node"
33-
KeyFlow string = "flow"
29+
KeyLogID string = "logID"
30+
KeyPod string = "pod"
31+
KeyPodUID string = "podUID"
32+
KeyNode string = "node"
33+
KeyFlow string = "flow"
34+
KeyContainer string = "container"
35+
KeyContainerKind string = "kind"
3436
)
3537

3638
const (
@@ -42,6 +44,11 @@ const (
4244
FlowCacheSync string = "resync"
4345
)
4446

47+
const (
48+
KindContainerInit string = "init"
49+
KindContainerApp string = "app"
50+
)
51+
4552
const (
4653
SubsystemForeignPods string = "foreignpods"
4754
SubsystemNRTCache string = "nrtcache"

pkg/noderesourcetopology/numaresources.go

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,91 @@ limitations under the License.
1717
package noderesourcetopology
1818

1919
import (
20+
"fmt"
21+
"reflect"
22+
23+
"github.com/go-logr/logr"
24+
2025
corev1 "k8s.io/api/core/v1"
26+
"k8s.io/apimachinery/pkg/api/resource"
2127
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
28+
29+
"sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/stringify"
2230
)
2331

32+
type NUMANode struct {
33+
NUMAID int
34+
Resources corev1.ResourceList
35+
Costs map[int]int
36+
}
37+
38+
func (n *NUMANode) WithCosts(costs map[int]int) *NUMANode {
39+
n.Costs = costs
40+
return n
41+
}
42+
43+
func (n NUMANode) DeepCopy() NUMANode {
44+
ret := NUMANode{
45+
NUMAID: n.NUMAID,
46+
Resources: n.Resources.DeepCopy(),
47+
}
48+
if len(n.Costs) > 0 {
49+
ret.Costs = make(map[int]int)
50+
for key, val := range n.Costs {
51+
ret.Costs[key] = val
52+
}
53+
}
54+
return ret
55+
}
56+
57+
func (n NUMANode) Equal(o NUMANode) bool {
58+
if n.NUMAID != o.NUMAID {
59+
return false
60+
}
61+
if !reflect.DeepEqual(n.Costs, o.Costs) {
62+
return false
63+
}
64+
return equalResourceList(n.Resources, o.Resources)
65+
}
66+
67+
func equalResourceList(ra, rb corev1.ResourceList) bool {
68+
if len(ra) != len(rb) {
69+
return false
70+
}
71+
for key, valA := range ra {
72+
valB, ok := rb[key]
73+
if !ok {
74+
return false
75+
}
76+
if !valA.Equal(valB) {
77+
return false
78+
}
79+
}
80+
return true
81+
}
82+
83+
type NUMANodeList []NUMANode
84+
85+
func (nnl NUMANodeList) DeepCopy() NUMANodeList {
86+
ret := make(NUMANodeList, 0, len(nnl))
87+
for idx := 0; idx < len(nnl); idx++ {
88+
ret = append(ret, nnl[idx].DeepCopy())
89+
}
90+
return ret
91+
}
92+
93+
func (nnl NUMANodeList) Equal(oth NUMANodeList) bool {
94+
if len(nnl) != len(oth) {
95+
return false
96+
}
97+
for idx := 0; idx < len(nnl); idx++ {
98+
if !nnl[idx].Equal(oth[idx]) {
99+
return false
100+
}
101+
}
102+
return true
103+
}
104+
24105
func isHostLevelResource(resource corev1.ResourceName) bool {
25106
// host-level resources are resources which *may* not be bound to NUMA nodes.
26107
// A Key example is generic [ephemeral] storage which doesn't expose NUMA affinity.
@@ -52,3 +133,83 @@ func isNUMAAffineResource(resource corev1.ResourceName) bool {
52133
// We can't tell for sure, so we default to "no".
53134
return false
54135
}
136+
137+
func isResourceSetSuitable(qos corev1.PodQOSClass, resource corev1.ResourceName, quantity, numaQuantity resource.Quantity) bool {
138+
if qos != corev1.PodQOSGuaranteed && isNUMAAffineResource(resource) {
139+
return true
140+
}
141+
return numaQuantity.Cmp(quantity) >= 0
142+
}
143+
144+
// subtractResourcesFromNUMANodeList finds the correct NUMA ID's resources and always subtract them from `nodes` in-place.
145+
func subtractResourcesFromNUMANodeList(lh logr.Logger, nodes NUMANodeList, numaID int, qos corev1.PodQOSClass, containerRes corev1.ResourceList) error {
146+
logEntries := []any{"numaCell", numaID}
147+
148+
for _, node := range nodes {
149+
if node.NUMAID != numaID {
150+
continue
151+
}
152+
153+
lh.V(5).Info("NUMA resources before", append(logEntries, stringify.ResourceListToLoggable(node.Resources)...)...)
154+
155+
for resName, resQty := range containerRes {
156+
isAffine := isNUMAAffineResource(resName)
157+
if qos != corev1.PodQOSGuaranteed && isAffine {
158+
lh.V(4).Info("ignoring QoS-depending exclusive request", "resource", resName, "QoS", qos)
159+
continue
160+
}
161+
if resQty.IsZero() {
162+
lh.V(4).Info("ignoring zero-valued request", "resource", resName)
163+
continue
164+
}
165+
nResQ, ok := node.Resources[resName]
166+
if !ok {
167+
lh.V(4).Info("ignoring missing resource", "resource", resName, "affine", isAffine, "request", resQty.String())
168+
continue
169+
}
170+
nodeResQty := nResQ.DeepCopy()
171+
nodeResQty.Sub(resQty)
172+
if nodeResQty.Sign() < 0 {
173+
lh.V(1).Info("resource quantity should not be a negative value", "numaCell", numaID, "resource", resName, "quantity", nResQ.String(), "request", resQty.String())
174+
return fmt.Errorf("resource %q request %s exceeds NUMA %d availability %s", string(resName), resQty.String(), numaID, nResQ.String())
175+
}
176+
node.Resources[resName] = nodeResQty
177+
}
178+
179+
lh.V(5).Info("NUMA resources after", append(logEntries, stringify.ResourceListToLoggable(node.Resources)...)...)
180+
}
181+
return nil
182+
}
183+
184+
func subtractFromNUMAs(resources corev1.ResourceList, numaNodes NUMANodeList, nodes ...int) {
185+
for resName, quantity := range resources {
186+
for _, node := range nodes {
187+
// quantity is zero no need to iterate through another NUMA node, go to another resource
188+
if quantity.IsZero() {
189+
break
190+
}
191+
192+
nRes := numaNodes[node].Resources
193+
if available, ok := nRes[resName]; ok {
194+
switch quantity.Cmp(available) {
195+
case 0: // the same
196+
// basically zero container resources
197+
quantity.Sub(available)
198+
// zero NUMA quantity
199+
nRes[resName] = resource.Quantity{}
200+
case 1: // container wants more resources than available in this NUMA zone
201+
// substract NUMA resources from container request, to calculate how much is missing
202+
quantity.Sub(available)
203+
// zero NUMA quantity
204+
nRes[resName] = resource.Quantity{}
205+
case -1: // there are more resources available in this NUMA zone than container requests
206+
// substract container resources from resources available in this NUMA node
207+
available.Sub(quantity)
208+
// zero container quantity
209+
quantity = resource.Quantity{}
210+
nRes[resName] = available
211+
}
212+
}
213+
}
214+
}
215+
}

0 commit comments

Comments
 (0)