Skip to content

Commit e9b8aa4

Browse files
committed
nrt: rewrite resource accounting for scope=container
Rewrite the accounting of NUMA-local resources when using scope=container. The previous code was too lenient and worked mostly by side effects when dealing with non-NUMA affine resources. A non-NUMA affine resource (aka a hostlevel resource) is a resource which is not guaranteed to always have a NUMA affinity. CPU and memory (incl. hugepages) always do, but devices may or may not, both options are legal for device plugins. Similarly, ephemeral storage is a prominent example of resource which should never have a NUMA affinity. The accounting in this case was wrong because previously the resource was considered NUMA affine. Note: it's legal to configure topology updaters (e.g. NFD) to not advertise CPU and memory in NRT objects. Thus is best to treat lack of them as warnings, not as blocking errors. However if the per-NUMA affine counters go negative this is definitely an error condition we need to detect and be very loud about it. Signed-off-by: Francesco Romani <[email protected]>
1 parent e3388b9 commit e9b8aa4

File tree

4 files changed

+403
-46
lines changed

4 files changed

+403
-46
lines changed

pkg/noderesourcetopology/filter.go

Lines changed: 7 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ import (
2020
"context"
2121

2222
v1 "k8s.io/api/core/v1"
23-
"k8s.io/apimachinery/pkg/api/resource"
2423
"k8s.io/klog/v2"
2524
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
2625
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
@@ -69,7 +68,7 @@ func singleNUMAContainerLevelHandler(lh logr.Logger, pod *v1.Pod, zones topology
6968

7069
for _, container := range pod.Spec.Containers {
7170
clh := lh.WithValues(logging.KeyContainer, container.Name, logging.KeyContainerKind, logging.KindContainerApp)
72-
clh.V(6).Info("app container resources", stringify.ResourceListToLoggable(container.Resources.Requests)...)
71+
clh.V(6).Info("container requests", stringify.ResourceListToLoggable(container.Resources.Requests)...)
7372

7473
numaID, match := resourcesAvailableInAnyNUMANodes(clh, nodes, container.Resources.Requests, qos, nodeInfo)
7574
if !match {
@@ -80,8 +79,12 @@ func singleNUMAContainerLevelHandler(lh logr.Logger, pod *v1.Pod, zones topology
8079

8180
// subtract the resources requested by the container from the given NUMA.
8281
// this is necessary, so we won't allocate the same resources for the upcoming containers
83-
subtractFromNUMA(clh, nodes, numaID, container)
84-
clh.V(4).Info("app container placed", "numaCell", numaID)
82+
err := subtractResourcesFromNUMANodeList(clh, nodes, numaID, qos, container.Resources.Requests)
83+
if err != nil {
84+
// this is an internal error which should never happen
85+
return framework.NewStatus(framework.Error, "inconsistent resource accounting", err.Error())
86+
}
87+
clh.V(4).Info("container aligned", "numaCell", numaID)
8588
}
8689
return nil
8790
}
@@ -156,13 +159,6 @@ func resourcesAvailableInAnyNUMANodes(lh logr.Logger, numaNodes NUMANodeList, re
156159
return numaID, ret
157160
}
158161

159-
func isResourceSetSuitable(qos v1.PodQOSClass, resource v1.ResourceName, quantity, numaQuantity resource.Quantity) bool {
160-
if qos != v1.PodQOSGuaranteed && isNUMAAffineResource(resource) {
161-
return true
162-
}
163-
return numaQuantity.Cmp(quantity) >= 0
164-
}
165-
166162
func singleNUMAPodLevelHandler(lh logr.Logger, pod *v1.Pod, zones topologyv1alpha2.ZoneList, nodeInfo *framework.NodeInfo) *framework.Status {
167163
lh.V(5).Info("pod level single NUMA node handler")
168164

@@ -221,28 +217,6 @@ func (tm *TopologyMatch) Filter(ctx context.Context, cycleState *framework.Cycle
221217
return status
222218
}
223219

224-
// subtractFromNUMA finds the correct NUMA ID's resources and subtract them from `nodes`.
225-
func subtractFromNUMA(lh logr.Logger, nodes NUMANodeList, numaID int, container v1.Container) {
226-
for i := 0; i < len(nodes); i++ {
227-
if nodes[i].NUMAID != numaID {
228-
continue
229-
}
230-
231-
nRes := nodes[i].Resources
232-
for resName, quan := range container.Resources.Requests {
233-
nodeResQuan := nRes[resName]
234-
nodeResQuan.Sub(quan)
235-
// we do not expect a negative value here, since this function only called
236-
// when resourcesAvailableInAnyNUMANodes function is passed
237-
// but let's log here if such unlikely case will occur
238-
if nodeResQuan.Sign() == -1 {
239-
lh.V(4).Info("resource quantity should not be a negative value", "resource", resName, "quantity", nodeResQuan.String())
240-
}
241-
nRes[resName] = nodeResQuan
242-
}
243-
}
244-
}
245-
246220
func filterHandlerFromTopologyManagerConfig(conf TopologyManagerConfig) filterFn {
247221
if conf.Policy != kubeletconfig.SingleNumaNodeTopologyManagerPolicy {
248222
return nil

pkg/noderesourcetopology/numaresources.go

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,89 @@ limitations under the License.
1717
package noderesourcetopology
1818

1919
import (
20+
"fmt"
21+
"reflect"
22+
23+
"github.com/go-logr/logr"
2024
corev1 "k8s.io/api/core/v1"
25+
"k8s.io/apimachinery/pkg/api/resource"
2126
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
27+
"sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/stringify"
2228
)
2329

30+
type NUMANode struct {
31+
NUMAID int
32+
Resources corev1.ResourceList
33+
Costs map[int]int
34+
}
35+
36+
func (n *NUMANode) WithCosts(costs map[int]int) *NUMANode {
37+
n.Costs = costs
38+
return n
39+
}
40+
41+
func (n NUMANode) DeepCopy() NUMANode {
42+
ret := NUMANode{
43+
NUMAID: n.NUMAID,
44+
Resources: n.Resources.DeepCopy(),
45+
}
46+
if len(n.Costs) > 0 {
47+
ret.Costs = make(map[int]int)
48+
for key, val := range n.Costs {
49+
ret.Costs[key] = val
50+
}
51+
}
52+
return ret
53+
}
54+
55+
func (n NUMANode) Equal(o NUMANode) bool {
56+
if n.NUMAID != o.NUMAID {
57+
return false
58+
}
59+
if !reflect.DeepEqual(n.Costs, o.Costs) {
60+
return false
61+
}
62+
return equalResourceList(n.Resources, o.Resources)
63+
}
64+
65+
func equalResourceList(ra, rb corev1.ResourceList) bool {
66+
if len(ra) != len(rb) {
67+
return false
68+
}
69+
for key, valA := range ra {
70+
valB, ok := rb[key]
71+
if !ok {
72+
return false
73+
}
74+
if !valA.Equal(valB) {
75+
return false
76+
}
77+
}
78+
return true
79+
}
80+
81+
type NUMANodeList []NUMANode
82+
83+
func (nnl NUMANodeList) DeepCopy() NUMANodeList {
84+
ret := make(NUMANodeList, 0, len(nnl))
85+
for idx := 0; idx < len(nnl); idx++ {
86+
ret = append(ret, nnl[idx].DeepCopy())
87+
}
88+
return ret
89+
}
90+
91+
func (nnl NUMANodeList) Equal(oth NUMANodeList) bool {
92+
if len(nnl) != len(oth) {
93+
return false
94+
}
95+
for idx := 0; idx < len(nnl); idx++ {
96+
if !nnl[idx].Equal(oth[idx]) {
97+
return false
98+
}
99+
}
100+
return true
101+
}
102+
24103
func isHostLevelResource(resource corev1.ResourceName) bool {
25104
// host-level resources are resources which *may* not be bound to NUMA nodes.
26105
// A Key example is generic [ephemeral] storage which doesn't expose NUMA affinity.
@@ -52,3 +131,50 @@ func isNUMAAffineResource(resource corev1.ResourceName) bool {
52131
// We can't tell for sure, so we default to "no".
53132
return false
54133
}
134+
135+
func isResourceSetSuitable(qos corev1.PodQOSClass, resource corev1.ResourceName, quantity, numaQuantity resource.Quantity) bool {
136+
if qos != corev1.PodQOSGuaranteed && isNUMAAffineResource(resource) {
137+
return true
138+
}
139+
return numaQuantity.Cmp(quantity) >= 0
140+
}
141+
142+
// subtractResourcesFromNUMANodeList finds the correct NUMA ID's resources and always subtract them from `nodes` in-place.
143+
func subtractResourcesFromNUMANodeList(lh logr.Logger, nodes NUMANodeList, numaID int, qos corev1.PodQOSClass, containerRes corev1.ResourceList) error {
144+
logEntries := []any{"numaCell", numaID}
145+
146+
for _, node := range nodes {
147+
if node.NUMAID != numaID {
148+
continue
149+
}
150+
151+
lh.V(5).Info("NUMA resources before", append(logEntries, stringify.ResourceListToLoggable(node.Resources)...)...)
152+
153+
for resName, resQty := range containerRes {
154+
isAffine := isNUMAAffineResource(resName)
155+
if qos != corev1.PodQOSGuaranteed && isAffine {
156+
lh.V(4).Info("ignoring QoS-depending exclusive request", "resource", resName, "QoS", qos)
157+
continue
158+
}
159+
if resQty.IsZero() {
160+
lh.V(4).Info("ignoring zero-valued request", "resource", resName)
161+
continue
162+
}
163+
nResQ, ok := node.Resources[resName]
164+
if !ok {
165+
lh.V(4).Info("ignoring missing resource", "resource", resName, "affine", isAffine, "request", resQty.String())
166+
continue
167+
}
168+
nodeResQty := nResQ.DeepCopy()
169+
nodeResQty.Sub(resQty)
170+
if nodeResQty.Sign() < 0 {
171+
lh.V(1).Info("resource quantity should not be a negative value", "numaCell", numaID, "resource", resName, "quantity", nResQ.String(), "request", resQty.String())
172+
return fmt.Errorf("resource %q request %s exceeds NUMA %d availability %s", string(resName), resQty.String(), numaID, nResQ.String())
173+
}
174+
node.Resources[resName] = nodeResQty
175+
}
176+
177+
lh.V(5).Info("NUMA resources after", append(logEntries, stringify.ResourceListToLoggable(node.Resources)...)...)
178+
}
179+
return nil
180+
}

0 commit comments

Comments
 (0)