
Commit d9728ed

Merge pull request #725 from ffromani/klog-contextual-logging
[noderesourcetopology] complete the contextual logging integration
2 parents: 521dd8e + 7098181

File tree: 13 files changed (+86 / -91 lines)

pkg/noderesourcetopology/cache/discardreserved.go
Lines changed: 3 additions & 3 deletions

@@ -78,7 +78,7 @@ func (pt *DiscardReserved) NodeMaybeOverReserved(nodeName string, pod *corev1.Po
 func (pt *DiscardReserved) NodeHasForeignPods(nodeName string, pod *corev1.Pod) {}
 
 func (pt *DiscardReserved) ReserveNodeResources(nodeName string, pod *corev1.Pod) {
-	pt.lh.V(5).Info("NRT Reserve", "logID", logging.PodLogID(pod), "podUID", pod.GetUID(), "node", nodeName)
+	pt.lh.V(5).Info("NRT Reserve", logging.KeyPod, klog.KObj(pod), logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
 	pt.rMutex.Lock()
 	defer pt.rMutex.Unlock()
 
@@ -89,14 +89,14 @@ func (pt *DiscardReserved) ReserveNodeResources(nodeName string, pod *corev1.Pod
 }
 
 func (pt *DiscardReserved) UnreserveNodeResources(nodeName string, pod *corev1.Pod) {
-	pt.lh.V(5).Info("NRT Unreserve", "logID", klog.KObj(pod), "podUID", pod.GetUID(), "node", nodeName)
+	pt.lh.V(5).Info("NRT Unreserve", logging.KeyPod, klog.KObj(pod), logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
 
 	pt.removeReservationForNode(nodeName, pod)
 }
 
 // PostBind is invoked to cleanup reservationMap
 func (pt *DiscardReserved) PostBind(nodeName string, pod *corev1.Pod) {
-	pt.lh.V(5).Info("NRT PostBind", "logID", klog.KObj(pod), "podUID", pod.GetUID(), "node", nodeName)
+	pt.lh.V(5).Info("NRT PostBind", logging.KeyPod, klog.KObj(pod), logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
 
 	pt.removeReservationForNode(nodeName, pod)
 }
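
Note on the pattern above: the free-form "logID" string built by logging.PodLogID is replaced by klog's structured key/value pairs, with klog.KObj(pod) rendered as namespace/name when the line is emitted. A minimal, self-contained sketch of the same call shape; the plain string keys stand in for the logging.KeyPod / logging.KeyPodUID / logging.KeyNode constants (whose exact values are not shown in this commit), and the pod is hypothetical:

```go
package main

import (
	"flag"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

func main() {
	// enable V(5) output for the demo
	klog.InitFlags(nil)
	_ = flag.Set("v", "5")
	flag.Parse()

	lh := klog.Background() // a logr.Logger, same shape as pt.lh in the plugin

	pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "demo", Name: "web-0", UID: "aaaa-bbbb"}}

	// klog.KObj(pod) is formatted lazily as "demo/web-0" when the line is emitted;
	// the keys below are assumed stand-ins for the logging.Key* constants.
	lh.V(5).Info("NRT Reserve", "pod", klog.KObj(pod), "podUID", pod.GetUID(), "node", "worker-0")
}
```

Because the object reference is only formatted when the line is actually emitted, disabled V-levels stay cheap even with the extra keys.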

pkg/noderesourcetopology/cache/foreign_pods.go
Lines changed: 2 additions & 1 deletion

@@ -23,6 +23,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
 	k8scache "k8s.io/client-go/tools/cache"
+	"k8s.io/klog/v2"
 
 	"sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/logging"
 	"sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/resourcerequests"
@@ -50,7 +51,7 @@ func SetupForeignPodsDetector(lh logr.Logger, schedProfileName string, podInform
 		}
 
 		cc.NodeHasForeignPods(pod.Spec.NodeName, pod)
-		lh.V(6).Info("detected foreign pods", "logID", logging.PodLogID(pod), "podUID", pod.GetUID(), "node", pod.Spec.NodeName)
+		lh.V(6).Info("detected foreign pods", logging.KeyPod, klog.KObj(pod), logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, pod.Spec.NodeName)
 	}
 
 	podInformer.AddEventHandler(k8scache.ResourceEventHandlerFuncs{
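
For context, the handler above hangs off a shared pod informer. A rough, self-contained sketch of that wiring against a fake clientset; setupDetector and its keys are illustrative stand-ins for SetupForeignPodsDetector, not the plugin's actual signature:

```go
package main

import (
	"context"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes/fake"
	k8scache "k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// setupDetector logs pods that already landed on a node, in the spirit of the diff above.
func setupDetector(lh klog.Logger, podInformer k8scache.SharedIndexInformer) {
	podInformer.AddEventHandler(k8scache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			pod, ok := obj.(*corev1.Pod)
			if !ok || pod.Spec.NodeName == "" {
				return
			}
			lh.V(6).Info("detected foreign pods", "pod", klog.KObj(pod), "podUID", pod.GetUID(), "node", pod.Spec.NodeName)
		},
	})
}

func main() {
	client := fake.NewSimpleClientset()
	factory := informers.NewSharedInformerFactory(client, 0)

	setupDetector(klog.Background(), factory.Core().V1().Pods().Informer())

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	factory.Start(ctx.Done())
	factory.WaitForCacheSync(ctx.Done())
}
```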

pkg/noderesourcetopology/cache/overreserve.go
Lines changed: 31 additions & 34 deletions

@@ -30,6 +30,7 @@ import (
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/types"
 	podlisterv1 "k8s.io/client-go/listers/core/v1"
+	"k8s.io/klog/v2"
 
 	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
 
@@ -64,12 +65,11 @@ func NewOverReserve(ctx context.Context, lh logr.Logger, cfg *apiconfig.NodeReso
 	resyncMethod := getCacheResyncMethod(lh, cfg)
 
 	nrtObjs := &topologyv1alpha2.NodeResourceTopologyList{}
-	// TODO: we should pass-in a context in the future
 	if err := client.List(ctx, nrtObjs); err != nil {
 		return nil, err
 	}
 
-	lh.V(3).Info("initializing", "noderesourcetopologies", len(nrtObjs.Items), "method", resyncMethod)
+	lh.V(2).Info("initializing", "noderesourcetopologies", len(nrtObjs.Items), "method", resyncMethod)
 	obj := &OverReserve{
 		lh:     lh,
 		client: client,
@@ -100,11 +100,11 @@ func (ov *OverReserve) GetCachedNRTCopy(ctx context.Context, nodeName string, po
 		return nrt, true
 	}
 
-	logID := logging.PodLogID(pod)
-	lh := ov.lh.WithValues("logID", logID, "podUID", pod.GetUID(), "node", nodeName)
+	logID := klog.KObj(pod)
+	lh := ov.lh.WithValues(logging.KeyPod, logID, logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
 
 	lh.V(6).Info("NRT", "fromcache", stringify.NodeResourceTopologyResources(nrt))
-	nodeAssumedResources.UpdateNRT(logID, nrt)
+	nodeAssumedResources.UpdateNRT(nrt, logging.KeyPod, logID)
 
 	lh.V(5).Info("NRT", "withassumed", stringify.NodeResourceTopologyResources(nrt))
 	return nrt, true
@@ -114,23 +114,23 @@ func (ov *OverReserve) NodeMaybeOverReserved(nodeName string, pod *corev1.Pod) {
 	ov.lock.Lock()
 	defer ov.lock.Unlock()
 	val := ov.nodesMaybeOverreserved.Incr(nodeName)
-	ov.lh.V(4).Info("mark discarded", "node", nodeName, "count", val)
+	ov.lh.V(4).Info("mark discarded", logging.KeyNode, nodeName, "count", val)
 }
 
 func (ov *OverReserve) NodeHasForeignPods(nodeName string, pod *corev1.Pod) {
-	lh := ov.lh.WithValues("logID", logging.PodLogID(pod), "podUID", pod.GetUID(), "node", nodeName)
+	lh := ov.lh.WithValues(logging.KeyPod, klog.KObj(pod), logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
 	ov.lock.Lock()
 	defer ov.lock.Unlock()
 	if !ov.nrts.Contains(nodeName) {
 		lh.V(5).Info("ignoring foreign pods", "nrtinfo", "missing")
 		return
 	}
 	val := ov.nodesWithForeignPods.Incr(nodeName)
-	lh.V(4).Info("marked with foreign pods", "count", val)
+	lh.V(2).Info("marked with foreign pods", logging.KeyNode, nodeName, "count", val)
 }
 
 func (ov *OverReserve) ReserveNodeResources(nodeName string, pod *corev1.Pod) {
-	lh := ov.lh.WithValues("logID", logging.PodLogID(pod), "podUID", pod.GetUID(), "node", nodeName)
+	lh := ov.lh.WithValues(logging.KeyPod, klog.KObj(pod), logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
 	ov.lock.Lock()
 	defer ov.lock.Unlock()
 	nodeAssumedResources, ok := ov.assumedResources[nodeName]
@@ -140,26 +140,26 @@ func (ov *OverReserve) ReserveNodeResources(nodeName string, pod *corev1.Pod) {
 	}
 
 	nodeAssumedResources.AddPod(pod)
-	lh.V(5).Info("post reserve", "assumedResources", nodeAssumedResources.String())
+	lh.V(2).Info("post reserve", logging.KeyNode, nodeName, "assumedResources", nodeAssumedResources.String())
 
 	ov.nodesMaybeOverreserved.Delete(nodeName)
-	lh.V(6).Info("reset discard counter")
+	lh.V(6).Info("reset discard counter", logging.KeyNode, nodeName)
 }
 
 func (ov *OverReserve) UnreserveNodeResources(nodeName string, pod *corev1.Pod) {
-	lh := ov.lh.WithValues("logID", logging.PodLogID(pod), "podUID", pod.GetUID(), "node", nodeName)
+	lh := ov.lh.WithValues(logging.KeyPod, klog.KObj(pod), logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
 	ov.lock.Lock()
 	defer ov.lock.Unlock()
 	nodeAssumedResources, ok := ov.assumedResources[nodeName]
 	if !ok {
 		// this should not happen, so we're vocal about it
 		// we don't return error because not much to do to recover anyway
-		lh.V(3).Info("no resources tracked")
+		lh.V(2).Info("no resources tracked", logging.KeyNode, nodeName)
 		return
 	}
 
 	nodeAssumedResources.DeletePod(pod)
-	lh.V(5).Info("post release", "assumedResources", nodeAssumedResources.String())
+	lh.V(2).Info("post unreserve", logging.KeyNode, nodeName, "assumedResources", nodeAssumedResources.String())
 }
 
 // NodesMaybeOverReserved returns a slice of all the node names which have been discarded previously,
@@ -201,81 +201,78 @@ func (ov *OverReserve) NodesMaybeOverReserved(lh logr.Logger) []string {
 // too aggressive resync attempts, so to more, likely unnecessary, computation work on the scheduler side.
 func (ov *OverReserve) Resync() {
 	// we are not working with a specific pod, so we need a unique key to track this flow
-	lh := ov.lh.WithValues("logID", logging.TimeLogID(), "flow", logging.FlowCacheSync)
-	lh.V(4).Info(logging.FlowBegin)
-	defer lh.V(4).Info(logging.FlowEnd)
+	lh_ := ov.lh.WithName(logging.FlowCacheSync).WithValues(logging.KeyLogID, logging.TimeLogID())
+	lh_.V(4).Info(logging.FlowBegin)
+	defer lh_.V(4).Info(logging.FlowEnd)
 
-	nodeNames := ov.NodesMaybeOverReserved(lh)
+	nodeNames := ov.NodesMaybeOverReserved(lh_)
 	// avoid as much as we can unnecessary work and logs.
 	if len(nodeNames) == 0 {
-		lh.V(6).Info("no dirty nodes detected")
+		lh_.V(5).Info("no dirty nodes detected")
		return
	}
 
 	// node -> pod identifier (namespace, name)
-	nodeToObjsMap, err := makeNodeToPodDataMap(lh, ov.podLister, ov.isPodRelevant)
+	nodeToObjsMap, err := makeNodeToPodDataMap(lh_, ov.podLister, ov.isPodRelevant)
 	if err != nil {
-		lh.Error(err, "cannot find the mapping between running pods and nodes")
+		lh_.Error(err, "cannot find the mapping between running pods and nodes")
 		return
 	}
 
-	lh.V(6).Info("resync NodeTopology cache starting")
-	defer lh.V(6).Info("resync NodeTopology cache complete")
-
 	var nrtUpdates []*topologyv1alpha2.NodeResourceTopology
 	for _, nodeName := range nodeNames {
-		lh = lh.WithValues("node", nodeName)
+		lh := lh_.WithValues(logging.KeyNode, nodeName)
 
 		nrtCandidate := &topologyv1alpha2.NodeResourceTopology{}
 		if err := ov.client.Get(context.Background(), types.NamespacedName{Name: nodeName}, nrtCandidate); err != nil {
-			lh.V(3).Info("failed to get NodeTopology", "error", err)
+			lh.V(2).Info("failed to get NodeTopology", "error", err)
 			continue
 		}
 		if nrtCandidate == nil {
-			lh.V(3).Info("missing NodeTopology")
+			lh.V(2).Info("missing NodeTopology")
 			continue
 		}
 
 		objs, ok := nodeToObjsMap[nodeName]
 		if !ok {
 			// this really should never happen
-			lh.V(3).Info("cannot find any pod for node")
+			lh.Info("cannot find any pod for node")
 			continue
 		}
 
 		pfpExpected, onlyExclRes := podFingerprintForNodeTopology(nrtCandidate, ov.resyncMethod)
 		if pfpExpected == "" {
-			lh.V(3).Info("missing NodeTopology podset fingerprint data")
+			lh.V(2).Info("missing NodeTopology podset fingerprint data")
 			continue
 		}
 
-		lh.V(6).Info("trying to sync NodeTopology", "fingerprint", pfpExpected, "onlyExclusiveResources", onlyExclRes)
+		lh.V(4).Info("trying to sync NodeTopology", "fingerprint", pfpExpected, "onlyExclusiveResources", onlyExclRes)
 
 		err = checkPodFingerprintForNode(lh, objs, nodeName, pfpExpected, onlyExclRes)
 		if errors.Is(err, podfingerprint.ErrSignatureMismatch) {
 			// can happen, not critical
-			lh.V(5).Info("NodeTopology podset fingerprint mismatch")
+			lh.V(4).Info("NodeTopology podset fingerprint mismatch")
 			continue
 		}
 		if err != nil {
 			// should never happen, let's be vocal
-			lh.V(3).Error(err, "checking NodeTopology podset fingerprint")
+			lh.Error(err, "checking NodeTopology podset fingerprint")
 			continue
 		}
 
 		lh.V(4).Info("overriding cached info")
 		nrtUpdates = append(nrtUpdates, nrtCandidate)
 	}
 
-	ov.FlushNodes(lh, nrtUpdates...)
+	ov.FlushNodes(lh_, nrtUpdates...)
 }
 
 // FlushNodes drops all the cached information about a given node, resetting its state clean.
 func (ov *OverReserve) FlushNodes(lh logr.Logger, nrts ...*topologyv1alpha2.NodeResourceTopology) {
 	ov.lock.Lock()
 	defer ov.lock.Unlock()
 	for _, nrt := range nrts {
-		lh.V(4).Info("flushing", "node", nrt.Name)
+		lh.V(2).Info("flushing", logging.KeyNode, nrt.Name)
 		ov.nrts.Update(nrt)
 		delete(ov.assumedResources, nrt.Name)
 		ov.nodesMaybeOverreserved.Delete(nrt.Name)
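
Two details of the Resync hunk are worth calling out: the flow gets a single named logger with a time-based logID (there is no pod to identify it), and each loop iteration derives a fresh child logger instead of re-assigning the outer one, which previously stacked an extra node key per iteration. A compact sketch of that shape; the constants and the timeLogID helper below are assumptions mirroring the plugin's logging package, not its actual values:

```go
package main

import (
	"flag"
	"time"

	"k8s.io/klog/v2"
)

// Assumed stand-ins for logging.FlowCacheSync, logging.KeyLogID, logging.KeyNode,
// logging.FlowBegin and logging.FlowEnd.
const (
	flowCacheSync = "cachesync"
	keyLogID      = "logID"
	keyNode       = "node"
	flowBegin     = "start"
	flowEnd       = "done"
)

// timeLogID is a hypothetical equivalent of logging.TimeLogID: a unique-enough
// marker for a flow that is not tied to any pod.
func timeLogID() string {
	return "uts/" + time.Now().Format("20060102-150405.000000")
}

func resync(base klog.Logger, dirtyNodes []string) {
	// one logger per resync flow: its name and logID repeat on every line of the flow
	lh_ := base.WithName(flowCacheSync).WithValues(keyLogID, timeLogID())
	lh_.V(4).Info(flowBegin)
	defer lh_.V(4).Info(flowEnd)

	for _, nodeName := range dirtyNodes {
		// fresh child logger per node; re-assigning lh_ itself would accumulate
		// one node key per iteration, which is what the old code did
		lh := lh_.WithValues(keyNode, nodeName)
		lh.V(4).Info("overriding cached info")
	}
}

func main() {
	klog.InitFlags(nil)
	_ = flag.Set("v", "4")
	flag.Parse()

	resync(klog.Background().WithName("overreserve"), []string{"node-a", "node-b"})
}
```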

pkg/noderesourcetopology/cache/store.go
Lines changed: 6 additions & 4 deletions

@@ -28,6 +28,7 @@ import (
 	"github.com/k8stopologyawareschedwg/podfingerprint"
 
 	apiconfig "sigs.k8s.io/scheduler-plugins/apis/config"
+	"sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/logging"
 	"sigs.k8s.io/scheduler-plugins/pkg/noderesourcetopology/stringify"
 	"sigs.k8s.io/scheduler-plugins/pkg/util"
 )
@@ -91,7 +92,7 @@ func newResourceStore(lh logr.Logger) *resourceStore {
 func (rs *resourceStore) String() string {
 	var sb strings.Builder
 	for podKey, podRes := range rs.data {
-		sb.WriteString(" " + podKey + ": " + stringify.ResourceList(podRes) + "\n")
+		sb.WriteString(podKey + "::[" + stringify.ResourceList(podRes) + "];")
 	}
 	return sb.String()
 }
@@ -125,7 +126,7 @@ func (rs *resourceStore) DeletePod(pod *corev1.Pod) bool {
 
 // UpdateNRT updates the provided Node Resource Topology object with the resources tracked in this store,
 // performing pessimistic overallocation across all the NUMA zones.
-func (rs *resourceStore) UpdateNRT(logID string, nrt *topologyv1alpha2.NodeResourceTopology) {
+func (rs *resourceStore) UpdateNRT(nrt *topologyv1alpha2.NodeResourceTopology, logKeysAndValues ...any) {
 	for key, res := range rs.data {
 		// We cannot predict on which Zone the workload will be placed.
 		// And we should totally not guess. So the only safe (and conservative)
@@ -146,7 +147,8 @@ func (rs *resourceStore) UpdateNRT(nrt *topologyv1alpha2.NodeResou
 			if zr.Available.Cmp(qty) < 0 {
 				// this should happen rarely, and it is likely caused by
 				// a bug elsewhere.
-				rs.lh.V(3).Info("cannot decrement resource", "logID", logID, "zone", zr.Name, "node", nrt.Name, "available", zr.Available, "requestor", key, "quantity", qty.String())
+				logKeysAndValues = append(logKeysAndValues, "zone", zr.Name, logging.KeyNode, nrt.Name, "available", zr.Available, "requestor", key, "quantity", qty.String())
+				rs.lh.V(3).Info("cannot decrement resource", logKeysAndValues...)
 				zr.Available = resource.Quantity{}
 				continue
 			}
@@ -239,7 +241,7 @@ func checkPodFingerprintForNode(lh logr.Logger, objs []podData, nodeName, pfpExp
 	}
 	pfpComputed := pfp.Sign()
 
-	lh.V(5).Info("podset fingerprint check", "expected", pfpExpected, "computed", pfpComputed, "onlyExclusiveResources", onlyExclRes)
+	lh.V(4).Info("podset fingerprint check", "expected", pfpExpected, "computed", pfpComputed, "onlyExclusiveResources", onlyExclRes)
 	lh.V(6).Info("podset fingerprint debug", "status", st.Repr())
 
 	err := pfp.Check(pfpExpected)
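
The UpdateNRT change above is why the callers in overreserve.go now pass logging.KeyPod plus a klog.ObjectRef instead of a pre-built logID string: the store appends its own keys to whatever context the caller handed in and emits everything on one line. A trimmed sketch of that plumbing; store and updateNRT are hypothetical names, not the plugin's types:

```go
package main

import "k8s.io/klog/v2"

// store is a cut-down stand-in for resourceStore; only the logging plumbing is kept.
type store struct {
	lh klog.Logger
}

// updateNRT mirrors the new variadic signature: callers pass their own key/value
// context, and the store appends node/zone details before logging.
func (s *store) updateNRT(nodeName string, logKeysAndValues ...any) {
	logKeysAndValues = append(logKeysAndValues, "node", nodeName, "zone", "node-0")
	s.lh.V(3).Info("cannot decrement resource", logKeysAndValues...)
}

func main() {
	s := &store{lh: klog.Background()}
	// caller side, same shape as nodeAssumedResources.UpdateNRT(nrt, logging.KeyPod, logID)
	s.updateNRT("worker-1", "pod", "demo/web-0")
}
```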

pkg/noderesourcetopology/cache/store_test.go
Lines changed: 1 addition & 1 deletion

@@ -516,7 +516,7 @@ func TestResourceStoreUpdate(t *testing.T) {
 	}
 
 	logID := "testResourceStoreUpdate"
-	rs.UpdateNRT(logID, nrt)
+	rs.UpdateNRT(nrt, "logID", logID)
 
 	cpuInfo0 := findResourceInfo(nrt.Zones[0].Resources, cpu)
 	if cpuInfo0.Capacity.Cmp(resource.MustParse("20")) != 0 {

pkg/noderesourcetopology/filter.go
Lines changed: 9 additions & 5 deletions

@@ -21,6 +21,7 @@ import (
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/klog/v2"
 	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
 	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
@@ -80,6 +81,7 @@ func singleNUMAContainerLevelHandler(lh logr.Logger, pod *v1.Pod, zones topology
 		// this is necessary, so we won't allocate the same resources for the upcoming containers
 		subtractFromNUMA(lh, nodes, numaID, container)
 	}
+	lh.V(2).Info("can align all containers")
 	return nil
 }
 
@@ -105,7 +107,7 @@ func resourcesAvailableInAnyNUMANodes(lh logr.Logger, numaNodes NUMANodeList, re
 			// some resources may not expose NUMA affinity (device plugins, extended resources), but all resources
 			// must be reported at node level; thus, if they are not present at node level, we can safely assume
 			// we don't have the resource at all.
-			lh.V(5).Info("early verdict: cannot meet request", "resource", resource, "suitable", "false")
+			lh.V(2).Info("early verdict: cannot meet request", "resource", resource, "suitable", "false")
 			return numaID, false
 		}
 
@@ -137,7 +139,7 @@ func resourcesAvailableInAnyNUMANodes(lh logr.Logger, numaNodes NUMANodeList, re
 
 		bitmask.And(resourceBitmask)
 		if bitmask.IsEmpty() {
-			lh.V(5).Info("early verdict", "resource", resource, "suitable", "false")
+			lh.V(2).Info("early verdict", "resource", resource, "suitable", "false")
 			return numaID, false
 		}
 	}
@@ -149,7 +151,7 @@ func resourcesAvailableInAnyNUMANodes(lh logr.Logger, numaNodes NUMANodeList, re
 
 	// at least one NUMA node is available
 	ret := !bitmask.IsEmpty()
-	lh.V(5).Info("final verdict", "suitable", ret)
+	lh.V(2).Info("final verdict", "suitable", ret)
 	return numaID, ret
 }
 
@@ -187,6 +189,7 @@ func singleNUMAPodLevelHandler(lh logr.Logger, pod *v1.Pod, zones topologyv1alph
 		lh.V(2).Info("cannot align pod", "name", pod.Name)
 		return framework.NewStatus(framework.Unschedulable, "cannot align pod")
 	}
+	lh.V(2).Info("can align pod")
 	return nil
 }
 
@@ -201,7 +204,8 @@ func (tm *TopologyMatch) Filter(ctx context.Context, cycleState *framework.Cycle
 
 	nodeName := nodeInfo.Node().Name
 
-	lh := logging.Log().WithValues(logging.KeyLogID, logging.PodLogID(pod), logging.KeyPodUID, pod.GetUID(), logging.KeyNode, nodeName, logging.KeyFlow, logging.FlowFilter)
+	lh := klog.FromContext(ctx).WithValues(logging.KeyPod, klog.KObj(pod), logging.KeyPodUID, logging.PodUID(pod), logging.KeyNode, nodeName)
+
 	lh.V(4).Info(logging.FlowBegin)
 	defer lh.V(4).Info(logging.FlowEnd)
 
@@ -214,7 +218,7 @@ func (tm *TopologyMatch) Filter(ctx context.Context, cycleState *framework.Cycle
 		return nil
 	}
 
-	lh.V(5).Info("found nrt data", "object", stringify.NodeResourceTopologyResources(nodeTopology))
+	lh.V(4).Info("found nrt data", "object", stringify.NodeResourceTopologyResources(nodeTopology))
 
 	handler := filterHandlerFromTopologyManagerConfig(topologyManagerConfigFromNodeResourceTopology(lh, nodeTopology))
 	if handler == nil {
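
The last hunk is the core of the contextual-logging switch: instead of a package-level logging.Log(), Filter pulls the logger that the scheduler placed into ctx and only decorates it with pod and node values. A minimal sketch of that retrieval; the caller-side seeding via klog.NewContext stands in for what the framework does, and the key names are illustrative:

```go
package main

import (
	"context"
	"flag"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

// filter is a trimmed stand-in for TopologyMatch.Filter: it uses the contextual
// logger carried in ctx rather than constructing one from a global.
func filter(ctx context.Context, pod *corev1.Pod, nodeName string) {
	// klog.FromContext falls back to the global logger when ctx carries none
	lh := klog.FromContext(ctx).WithValues("pod", klog.KObj(pod), "node", nodeName)
	lh.V(4).Info("start")
	defer lh.V(4).Info("done")
}

func main() {
	klog.InitFlags(nil)
	_ = flag.Set("v", "4")
	flag.Parse()

	pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "demo", Name: "web-0"}}
	// the scheduler framework is what normally seeds the context with a logger
	ctx := klog.NewContext(context.Background(), klog.Background().WithName("topologymatch"))
	filter(ctx, pod, "node-0")
}
```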
