Commit 25101d3

Merge pull request kubernetes#128518 from tallclair/pleg-watch-conditions
[FG:InPlacePodVerticalScaling] PLEG watch conditions: rapid polling for expected changes
2 parents: fb03382 + 24443b6

File tree

8 files changed: +455 additions, -78 deletions


pkg/kubelet/container/helpers.go

Lines changed: 3 additions & 0 deletions
@@ -66,6 +66,9 @@ type RuntimeHelper interface {
 
     // UnprepareDynamicResources unprepares resources for a pod.
     UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error
+
+    // SetPodWatchCondition flags a pod to be inspected until the condition is met.
+    SetPodWatchCondition(types.UID, string, func(*PodStatus) bool)
 }
 
 // ShouldContainerBeRestarted checks whether a container needs to be restarted.
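
For orientation, the sketch below mirrors the shape of the new interface method using simplified stand-in types; UID, PodStatus, and mapHelper here are illustrative placeholders, not the kubelet's real kubecontainer types. It shows the keyed registration semantics: one condition function per (pod UID, key), re-evaluated by the poller until it returns true.

package main

import "fmt"

// Simplified stand-ins for the kubelet's types (not the real kubecontainer package).
type UID string

type PodStatus struct {
    Resized bool // toy field standing in for real container status
}

// runtimeHelper mirrors the new RuntimeHelper method: a keyed, per-pod condition
// that keeps being re-evaluated until it returns true.
type runtimeHelper interface {
    SetPodWatchCondition(uid UID, key string, condition func(*PodStatus) bool)
}

// mapHelper is a toy implementation that just records the conditions.
type mapHelper struct {
    conditions map[UID]map[string]func(*PodStatus) bool
}

var _ runtimeHelper = (*mapHelper)(nil)

func (h *mapHelper) SetPodWatchCondition(uid UID, key string, condition func(*PodStatus) bool) {
    if h.conditions[uid] == nil {
        h.conditions[uid] = map[string]func(*PodStatus) bool{}
    }
    // Registering the same key again replaces the previous condition.
    h.conditions[uid][key] = condition
}

func main() {
    h := &mapHelper{conditions: map[UID]map[string]func(*PodStatus) bool{}}
    // Register a condition that is satisfied once the (toy) status reports the resize done.
    h.SetPodWatchCondition("pod-1", "ctr:resize:cpu", func(s *PodStatus) bool { return s.Resized })

    // The poller (the PLEG, in the real code) re-checks the condition on each relist.
    fmt.Println(h.conditions["pod-1"]["ctr:resize:cpu"](&PodStatus{Resized: false})) // false
    fmt.Println(h.conditions["pod-1"]["ctr:resize:cpu"](&PodStatus{Resized: true}))  // true
}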

pkg/kubelet/container/testing/fake_runtime_helper.go

Lines changed: 4 additions & 0 deletions
@@ -114,3 +114,7 @@ func (f *FakeRuntimeHelper) PrepareDynamicResources(ctx context.Context, pod *v1
 func (f *FakeRuntimeHelper) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
     return nil
 }
+
+func (f *FakeRuntimeHelper) SetPodWatchCondition(_ kubetypes.UID, _ string, _ func(*kubecontainer.PodStatus) bool) {
+    // Not implemented.
+}

pkg/kubelet/kubelet.go

Lines changed: 4 additions & 11 deletions
@@ -2028,17 +2028,6 @@ func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType
         return false, nil
     }
 
-    if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && isPodResizeInProgress(pod, podStatus) {
-        // While resize is in progress, periodically request the latest status from the runtime via
-        // the PLEG. This is necessary since ordinarily pod status is only fetched when a container
-        // undergoes a state transition.
-        runningPod := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
-        if err, _ := kl.pleg.UpdateCache(&runningPod, pod.UID); err != nil {
-            klog.ErrorS(err, "Failed to update pod cache", "pod", klog.KObj(pod))
-            return false, err
-        }
-    }
-
     return false, nil
 }
 
@@ -3174,3 +3163,7 @@ func (kl *Kubelet) fastStaticPodsRegistration(ctx context.Context) {
         kl.tryReconcileMirrorPods(staticPod, mirrorPod)
     }
 }
+
+func (kl *Kubelet) SetPodWatchCondition(podUID types.UID, conditionKey string, condition pleg.WatchCondition) {
+    kl.pleg.SetPodWatchCondition(podUID, conditionKey, condition)
+}
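
The removed SyncPod block refreshed the PLEG cache only when a pod happened to sync. With a watch condition registered instead (see kuberuntime_manager.go below), the relist loop itself keeps re-inspecting the pod every period until the condition reports true. A toy simulation of that cadence, using made-up names and values rather than kubelet APIs:

package main

import "fmt"

// Toy stand-in for a runtime-reported status; not a kubelet type.
type podStatus struct{ cpuLimitMilli int64 }

func main() {
    // Condition: the runtime-reported CPU limit matches the desired resize target.
    const desired = int64(750)
    condition := func(s *podStatus) bool { return s.cpuLimitMilli == desired }

    // Pretend the runtime converges over a few relist periods.
    observed := []podStatus{{500}, {500}, {750}}

    for i, s := range observed {
        if condition(&s) {
            fmt.Printf("relist %d: condition met, emit ConditionMet and trigger a pod sync\n", i+1)
            break
        }
        fmt.Printf("relist %d: not yet, keep watching\n", i+1)
    }
}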

pkg/kubelet/kuberuntime/kuberuntime_manager.go

Lines changed: 21 additions & 0 deletions
@@ -57,6 +57,7 @@ import (
     "k8s.io/kubernetes/pkg/kubelet/lifecycle"
     "k8s.io/kubernetes/pkg/kubelet/logs"
     "k8s.io/kubernetes/pkg/kubelet/metrics"
+    "k8s.io/kubernetes/pkg/kubelet/pleg"
     proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
     "k8s.io/kubernetes/pkg/kubelet/runtimeclass"
     "k8s.io/kubernetes/pkg/kubelet/sysctl"
@@ -804,6 +805,26 @@ func (m *kubeGenericRuntimeManager) updatePodContainerResources(pod *v1.Pod, res
             "pod", format.Pod(pod), "resourceName", resourceName)
         return err
     }
+    resizeKey := fmt.Sprintf("%s:resize:%s", container.Name, resourceName)
+
+    // Watch (poll) the container for the expected resources update. Stop watching once the resources
+    // match the desired values.
+    resizeCondition := pleg.RunningContainerWatchCondition(container.Name, func(status *kubecontainer.Status) bool {
+        if status.Resources == nil {
+            return false
+        }
+        switch resourceName {
+        case v1.ResourceMemory:
+            return status.Resources.MemoryLimit.Equal(*container.Resources.Limits.Memory())
+        case v1.ResourceCPU:
+            return status.Resources.CPURequest.Equal(*container.Resources.Requests.Cpu()) &&
+                status.Resources.CPULimit.Equal(*container.Resources.Limits.Cpu())
+        default:
+            return true // Shouldn't happen.
+        }
+    })
+    m.runtimeHelper.SetPodWatchCondition(pod.UID, resizeKey, resizeCondition)
+
     // If UpdateContainerResources is error-free, it means desired values for 'resourceName' was accepted by runtime.
     // So we update currentContainerResources for 'resourceName', which is our view of most recently configured resources.
     // Note: We can't rely on GetPodStatus as runtime may lag in actuating the resource values it just accepted.
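
pleg.RunningContainerWatchCondition is called here but its implementation is not part of this diff. Assuming it simply lifts a per-container predicate to a pod-level condition, one plausible shape looks like the standalone sketch below; the types and the behavior when the container is not running are assumptions for illustration only, not the real helper.

package main

import "fmt"

// Simplified stand-ins for container and pod status; not the kubecontainer types.
type containerStatus struct {
    Name          string
    Running       bool
    CPULimitMilli int64
}

type podStatus struct{ Containers []containerStatus }

type watchCondition func(*podStatus) bool

// runningContainerWatchCondition adapts a predicate on a single named container into a
// condition over the whole pod status. In this sketch, a container that is not running
// (or not found) counts as "not yet met", so the watcher would keep polling.
func runningContainerWatchCondition(name string, pred func(*containerStatus) bool) watchCondition {
    return func(ps *podStatus) bool {
        for i := range ps.Containers {
            c := &ps.Containers[i]
            if c.Name == name && c.Running {
                return pred(c)
            }
        }
        return false
    }
}

func main() {
    cond := runningContainerWatchCondition("web", func(c *containerStatus) bool {
        return c.CPULimitMilli == 750 // desired resize target
    })
    fmt.Println(cond(&podStatus{Containers: []containerStatus{{Name: "web", Running: true, CPULimitMilli: 500}}})) // false
    fmt.Println(cond(&podStatus{Containers: []containerStatus{{Name: "web", Running: true, CPULimitMilli: 750}}})) // true
}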

pkg/kubelet/pleg/evented.go

Lines changed: 2 additions & 2 deletions
@@ -427,6 +427,6 @@ func (e *EventedPLEG) updateLatencyMetric(event *runtimeapi.ContainerEventRespon
     metrics.EventedPLEGConnLatency.Observe(duration.Seconds())
 }
 
-func (e *EventedPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) (error, bool) {
-    return fmt.Errorf("not implemented"), false
+func (e *EventedPLEG) SetPodWatchCondition(podUID types.UID, conditionKey string, condition WatchCondition) {
+    e.genericPleg.SetPodWatchCondition(podUID, conditionKey, condition)
 }

pkg/kubelet/pleg/generic.go

Lines changed: 130 additions & 54 deletions
@@ -80,6 +80,16 @@ type GenericPLEG struct {
     podCacheMutex sync.Mutex
     // logger is used for contextual logging
     logger klog.Logger
+    // watchConditions tracks pod watch conditions, guarded by watchConditionsLock
+    // watchConditions is a map of pod UID -> condition key -> condition
+    watchConditions     map[types.UID]map[string]versionedWatchCondition
+    watchConditionsLock sync.Mutex
+}
+
+type versionedWatchCondition struct {
+    key       string
+    condition WatchCondition
+    version   uint32
 }
 
 // plegContainerState has a one-to-one mapping to the
@@ -125,13 +135,14 @@ func NewGenericPLEG(logger klog.Logger, runtime kubecontainer.Runtime, eventChan
         panic("cache cannot be nil")
     }
     return &GenericPLEG{
-        logger:         logger,
-        relistDuration: relistDuration,
-        runtime:        runtime,
-        eventChannel:   eventChannel,
-        podRecords:     make(podRecords),
-        cache:          cache,
-        clock:          clock,
+        logger:          logger,
+        relistDuration:  relistDuration,
+        runtime:         runtime,
+        eventChannel:    eventChannel,
+        podRecords:      make(podRecords),
+        cache:           cache,
+        clock:           clock,
+        watchConditions: make(map[types.UID]map[string]versionedWatchCondition),
     }
 }
 
@@ -252,28 +263,29 @@ func (g *GenericPLEG) Relist() {
     // update running pod and container count
     updateRunningPodAndContainerMetrics(pods)
     g.podRecords.setCurrent(pods)
+    g.cleanupOrphanedWatchConditions()
+
+    needsReinspection := make(map[types.UID]*kubecontainer.Pod)
 
-    // Compare the old and the current pods, and generate events.
-    eventsByPodID := map[types.UID][]*PodLifecycleEvent{}
     for pid := range g.podRecords {
+        // Compare the old and the current pods, and generate events.
         oldPod := g.podRecords.getOld(pid)
         pod := g.podRecords.getCurrent(pid)
         // Get all containers in the old and the new pod.
         allContainers := getContainersFromPods(oldPod, pod)
+        var events []*PodLifecycleEvent
         for _, container := range allContainers {
-            events := computeEvents(g.logger, oldPod, pod, &container.ID)
-            for _, e := range events {
-                updateEvents(eventsByPodID, e)
-            }
+            containerEvents := computeEvents(g.logger, oldPod, pod, &container.ID)
+            events = append(events, containerEvents...)
         }
-    }
 
-    needsReinspection := make(map[types.UID]*kubecontainer.Pod)
+        watchConditions := g.getPodWatchConditions(pid)
+        _, reinspect := g.podsToReinspect[pid]
 
-    // If there are events associated with a pod, we should update the
-    // podCache.
-    for pid, events := range eventsByPodID {
-        pod := g.podRecords.getCurrent(pid)
+        if len(events) == 0 && len(watchConditions) == 0 && !reinspect {
+            // Nothing else needed for this pod.
+            continue
+        }
 
         // updateCache() will inspect the pod and update the cache. If an
         // error occurs during the inspection, we want PLEG to retry again
@@ -284,25 +296,35 @@ func (g *GenericPLEG) Relist() {
         // inspecting the pod and getting the PodStatus to update the cache
         // serially may take a while. We should be aware of this and
        // parallelize if needed.
-        if err, updated := g.updateCache(ctx, pod, pid); err != nil {
+        status, updated, err := g.updateCache(ctx, pod, pid)
+        if err != nil {
            // Rely on updateCache calling GetPodStatus to log the actual error.
            g.logger.V(4).Error(err, "PLEG: Ignoring events for pod", "pod", klog.KRef(pod.Namespace, pod.Name))
 
            // make sure we try to reinspect the pod during the next relisting
            needsReinspection[pid] = pod
 
            continue
-        } else {
-            // this pod was in the list to reinspect and we did so because it had events, so remove it
-            // from the list (we don't want the reinspection code below to inspect it a second time in
-            // this relist execution)
-            delete(g.podsToReinspect, pid)
-            if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
-                if !updated {
-                    continue
-                }
+        } else if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) {
+            if !updated {
+                continue
+            }
+        }
+
+        var completedConditions []versionedWatchCondition
+        for _, condition := range watchConditions {
+            if condition.condition(status) {
+                // condition was met: add it to the list of completed conditions.
+                completedConditions = append(completedConditions, condition)
            }
        }
+        if len(completedConditions) > 0 {
+            g.completeWatchConditions(pid, completedConditions)
+            // If at least 1 condition completed, emit a ConditionMet event to trigger a pod sync.
+            // We only emit 1 event even if multiple conditions are met, since SyncPod reevaluates
+            // all containers in the pod with the latest status.
+            events = append(events, &PodLifecycleEvent{ID: pid, Type: ConditionMet})
+        }
 
        // Update the internal storage and send out the events.
        g.podRecords.update(pid)
@@ -325,8 +347,6 @@ func (g *GenericPLEG) Relist() {
            if events[i].Type == ContainerDied {
                // Fill up containerExitCode map for ContainerDied event when first time appeared
                if len(containerExitCode) == 0 && pod != nil {
-                    // Get updated podStatus
-                    status, err := g.cache.Get(pod.ID)
                    if err == nil {
                        for _, containerStatus := range status.ContainerStatuses {
                            containerExitCode[containerStatus.ID.ID] = containerStatus.ExitCode
@@ -342,18 +362,6 @@
        }
    }
 
-    // reinspect any pods that failed inspection during the previous relist
-    if len(g.podsToReinspect) > 0 {
-        g.logger.V(5).Info("GenericPLEG: Reinspecting pods that previously failed inspection")
-        for pid, pod := range g.podsToReinspect {
-            if err, _ := g.updateCache(ctx, pod, pid); err != nil {
-                // Rely on updateCache calling GetPodStatus to log the actual error.
-                g.logger.V(5).Error(err, "PLEG: pod failed reinspection", "pod", klog.KRef(pod.Namespace, pod.Name))
-                needsReinspection[pid] = pod
-            }
-        }
-    }
-
    // Update the cache timestamp. This needs to happen *after*
    // all pods have been properly updated in the cache.
    g.cache.UpdateTime(timestamp)
@@ -427,13 +435,13 @@ func (g *GenericPLEG) getPodIPs(pid types.UID, status *kubecontainer.PodStatus)
 // updateCache tries to update the pod status in the kubelet cache and returns true if the
 // pod status was actually updated in the cache. It will return false if the pod status
 // was ignored by the cache.
-func (g *GenericPLEG) updateCache(ctx context.Context, pod *kubecontainer.Pod, pid types.UID) (error, bool) {
+func (g *GenericPLEG) updateCache(ctx context.Context, pod *kubecontainer.Pod, pid types.UID) (*kubecontainer.PodStatus, bool, error) {
    if pod == nil {
        // The pod is missing in the current relist. This means that
        // the pod has no visible (active or inactive) containers.
        g.logger.V(4).Info("PLEG: Delete status for pod", "podUID", string(pid))
        g.cache.Delete(pid)
-        return nil, true
+        return nil, true, nil
    }
 
    g.podCacheMutex.Lock()
@@ -477,22 +485,90 @@ func (g *GenericPLEG) updateCache(ctx context.Context, pod *kubecontainer.Pod, p
        timestamp = status.TimeStamp
    }
 
-    return err, g.cache.Set(pod.ID, status, err, timestamp)
+    return status, g.cache.Set(pod.ID, status, err, timestamp), err
 }
 
-func (g *GenericPLEG) UpdateCache(pod *kubecontainer.Pod, pid types.UID) (error, bool) {
-    ctx := context.Background()
-    if pod == nil {
-        return fmt.Errorf("pod cannot be nil"), false
+// SetPodWatchCondition flags the pod for reinspection on every Relist iteration until the watch
+// condition is met. The condition is keyed so it can be updated before the condition
+// is met.
+func (g *GenericPLEG) SetPodWatchCondition(podUID types.UID, conditionKey string, condition WatchCondition) {
+    g.watchConditionsLock.Lock()
+    defer g.watchConditionsLock.Unlock()
+
+    conditions, ok := g.watchConditions[podUID]
+    if !ok {
+        conditions = make(map[string]versionedWatchCondition)
+    }
+
+    versioned, found := conditions[conditionKey]
+    if found {
+        // Watch condition was already set. Increment its version & update the condition function.
+        versioned.version++
+        versioned.condition = condition
+        conditions[conditionKey] = versioned
+    } else {
+        conditions[conditionKey] = versionedWatchCondition{
+            key:       conditionKey,
+            condition: condition,
+        }
+    }
+
+    g.watchConditions[podUID] = conditions
+}
+
+// getPodWatchConditions returns a list of the active watch conditions for the pod.
+func (g *GenericPLEG) getPodWatchConditions(podUID types.UID) []versionedWatchCondition {
+    g.watchConditionsLock.Lock()
+    defer g.watchConditionsLock.Unlock()
+
+    podConditions, ok := g.watchConditions[podUID]
+    if !ok {
+        return nil
    }
-    return g.updateCache(ctx, pod, pid)
+
+    // Flatten the map into a list of conditions. This also serves to create a copy, so the lock can
+    // be released.
+    conditions := make([]versionedWatchCondition, 0, len(podConditions))
+    for _, condition := range podConditions {
+        conditions = append(conditions, condition)
+    }
+    return conditions
 }
 
-func updateEvents(eventsByPodID map[types.UID][]*PodLifecycleEvent, e *PodLifecycleEvent) {
-    if e == nil {
+// completeWatchConditions removes the completed watch conditions, unless they have been updated
+// since the condition was checked.
+func (g *GenericPLEG) completeWatchConditions(podUID types.UID, completedConditions []versionedWatchCondition) {
+    g.watchConditionsLock.Lock()
+    defer g.watchConditionsLock.Unlock()
+
+    conditions, ok := g.watchConditions[podUID]
+    if !ok {
+        // Pod was deleted, nothing to do.
        return
    }
-    eventsByPodID[e.ID] = append(eventsByPodID[e.ID], e)
+
+    for _, completed := range completedConditions {
+        condition := conditions[completed.key]
+        // Only clear the condition if it has not been updated.
+        if condition.version == completed.version {
+            delete(conditions, completed.key)
+        }
+    }
+    g.watchConditions[podUID] = conditions
+}
+
+// cleanupOrphanedWatchConditions purges the watchConditions map of any pods that were removed from
+// the pod records. Events are not emitted for removed pods.
+func (g *GenericPLEG) cleanupOrphanedWatchConditions() {
+    g.watchConditionsLock.Lock()
+    defer g.watchConditionsLock.Unlock()
+
+    for podUID := range g.watchConditions {
+        if g.podRecords.getCurrent(podUID) == nil {
+            // Pod was deleted, remove it from the watch conditions.
+            delete(g.watchConditions, podUID)
+        }
+    }
 }
 
 func getContainerState(pod *kubecontainer.Pod, cid *kubecontainer.ContainerID) plegContainerState {
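
A self-contained sketch of the versioned bookkeeping above, with local stand-in types rather than the real GenericPLEG. It shows why completeWatchConditions compares versions: a condition that was re-set (version bumped) after a relist evaluated the older version must survive the completion pass so the new target keeps being watched.

package main

import "fmt"

// Stand-in types for illustration only; not the kubelet's pleg package.
type podStatus struct{ ready bool }

type watchCondition func(*podStatus) bool

type versionedCondition struct {
    key       string
    condition watchCondition
    version   uint32
}

type watcher struct {
    conditions map[string]versionedCondition // key -> condition (single pod, for brevity)
}

// set registers or updates a keyed condition, bumping the version on update.
func (w *watcher) set(key string, c watchCondition) {
    v, ok := w.conditions[key]
    if ok {
        v.version++
        v.condition = c
        w.conditions[key] = v
        return
    }
    w.conditions[key] = versionedCondition{key: key, condition: c}
}

// complete removes conditions that were met, unless they were updated since they were checked.
func (w *watcher) complete(completed []versionedCondition) {
    for _, done := range completed {
        if cur, ok := w.conditions[done.key]; ok && cur.version == done.version {
            delete(w.conditions, done.key)
        }
    }
}

func main() {
    w := &watcher{conditions: map[string]versionedCondition{}}
    w.set("web:resize:cpu", func(s *podStatus) bool { return s.ready })

    // A relist evaluates the condition and finds it met...
    checked := w.conditions["web:resize:cpu"]
    met := checked.condition(&podStatus{ready: true})

    // ...but before completion runs, the caller re-sets the key (e.g. a new resize target).
    w.set("web:resize:cpu", func(s *podStatus) bool { return false })

    if met {
        w.complete([]versionedCondition{checked})
    }
    // The updated (version 1) condition is still registered, so the new target keeps being watched.
    fmt.Println(len(w.conditions)) // 1
}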
