Skip to content

Commit 660bd6b

Browse files
committed
Track actuated resources in the allocation manager
1 parent 2effa5e commit 660bd6b

File tree

2 files changed

+68
-26
lines changed

2 files changed

+68
-26
lines changed

pkg/kubelet/allocation/allocation_manager.go

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717
package allocation
1818

1919
import (
20+
"path/filepath"
21+
2022
v1 "k8s.io/api/core/v1"
2123
apiequality "k8s.io/apimachinery/pkg/api/equality"
2224
"k8s.io/apimachinery/pkg/types"
@@ -29,7 +31,10 @@ import (
2931
)
3032

3133
// allocatedPodsStateFile and actuatedPodsStateFile are the names of the checkpoint files in which the allocation manager stores its allocated and actuated pod state.
32-
const podStatusManagerStateFile = "pod_status_manager_state"
34+
const (
35+
allocatedPodsStateFile = "allocated_pods_state"
36+
actuatedPodsStateFile = "actuated_pods_state"
37+
)
3338

3439
// Manager tracks pod resource allocations.
3540
type Manager interface {
@@ -42,56 +47,70 @@ type Manager interface {
4247
UpdatePodFromAllocation(pod *v1.Pod) (*v1.Pod, bool)
4348

4449
// SetPodAllocation checkpoints the resources allocated to a pod's containers.
45-
SetPodAllocation(pod *v1.Pod) error
50+
SetPodAllocation(allocatedPod *v1.Pod) error
51+
52+
// SetActuatedResources records the actuated resources of the given container (or the entire
53+
// pod, if actuatedContainer is nil).
54+
SetActuatedResources(allocatedPod *v1.Pod, actuatedContainer *v1.Container) error
4655

47-
// DeletePodAllocation removes any stored state for the given pod UID.
48-
DeletePodAllocation(uid types.UID)
56+
// GetActuatedResources returns the stored actuated resources for the container, and whether they exist.
57+
GetActuatedResources(podUID types.UID, containerName string) (v1.ResourceRequirements, bool)
58+
59+
// DeletePod removes any stored state for the given pod UID.
60+
DeletePod(uid types.UID)
4961

5062
// RemoveOrphanedPods removes the stored state for any pods not included in the set of remaining pods.
5163
RemoveOrphanedPods(remainingPods sets.Set[types.UID])
5264
}
5365

5466
type manager struct {
55-
state state.State
67+
allocated state.State
68+
actuated state.State
5669
}
5770

5871
func NewManager(checkpointDirectory string) Manager {
59-
m := &manager{}
60-
61-
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
62-
stateImpl, err := state.NewStateCheckpoint(checkpointDirectory, podStatusManagerStateFile)
63-
if err != nil {
64-
// This is a crictical, non-recoverable failure.
65-
klog.ErrorS(err, "Failed to initialize allocation checkpoint manager")
66-
panic(err)
67-
}
68-
m.state = stateImpl
69-
} else {
70-
m.state = state.NewNoopStateCheckpoint()
72+
return &manager{
73+
allocated: newStateImpl(checkpointDirectory, allocatedPodsStateFile),
74+
actuated: newStateImpl(checkpointDirectory, actuatedPodsStateFile),
75+
}
76+
}
77+
78+
func newStateImpl(checkpointDirectory, checkpointName string) state.State {
79+
if !utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
80+
return state.NewNoopStateCheckpoint()
7181
}
7282

73-
return m
83+
stateImpl, err := state.NewStateCheckpoint(checkpointDirectory, checkpointName)
84+
if err != nil {
85+
// This is a critical, non-recoverable failure.
86+
klog.ErrorS(err, "Failed to initialize allocation checkpoint manager",
87+
"checkpointPath", filepath.Join(checkpointDirectory, checkpointName))
88+
panic(err)
89+
}
90+
91+
return stateImpl
7492
}
7593

7694
// NewInMemoryManager returns an allocation manager that doesn't persist state.
7795
// For testing purposes only!
7896
func NewInMemoryManager() Manager {
7997
return &manager{
80-
state: state.NewStateMemory(nil),
98+
allocated: state.NewStateMemory(nil),
99+
actuated: state.NewStateMemory(nil),
81100
}
82101
}
83102

84103
// GetContainerResourceAllocation returns the last checkpointed AllocatedResources values
85104
// If checkpoint manager has not been initialized, it returns an empty v1.ResourceRequirements and false.
86105
func (m *manager) GetContainerResourceAllocation(podUID types.UID, containerName string) (v1.ResourceRequirements, bool) {
87-
return m.state.GetContainerResourceAllocation(podUID, containerName)
106+
return m.allocated.GetContainerResourceAllocation(podUID, containerName)
88107
}
89108

90109
// UpdatePodFromAllocation overwrites the pod spec with the allocation.
91110
// This function does a deep copy only if updates are needed.
92111
func (m *manager) UpdatePodFromAllocation(pod *v1.Pod) (*v1.Pod, bool) {
93112
// TODO(tallclair): This clones the whole cache, but we only need 1 pod.
94-
allocs := m.state.GetPodResourceAllocation()
113+
allocs := m.allocated.GetPodResourceAllocation()
95114
return updatePodFromAllocation(pod, allocs)
96115
}
97116

@@ -134,6 +153,10 @@ func updatePodFromAllocation(pod *v1.Pod, allocs state.PodResourceAllocation) (*
134153

135154
// SetPodAllocation checkpoints the resources allocated to a pod's containers
136155
func (m *manager) SetPodAllocation(pod *v1.Pod) error {
156+
return m.allocated.SetPodResourceAllocation(pod.UID, allocationFromPod(pod))
157+
}
158+
159+
func allocationFromPod(pod *v1.Pod) map[string]v1.ResourceRequirements {
137160
podAlloc := make(map[string]v1.ResourceRequirements)
138161
for _, container := range pod.Spec.Containers {
139162
alloc := *container.Resources.DeepCopy()
@@ -149,16 +172,35 @@ func (m *manager) SetPodAllocation(pod *v1.Pod) error {
149172
}
150173
}
151174

152-
return m.state.SetPodResourceAllocation(pod.UID, podAlloc)
175+
return podAlloc
153176
}
154177

155-
func (m *manager) DeletePodAllocation(uid types.UID) {
156-
if err := m.state.Delete(uid, ""); err != nil {
178+
func (m *manager) DeletePod(uid types.UID) {
179+
if err := m.allocated.Delete(uid, ""); err != nil {
180+
// If the deletion fails, it will be retried by RemoveOrphanedPods, so we can safely ignore the error.
181+
klog.V(3).ErrorS(err, "Failed to delete pod allocation", "podUID", uid)
182+
}
183+
184+
if err := m.actuated.Delete(uid, ""); err != nil {
157185
// If the deletion fails, it will be retried by RemoveOrphanedPods, so we can safely ignore the error.
158186
klog.V(3).ErrorS(err, "Failed to delete pod allocation", "podUID", uid)
159187
}
160188
}
161189

162190
func (m *manager) RemoveOrphanedPods(remainingPods sets.Set[types.UID]) {
163-
m.state.RemoveOrphanedPods(remainingPods)
191+
m.allocated.RemoveOrphanedPods(remainingPods)
192+
m.actuated.RemoveOrphanedPods(remainingPods)
193+
}
194+
195+
func (m *manager) SetActuatedResources(allocatedPod *v1.Pod, actuatedContainer *v1.Container) error {
196+
if actuatedContainer == nil {
197+
alloc := allocationFromPod(allocatedPod)
198+
return m.actuated.SetPodResourceAllocation(allocatedPod.UID, alloc)
199+
}
200+
201+
return m.actuated.SetContainerResourceAllocation(allocatedPod.UID, actuatedContainer.Name, actuatedContainer.Resources)
202+
}
203+
204+
func (m *manager) GetActuatedResources(podUID types.UID, containerName string) (v1.ResourceRequirements, bool) {
205+
return m.actuated.GetContainerResourceAllocation(podUID, containerName)
164206
}

pkg/kubelet/kubelet.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2722,7 +2722,7 @@ func (kl *Kubelet) HandlePodRemoves(pods []*v1.Pod) {
27222722
start := kl.clock.Now()
27232723
for _, pod := range pods {
27242724
kl.podManager.RemovePod(pod)
2725-
kl.allocationManager.DeletePodAllocation(pod.UID)
2725+
kl.allocationManager.DeletePod(pod.UID)
27262726

27272727
pod, mirrorPod, wasMirror := kl.podManager.GetPodAndMirrorPod(pod)
27282728
if wasMirror {

0 commit comments

Comments
 (0)