Skip to content

Commit fd35f65

Browse files
committed
fix state mem constructor and adjust restoreState
1 parent efdd6be commit fd35f65

File tree

7 files changed

+43
-73
lines changed

7 files changed

+43
-73
lines changed

pkg/kubelet/kubelet.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2922,7 +2922,7 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
29222922
if fit {
29232923
// Update pod resource allocation checkpoint
29242924
if err := kl.statusManager.SetPodAllocation(pod); err != nil {
2925-
klog.ErrorS(err, "SetPodAllocation failed", "pod", klog.KObj(pod))
2925+
return nil, err
29262926
}
29272927
for i, container := range pod.Spec.Containers {
29282928
if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.Containers[i].Resources) {

pkg/kubelet/status/fake_status_manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,6 @@ func (m *fakeManager) SetPodResizeStatus(podUID types.UID, resizeStatus v1.PodRe
9393
// NewFakeManager creates empty/fake memory manager
9494
func NewFakeManager() Manager {
9595
return &fakeManager{
96-
state: state.NewStateMemory(state.PodResourceAllocation{}, state.PodResizeStatus{}),
96+
state: state.NewStateMemory(state.PodResourceAllocation{}),
9797
}
9898
}

pkg/kubelet/status/state/state.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ type Reader interface {
4747

4848
type writer interface {
4949
SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceRequirements) error
50-
SetPodResourceAllocation(podUID string, alloc map[string]v1.ResourceRequirements) error
5150
SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus)
5251
Delete(podUID string, containerName string) error
5352
ClearState() error

pkg/kubelet/status/state/state_checkpoint.go

Lines changed: 19 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -42,50 +42,43 @@ func NewStateCheckpoint(stateDir, checkpointName string) (State, error) {
4242
if err != nil {
4343
return nil, fmt.Errorf("failed to initialize checkpoint manager for pod allocation tracking: %v", err)
4444
}
45+
46+
praInfo, err := restoreState(checkpointManager, checkpointName)
47+
if err != nil {
48+
//lint:ignore ST1005 user-facing error message
49+
return nil, fmt.Errorf("could not restore state from checkpoint: %w, please drain this node and delete pod allocation checkpoint file %q before restarting Kubelet",
50+
err, path.Join(stateDir, checkpointName))
51+
}
52+
4553
stateCheckpoint := &stateCheckpoint{
46-
cache: NewStateMemory(PodResourceAllocation{}, PodResizeStatus{}),
54+
cache: NewStateMemory(praInfo.AllocationEntries),
4755
checkpointManager: checkpointManager,
4856
checkpointName: checkpointName,
4957
}
5058

51-
if err := stateCheckpoint.restoreState(); err != nil {
52-
//lint:ignore ST1005 user-facing error message
53-
return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete pod allocation checkpoint file %q before restarting Kubelet", err, path.Join(stateDir, checkpointName))
54-
}
59+
klog.V(2).InfoS("State checkpoint: restored pod resource allocation state from checkpoint")
5560
return stateCheckpoint, nil
5661
}
5762

5863
// restores state from a checkpoint and creates it if it doesn't exist
59-
func (sc *stateCheckpoint) restoreState() error {
60-
sc.mux.Lock()
61-
defer sc.mux.Unlock()
64+
func restoreState(checkpointManager checkpointmanager.CheckpointManager, checkpointName string) (*PodResourceAllocationInfo, error) {
6265
var err error
66+
checkpoint := &Checkpoint{}
6367

64-
checkpoint, err := NewCheckpoint(nil)
65-
if err != nil {
66-
return fmt.Errorf("failed to create new checkpoint: %w", err)
67-
}
68-
69-
if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil {
68+
if err = checkpointManager.GetCheckpoint(checkpointName, checkpoint); err != nil {
7069
if err == errors.ErrCheckpointNotFound {
71-
return sc.storeState()
70+
return &PodResourceAllocationInfo{
71+
AllocationEntries: make(map[string]map[string]v1.ResourceRequirements),
72+
}, nil
7273
}
73-
return err
74+
return nil, err
7475
}
7576
praInfo, err := checkpoint.GetPodResourceAllocationInfo()
7677
if err != nil {
77-
return fmt.Errorf("failed to get pod resource allocation info: %w", err)
78-
}
79-
80-
for podUID, alloc := range praInfo.AllocationEntries {
81-
err = sc.cache.SetPodResourceAllocation(podUID, alloc)
82-
if err != nil {
83-
klog.ErrorS(err, "failed to set pod resource allocation")
84-
}
78+
return nil, fmt.Errorf("failed to get pod resource allocation info: %w", err)
8579
}
8680

87-
klog.V(2).InfoS("State checkpoint: restored pod resource allocation state from checkpoint")
88-
return nil
81+
return praInfo, nil
8982
}
9083

9184
// saves state to a checkpoint, caller is responsible for locking
@@ -135,19 +128,6 @@ func (sc *stateCheckpoint) SetContainerResourceAllocation(podUID string, contain
135128
return sc.storeState()
136129
}
137130

138-
// SetPodResourceAllocation sets pod resource allocation
139-
func (sc *stateCheckpoint) SetPodResourceAllocation(podUID string, alloc map[string]v1.ResourceRequirements) error {
140-
sc.mux.Lock()
141-
defer sc.mux.Unlock()
142-
143-
err := sc.cache.SetPodResourceAllocation(podUID, alloc)
144-
if err != nil {
145-
return err
146-
}
147-
148-
return sc.storeState()
149-
}
150-
151131
// SetPodResizeStatus sets the last resize decision for a pod
152132
func (sc *stateCheckpoint) SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) {
153133
sc.mux.Lock()
@@ -194,10 +174,6 @@ func (sc *noopStateCheckpoint) SetContainerResourceAllocation(_ string, _ string
194174
return nil
195175
}
196176

197-
func (sc *noopStateCheckpoint) SetPodResourceAllocation(_ string, _ map[string]v1.ResourceRequirements) error {
198-
return nil
199-
}
200-
201177
func (sc *noopStateCheckpoint) SetPodResizeStatus(_ string, _ v1.PodResizeStatus) {}
202178

203179
func (sc *noopStateCheckpoint) Delete(_ string, _ string) error {

pkg/kubelet/status/state/state_checkpoint_test.go

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ const testCheckpoint = "pod_status_manager_state"
3232

3333
func newTestStateCheckpoint(t *testing.T) *stateCheckpoint {
3434
testingDir := getTestDir(t)
35-
cache := NewStateMemory(PodResourceAllocation{}, PodResizeStatus{})
35+
cache := NewStateMemory(PodResourceAllocation{})
3636
checkpointManager, err := checkpointmanager.NewCheckpointManager(testingDir)
3737
require.NoError(t, err, "failed to create checkpoint manager")
3838
checkpointName := "pod_state_checkpoint"
@@ -110,9 +110,11 @@ func Test_stateCheckpoint_storeState(t *testing.T) {
110110
originalSC, err := NewStateCheckpoint(testDir, testCheckpoint)
111111
require.NoError(t, err)
112112

113-
for podUID, alloc := range tt.args.podResourceAllocation {
114-
err = originalSC.SetPodResourceAllocation(podUID, alloc)
115-
require.NoError(t, err)
113+
for podUID, containerAlloc := range tt.args.podResourceAllocation {
114+
for containerName, alloc := range containerAlloc {
115+
err = originalSC.SetContainerResourceAllocation(podUID, containerName, alloc)
116+
require.NoError(t, err)
117+
}
116118
}
117119

118120
actual := originalSC.GetPodResourceAllocation()
@@ -156,11 +158,15 @@ func Test_stateCheckpoint_formatUpgraded(t *testing.T) {
156158
err = sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
157159
require.NoError(t, err, "failed to create old checkpoint")
158160

159-
err = sc.restoreState()
161+
actualPodResourceAllocationInfo, err := restoreState(sc.checkpointManager, sc.checkpointName)
160162
require.NoError(t, err, "failed to restore state")
161163

162-
actualPodResourceAllocationInfo := &PodResourceAllocationInfo{}
164+
require.Equal(t, expectedPodResourceAllocationInfo, actualPodResourceAllocationInfo, "pod resource allocation info is not equal")
165+
166+
sc.cache = NewStateMemory(actualPodResourceAllocationInfo.AllocationEntries)
167+
168+
actualPodResourceAllocationInfo = &PodResourceAllocationInfo{}
163169
actualPodResourceAllocationInfo.AllocationEntries = sc.cache.GetPodResourceAllocation()
164-
require.NoError(t, err, "failed to get pod resource allocation info")
170+
165171
require.Equal(t, expectedPodResourceAllocationInfo, actualPodResourceAllocationInfo, "pod resource allocation info is not equal")
166172
}

pkg/kubelet/status/state/state_mem.go

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,14 @@ type stateMemory struct {
3232
var _ State = &stateMemory{}
3333

3434
// NewStateMemory creates new State to track resources allocated to pods
35-
func NewStateMemory(alloc PodResourceAllocation, stats PodResizeStatus) State {
35+
func NewStateMemory(alloc PodResourceAllocation) State {
36+
if alloc == nil {
37+
alloc = PodResourceAllocation{}
38+
}
3639
klog.V(2).InfoS("Initialized new in-memory state store for pod resource allocation tracking")
3740
return &stateMemory{
3841
podAllocation: alloc,
39-
podResizeStatus: stats,
42+
podResizeStatus: PodResizeStatus{},
4043
}
4144
}
4245

@@ -74,18 +77,6 @@ func (s *stateMemory) SetContainerResourceAllocation(podUID string, containerNam
7477
return nil
7578
}
7679

77-
func (s *stateMemory) SetPodResourceAllocation(podUID string, alloc map[string]v1.ResourceRequirements) error {
78-
s.Lock()
79-
defer s.Unlock()
80-
81-
for containerName, containerAlloc := range alloc {
82-
s.podAllocation[podUID][containerName] = containerAlloc
83-
}
84-
85-
klog.V(3).InfoS("Updated pod resource allocation", "podUID", podUID, "allocation", alloc)
86-
return nil
87-
}
88-
8980
func (s *stateMemory) SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) {
9081
s.Lock()
9182
defer s.Unlock()

pkg/kubelet/status/status_manager.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -296,15 +296,13 @@ func (m *manager) GetPodResizeStatus(podUID types.UID) v1.PodResizeStatus {
296296
func (m *manager) SetPodAllocation(pod *v1.Pod) error {
297297
m.podStatusesLock.RLock()
298298
defer m.podStatusesLock.RUnlock()
299-
300-
podAlloc := make(map[string]v1.ResourceRequirements)
301-
302299
for _, container := range pod.Spec.Containers {
303300
alloc := *container.Resources.DeepCopy()
304-
podAlloc[container.Name] = alloc
301+
if err := m.state.SetContainerResourceAllocation(string(pod.UID), container.Name, alloc); err != nil {
302+
return err
303+
}
305304
}
306-
307-
return m.state.SetPodResourceAllocation(string(pod.UID), podAlloc)
305+
return nil
308306
}
309307

310308
// SetPodResizeStatus checkpoints the last resizing decision for the pod.

0 commit comments

Comments
 (0)