Skip to content

Commit 30bca32

Browse files
committed
Don't re-write the checkpoint file when the content is unchanged
1 parent c79d3ce commit 30bca32

File tree

2 files changed

+46
-28
lines changed

2 files changed

+46
-28
lines changed

pkg/kubelet/allocation/state/state_checkpoint.go

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"k8s.io/apimachinery/pkg/util/sets"
2727
"k8s.io/klog/v2"
2828
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
29+
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
2930
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
3031
)
3132

@@ -36,6 +37,7 @@ type stateCheckpoint struct {
3637
cache State
3738
checkpointManager checkpointmanager.CheckpointManager
3839
checkpointName string
40+
lastChecksum checksum.Checksum
3941
}
4042

4143
// NewStateCheckpoint creates new State for keeping track of pod resource allocations with checkpoint backend
@@ -45,41 +47,39 @@ func NewStateCheckpoint(stateDir, checkpointName string) (State, error) {
4547
return nil, fmt.Errorf("failed to initialize checkpoint manager for pod allocation tracking: %v", err)
4648
}
4749

48-
praInfo, err := restoreState(checkpointManager, checkpointName)
50+
pra, checksum, err := restoreState(checkpointManager, checkpointName)
4951
if err != nil {
5052
//lint:ignore ST1005 user-facing error message
5153
return nil, fmt.Errorf("could not restore state from checkpoint: %w, please drain this node and delete pod allocation checkpoint file %q before restarting Kubelet",
5254
err, path.Join(stateDir, checkpointName))
5355
}
5456

5557
stateCheckpoint := &stateCheckpoint{
56-
cache: NewStateMemory(praInfo.AllocationEntries),
58+
cache: NewStateMemory(pra),
5759
checkpointManager: checkpointManager,
5860
checkpointName: checkpointName,
61+
lastChecksum: checksum,
5962
}
6063
return stateCheckpoint, nil
6164
}
6265

6366
// restores state from a checkpoint and creates it if it doesn't exist
64-
func restoreState(checkpointManager checkpointmanager.CheckpointManager, checkpointName string) (*PodResourceAllocationInfo, error) {
65-
var err error
67+
func restoreState(checkpointManager checkpointmanager.CheckpointManager, checkpointName string) (PodResourceAllocation, checksum.Checksum, error) {
6668
checkpoint := &Checkpoint{}
67-
68-
if err = checkpointManager.GetCheckpoint(checkpointName, checkpoint); err != nil {
69+
if err := checkpointManager.GetCheckpoint(checkpointName, checkpoint); err != nil {
6970
if err == errors.ErrCheckpointNotFound {
70-
return &PodResourceAllocationInfo{
71-
AllocationEntries: make(map[types.UID]map[string]v1.ResourceRequirements),
72-
}, nil
71+
return nil, 0, nil
7372
}
74-
return nil, err
73+
return nil, 0, err
7574
}
76-
klog.V(2).InfoS("State checkpoint: restored pod resource allocation state from checkpoint")
75+
7776
praInfo, err := checkpoint.GetPodResourceAllocationInfo()
7877
if err != nil {
79-
return nil, fmt.Errorf("failed to get pod resource allocation info: %w", err)
78+
return nil, 0, fmt.Errorf("failed to get pod resource allocation info: %w", err)
8079
}
8180

82-
return praInfo, nil
81+
klog.V(2).InfoS("State checkpoint: restored pod resource allocation state from checkpoint")
82+
return praInfo.AllocationEntries, checkpoint.Checksum, nil
8383
}
8484

8585
// saves state to a checkpoint, caller is responsible for locking
@@ -92,11 +92,16 @@ func (sc *stateCheckpoint) storeState() error {
9292
if err != nil {
9393
return fmt.Errorf("failed to create checkpoint: %w", err)
9494
}
95+
if checkpoint.Checksum == sc.lastChecksum {
96+
// No changes to the checkpoint => no need to re-write it.
97+
return nil
98+
}
9599
err = sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
96100
if err != nil {
97101
klog.ErrorS(err, "Failed to save pod allocation checkpoint")
98102
return err
99103
}
104+
sc.lastChecksum = checkpoint.Checksum
100105
return nil
101106
}
102107

pkg/kubelet/allocation/state/state_checkpoint_test.go

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@ package state
1919
import (
2020
"fmt"
2121
"os"
22+
"path/filepath"
2223
"testing"
2324

2425
"github.com/stretchr/testify/require"
2526

2627
v1 "k8s.io/api/core/v1"
2728
"k8s.io/apimachinery/pkg/api/resource"
28-
"k8s.io/apimachinery/pkg/types"
2929
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
3030
)
3131

@@ -124,6 +124,22 @@ func Test_stateCheckpoint_storeState(t *testing.T) {
124124

125125
actual = newSC.GetPodResourceAllocation()
126126
verifyPodResourceAllocation(t, &tt.args.podResourceAllocation, &actual, "restored pod resource allocation is not equal to original pod resource allocation")
127+
128+
checkpointPath := filepath.Join(testDir, testCheckpoint)
129+
require.FileExists(t, checkpointPath)
130+
require.NoError(t, os.Remove(checkpointPath)) // Remove the checkpoint file to track whether it's re-written.
131+
132+
// Setting the pod allocations to the same values should not re-write the checkpoint.
133+
for podUID, alloc := range tt.args.podResourceAllocation {
134+
require.NoError(t, originalSC.SetPodResourceAllocation(podUID, alloc))
135+
require.NoFileExists(t, checkpointPath, "checkpoint should not be re-written")
136+
}
137+
138+
// Setting a new value should update the checkpoint.
139+
require.NoError(t, originalSC.SetPodResourceAllocation("foo-bar", map[string]v1.ResourceRequirements{
140+
"container1": {Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}},
141+
}))
142+
require.FileExists(t, checkpointPath, "checkpoint should be re-written")
127143
})
128144
}
129145
}
@@ -138,14 +154,12 @@ func Test_stateCheckpoint_formatUpgraded(t *testing.T) {
138154
// prepare old checkpoint, ResizeStatusEntries is unset,
139155
// pretend that the old checkpoint is unaware for the field ResizeStatusEntries
140156
const checkpointContent = `{"data":"{\"allocationEntries\":{\"pod1\":{\"container1\":{\"requests\":{\"cpu\":\"1Ki\",\"memory\":\"1Ki\"}}}}}","checksum":1555601526}`
141-
expectedPodResourceAllocationInfo := &PodResourceAllocationInfo{
142-
AllocationEntries: map[types.UID]map[string]v1.ResourceRequirements{
143-
"pod1": {
144-
"container1": {
145-
Requests: v1.ResourceList{
146-
v1.ResourceCPU: resource.MustParse("1Ki"),
147-
v1.ResourceMemory: resource.MustParse("1Ki"),
148-
},
157+
expectedPodResourceAllocation := PodResourceAllocation{
158+
"pod1": {
159+
"container1": {
160+
Requests: v1.ResourceList{
161+
v1.ResourceCPU: resource.MustParse("1Ki"),
162+
v1.ResourceMemory: resource.MustParse("1Ki"),
149163
},
150164
},
151165
},
@@ -157,15 +171,14 @@ func Test_stateCheckpoint_formatUpgraded(t *testing.T) {
157171
err = sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
158172
require.NoError(t, err, "failed to create old checkpoint")
159173

160-
actualPodResourceAllocationInfo, err := restoreState(sc.checkpointManager, sc.checkpointName)
174+
actualPodResourceAllocation, _, err := restoreState(sc.checkpointManager, sc.checkpointName)
161175
require.NoError(t, err, "failed to restore state")
162176

163-
require.Equal(t, expectedPodResourceAllocationInfo, actualPodResourceAllocationInfo, "pod resource allocation info is not equal")
177+
require.Equal(t, expectedPodResourceAllocation, actualPodResourceAllocation, "pod resource allocation info is not equal")
164178

165-
sc.cache = NewStateMemory(actualPodResourceAllocationInfo.AllocationEntries)
179+
sc.cache = NewStateMemory(actualPodResourceAllocation)
166180

167-
actualPodResourceAllocationInfo = &PodResourceAllocationInfo{}
168-
actualPodResourceAllocationInfo.AllocationEntries = sc.cache.GetPodResourceAllocation()
181+
actualPodResourceAllocation = sc.cache.GetPodResourceAllocation()
169182

170-
require.Equal(t, expectedPodResourceAllocationInfo, actualPodResourceAllocationInfo, "pod resource allocation info is not equal")
183+
require.Equal(t, expectedPodResourceAllocation, actualPodResourceAllocation, "pod resource allocation info is not equal")
171184
}

0 commit comments

Comments
 (0)