Skip to content

Commit b9d0084

Browse files
committed
kubelet: improve checkpoint errors
Recording the expected and actual checksum in the error makes it possible to provide that information, for example in a failed test like the ones for DRA. Otherwise developers have to manually step through the test with a debugger to figure out what the new checksum is.
1 parent 157f4b9 commit b9d0084

File tree

6 files changed

+56
-13
lines changed

6 files changed

+56
-13
lines changed

pkg/kubelet/checkpointmanager/checksum/checksum.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ type Checksum uint64
2828

2929
// Verify verifies that passed checksum is same as calculated checksum
3030
func (cs Checksum) Verify(data interface{}) error {
31-
if cs != New(data) {
32-
return errors.ErrCorruptCheckpoint
31+
actualCS := New(data)
32+
if cs != actualCS {
33+
return &errors.CorruptCheckpointError{ActualCS: uint64(actualCS), ExpectedCS: uint64(cs)}
3334
}
3435
return nil
3536
}

pkg/kubelet/checkpointmanager/errors/errors.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,28 @@ package errors
1818

1919
import "fmt"
2020

21-
// ErrCorruptCheckpoint error is reported when checksum does not match
22-
var ErrCorruptCheckpoint = fmt.Errorf("checkpoint is corrupted")
21+
// ErrCorruptCheckpoint error is reported when checksum does not match.
22+
// Check for it with:
23+
//
24+
// var csErr *CorruptCheckpointError
25+
// if errors.As(err, &csErr) { ... }
26+
// if errors.Is(err, CorruptCheckpointError{}) { ... }
27+
type CorruptCheckpointError struct {
28+
ActualCS, ExpectedCS uint64
29+
}
30+
31+
func (err CorruptCheckpointError) Error() string {
32+
return "checkpoint is corrupted"
33+
}
34+
35+
func (err CorruptCheckpointError) Is(target error) bool {
36+
switch target.(type) {
37+
case *CorruptCheckpointError, CorruptCheckpointError:
38+
return true
39+
default:
40+
return false
41+
}
42+
}
2343

2444
// ErrCheckpointNotFound is reported when checkpoint is not found for a given key
2545
var ErrCheckpointNotFound = fmt.Errorf("checkpoint is not found")

pkg/kubelet/cm/cpumanager/state/checkpoint.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,12 @@ func (cp *CPUManagerCheckpointV1) VerifyChecksum() error {
110110

111111
hash := fnv.New32a()
112112
fmt.Fprintf(hash, "%v", object)
113-
if cp.Checksum != checksum.Checksum(hash.Sum32()) {
114-
return errors.ErrCorruptCheckpoint
113+
actualCS := checksum.Checksum(hash.Sum32())
114+
if cp.Checksum != actualCS {
115+
return &errors.CorruptCheckpointError{
116+
ActualCS: uint64(actualCS),
117+
ExpectedCS: uint64(cp.Checksum),
118+
}
115119
}
116120

117121
return nil

pkg/kubelet/cm/dra/state/checkpoint.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,15 @@ package state
1818

1919
import (
2020
"encoding/json"
21+
"errors"
2122
"fmt"
2223
"hash/fnv"
2324
"strings"
2425

2526
"k8s.io/apimachinery/pkg/util/dump"
2627
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
2728
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
28-
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
29+
cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
2930
)
3031

3132
var _ checkpointmanager.Checkpoint = &DRAManagerCheckpoint{}
@@ -79,7 +80,7 @@ func (dc *DRAManagerCheckpoint) VerifyChecksum() error {
7980
ck := dc.Checksum
8081
dc.Checksum = 0
8182
err := ck.Verify(dc)
82-
if err == errors.ErrCorruptCheckpoint {
83+
if errors.Is(err, cmerrors.CorruptCheckpointError{}) {
8384
// Verify with old structs without ResourceHandles field
8485
// TODO: remove in Beta
8586
err = verifyChecksumWithoutResourceHandles(dc, ck)
@@ -115,8 +116,12 @@ func verifyChecksumWithoutResourceHandles(dc *DRAManagerCheckpoint, checkSum che
115116
object = strings.Replace(object, "ClaimInfoStateListWithoutResourceHandles", "ClaimInfoStateList", 1)
116117
hash := fnv.New32a()
117118
fmt.Fprintf(hash, "%v", object)
118-
if checkSum != checksum.Checksum(hash.Sum32()) {
119-
return errors.ErrCorruptCheckpoint
119+
actualCS := checksum.Checksum(hash.Sum32())
120+
if checkSum != actualCS {
121+
return &cmerrors.CorruptCheckpointError{
122+
ActualCS: uint64(actualCS),
123+
ExpectedCS: uint64(checkSum),
124+
}
120125
}
121126
return nil
122127
}

pkg/kubelet/cm/dra/state/state_checkpoint.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func (sc *stateCheckpoint) GetOrCreate() (ClaimInfoStateList, error) {
126126
return ClaimInfoStateList{}, nil
127127
}
128128
if err != nil {
129-
return nil, fmt.Errorf("failed to get checkpoint %v: %v", sc.checkpointName, err)
129+
return nil, fmt.Errorf("failed to get checkpoint %v: %w", sc.checkpointName, err)
130130
}
131131

132132
return checkpoint.Entries, nil

pkg/kubelet/cm/dra/state/state_checkpoint_test.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,19 @@ limitations under the License.
1717
package state
1818

1919
import (
20+
"errors"
2021
"os"
2122
"path"
2223
"strings"
2324
"testing"
2425

2526
"github.com/stretchr/testify/assert"
27+
"github.com/stretchr/testify/require"
2628

2729
resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
2830
"k8s.io/apimachinery/pkg/util/sets"
2931
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
32+
cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
3033
testutil "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state/testing"
3134
)
3235

@@ -198,7 +201,7 @@ func TestCheckpointGetOrCreate(t *testing.T) {
198201
assert.Error(t, err)
199202
assert.Contains(t, err.Error(), tc.expectedError)
200203
} else {
201-
assert.NoError(t, err, "unexpected error while creating checkpointState")
204+
requireNoCheckpointError(t, err)
202205
// compare state after restoration with the one expected
203206
assertStateEqual(t, state, tc.expectedState)
204207
}
@@ -270,11 +273,21 @@ func TestOldCheckpointRestore(t *testing.T) {
270273

271274
checkpoint := NewDRAManagerCheckpoint()
272275
err = cpm.GetCheckpoint(testingCheckpoint, checkpoint)
273-
assert.NoError(t, err, "could not restore checkpoint")
276+
requireNoCheckpointError(t, err)
274277

275278
checkpointData, err := checkpoint.MarshalCheckpoint()
276279
assert.NoError(t, err, "could not Marshal Checkpoint")
277280

278281
expectedData := `{"version":"v1","entries":[{"DriverName":"test-driver.cdi.k8s.io","ClassName":"class-name","ClaimUID":"067798be-454e-4be4-9047-1aa06aea63f7","ClaimName":"example","Namespace":"default","PodUIDs":{"139cdb46-f989-4f17-9561-ca10cfb509a6":{}},"ResourceHandles":null,"CDIDevices":{"test-driver.cdi.k8s.io":["example.com/example=cdi-example"]}}],"checksum":453625682}`
279282
assert.Equal(t, expectedData, string(checkpointData), "expected ClaimInfoState does not equal to restored one")
280283
}
284+
285+
func requireNoCheckpointError(t *testing.T, err error) {
286+
t.Helper()
287+
var cksumErr *cmerrors.CorruptCheckpointError
288+
if errors.As(err, &cksumErr) {
289+
t.Fatalf("unexpected corrupt checkpoint, expected checksum %d, got %d", cksumErr.ExpectedCS, cksumErr.ActualCS)
290+
} else {
291+
require.NoError(t, err, "could not restore checkpoint")
292+
}
293+
}

0 commit comments

Comments
 (0)