Skip to content

Commit 23fa359

Browse files
authored
Merge pull request kubernetes#84705 from whypro/cpumanager-panic-master
Return error instead of panic when cpu manager fails on startup.
2 parents f680c26 + f4bd4e2 commit 23fa359

12 files changed: +191 −137 lines changed

pkg/kubelet/cm/container_manager_linux.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,10 @@ func (cm *containerManagerImpl) Start(node *v1.Node,
579579
if err != nil {
580580
return fmt.Errorf("failed to build map of initial containers from runtime: %v", err)
581581
}
582-
cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap)
582+
err = cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap)
583+
if err != nil {
584+
return fmt.Errorf("start cpu manager error: %v", err)
585+
}
583586
}
584587

585588
// cache the node Info including resource capacity and

pkg/kubelet/cm/cpumanager/cpu_manager.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ const cpuManagerStateFileName = "cpu_manager_state"
5353
// Manager interface provides methods for Kubelet to manage pod cpus.
5454
type Manager interface {
5555
// Start is called during Kubelet initialization.
56-
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap)
56+
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error
5757

5858
// AddContainer is called between container create and container start
5959
// so that initial CPU affinity settings can be written through to the
@@ -155,7 +155,10 @@ func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo
155155
// exclusively allocated.
156156
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
157157
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
158-
policy = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity)
158+
policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity)
159+
if err != nil {
160+
return nil, fmt.Errorf("new static policy error: %v", err)
161+
}
159162

160163
default:
161164
return nil, fmt.Errorf("unknown policy: \"%s\"", cpuPolicyName)
@@ -173,7 +176,7 @@ func NewManager(cpuPolicyName string, reconcilePeriod time.Duration, machineInfo
173176
return manager, nil
174177
}
175178

176-
func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) {
179+
func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
177180
klog.Infof("[cpumanager] starting with %s policy", m.policy.Name())
178181
klog.Infof("[cpumanager] reconciling every %v", m.reconcilePeriod)
179182
m.sourcesReady = sourcesReady
@@ -183,16 +186,22 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
183186

184187
stateImpl, err := state.NewCheckpointState(m.stateFileDirectory, cpuManagerStateFileName, m.policy.Name(), initialContainers)
185188
if err != nil {
186-
klog.Errorf("[cpumanager] could not initialize checkpoint manager: %v\n", err)
187-
panic("[cpumanager] - please drain node and remove policy state file")
189+
klog.Errorf("[cpumanager] could not initialize checkpoint manager: %v, please drain node and remove policy state file", err)
190+
return err
188191
}
189192
m.state = stateImpl
190193

191-
m.policy.Start(m.state)
194+
err = m.policy.Start(m.state)
195+
if err != nil {
196+
klog.Errorf("[cpumanager] policy start error: %v", err)
197+
return err
198+
}
199+
192200
if m.policy.Name() == string(PolicyNone) {
193-
return
201+
return nil
194202
}
195203
go wait.Until(func() { m.reconcileState() }, m.reconcilePeriod, wait.NeverStop)
204+
return nil
196205
}
197206

198207
func (m *manager) AddContainer(p *v1.Pod, c *v1.Container, containerID string) error {

pkg/kubelet/cm/cpumanager/cpu_manager_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,8 @@ func (p *mockPolicy) Name() string {
100100
return "mock"
101101
}
102102

103-
func (p *mockPolicy) Start(s state.State) {
103+
func (p *mockPolicy) Start(s state.State) error {
104+
return p.err
104105
}
105106

106107
func (p *mockPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container) error {
@@ -206,7 +207,7 @@ func makeMultiContainerPod(initCPUs, appCPUs []struct{ request, limit string })
206207
}
207208

208209
func TestCPUManagerAdd(t *testing.T) {
209-
testPolicy := NewStaticPolicy(
210+
testPolicy, _ := NewStaticPolicy(
210211
&topology.CPUTopology{
211212
NumCPUs: 4,
212213
NumSockets: 1,
@@ -464,7 +465,7 @@ func TestCPUManagerAddWithInitContainers(t *testing.T) {
464465
}
465466

466467
for _, testCase := range testCases {
467-
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
468+
policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
468469

469470
state := &mockState{
470471
assignments: testCase.stAssignments,
@@ -941,7 +942,7 @@ func TestReconcileState(t *testing.T) {
941942
// above test cases are without kubelet --reserved-cpus cmd option
942943
// the following tests are with --reserved-cpus configured
943944
func TestCPUManagerAddWithResvList(t *testing.T) {
944-
testPolicy := NewStaticPolicy(
945+
testPolicy, _ := NewStaticPolicy(
945946
&topology.CPUTopology{
946947
NumCPUs: 4,
947948
NumSockets: 1,

pkg/kubelet/cm/cpumanager/fake_cpu_manager.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,9 @@ type fakeManager struct {
3030
state state.State
3131
}
3232

33-
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) {
33+
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
3434
klog.Info("[fake cpumanager] Start()")
35+
return nil
3536
}
3637

3738
func (m *fakeManager) Policy() Policy {

pkg/kubelet/cm/cpumanager/policy.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import (
2525
// Policy implements logic for pod container to CPU assignment.
2626
type Policy interface {
2727
Name() string
28-
Start(s state.State)
28+
Start(s state.State) error
2929
// AddContainer call is idempotent
3030
AddContainer(s state.State, pod *v1.Pod, container *v1.Container) error
3131
// RemoveContainer call is idempotent

pkg/kubelet/cm/cpumanager/policy_none.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,9 @@ func (p *nonePolicy) Name() string {
3939
return string(PolicyNone)
4040
}
4141

42-
func (p *nonePolicy) Start(s state.State) {
42+
func (p *nonePolicy) Start(s state.State) error {
4343
klog.Info("[cpumanager] none policy: Start")
44+
return nil
4445
}
4546

4647
func (p *nonePolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container) error {

pkg/kubelet/cm/cpumanager/policy_static.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ var _ Policy = &staticPolicy{}
8585
// NewStaticPolicy returns a CPU manager policy that does not change CPU
8686
// assignments for exclusively pinned guaranteed containers after the main
8787
// container process starts.
88-
func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store) Policy {
88+
func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store) (Policy, error) {
8989
allCPUs := topology.CPUDetails.CPUs()
9090
var reserved cpuset.CPUSet
9191
if reservedCPUs.Size() > 0 {
@@ -100,7 +100,8 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
100100
}
101101

102102
if reserved.Size() != numReservedCPUs {
103-
panic(fmt.Sprintf("[cpumanager] unable to reserve the required amount of CPUs (size of %s did not equal %d)", reserved, numReservedCPUs))
103+
err := fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of %s did not equal %d)", reserved, numReservedCPUs)
104+
return nil, err
104105
}
105106

106107
klog.Infof("[cpumanager] reserved %d CPUs (\"%s\") not available for exclusive assignment", reserved.Size(), reserved)
@@ -109,18 +110,19 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
109110
topology: topology,
110111
reserved: reserved,
111112
affinity: affinity,
112-
}
113+
}, nil
113114
}
114115

115116
func (p *staticPolicy) Name() string {
116117
return string(PolicyStatic)
117118
}
118119

119-
func (p *staticPolicy) Start(s state.State) {
120+
func (p *staticPolicy) Start(s state.State) error {
120121
if err := p.validateState(s); err != nil {
121-
klog.Errorf("[cpumanager] static policy invalid state: %s\n", err.Error())
122-
panic("[cpumanager] - please drain node and remove policy state file")
122+
klog.Errorf("[cpumanager] static policy invalid state: %v, please drain node and remove policy state file", err)
123+
return err
123124
}
125+
return nil
124126
}
125127

126128
func (p *staticPolicy) validateState(s state.State) error {

pkg/kubelet/cm/cpumanager/policy_static_test.go

Lines changed: 45 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,10 @@ type staticPolicyTest struct {
4141
expErr error
4242
expCPUAlloc bool
4343
expCSet cpuset.CPUSet
44-
expPanic bool
4544
}
4645

4746
func TestStaticPolicyName(t *testing.T) {
48-
policy := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
47+
policy, _ := NewStaticPolicy(topoSingleSocketHT, 1, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
4948

5049
policyName := policy.Name()
5150
if policyName != "static" {
@@ -81,7 +80,7 @@ func TestStaticPolicyStart(t *testing.T) {
8180
numReservedCPUs: 2,
8281
stAssignments: state.ContainerCPUAssignments{},
8382
stDefaultCPUSet: cpuset.NewCPUSet(0, 1),
84-
expPanic: true,
83+
expErr: fmt.Errorf("not all reserved cpus: \"0,6\" are present in defaultCpuSet: \"0-1\""),
8584
},
8685
{
8786
description: "assigned core 2 is still present in available cpuset",
@@ -92,7 +91,7 @@ func TestStaticPolicyStart(t *testing.T) {
9291
},
9392
},
9493
stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
95-
expPanic: true,
94+
expErr: fmt.Errorf("pod: fakePod, container: 0 cpuset: \"0-2\" overlaps with default cpuset \"2-11\""),
9695
},
9796
{
9897
description: "core 12 is not present in topology but is in state cpuset",
@@ -104,7 +103,7 @@ func TestStaticPolicyStart(t *testing.T) {
104103
},
105104
},
106105
stDefaultCPUSet: cpuset.NewCPUSet(5, 6, 7, 8, 9, 10, 11, 12),
107-
expPanic: true,
106+
expErr: fmt.Errorf("current set of available CPUs \"0-11\" doesn't match with CPUs in state \"0-12\""),
108107
},
109108
{
110109
description: "core 11 is present in topology but is not in state cpuset",
@@ -116,26 +115,25 @@ func TestStaticPolicyStart(t *testing.T) {
116115
},
117116
},
118117
stDefaultCPUSet: cpuset.NewCPUSet(5, 6, 7, 8, 9, 10),
119-
expPanic: true,
118+
expErr: fmt.Errorf("current set of available CPUs \"0-11\" doesn't match with CPUs in state \"0-10\""),
120119
},
121120
}
122121
for _, testCase := range testCases {
123122
t.Run(testCase.description, func(t *testing.T) {
124-
defer func() {
125-
if err := recover(); err != nil {
126-
if !testCase.expPanic {
127-
t.Errorf("unexpected panic occurred: %q", err)
128-
}
129-
} else if testCase.expPanic {
130-
t.Error("expected panic doesn't occurred")
131-
}
132-
}()
133-
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager()).(*staticPolicy)
123+
p, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
124+
policy := p.(*staticPolicy)
134125
st := &mockState{
135126
assignments: testCase.stAssignments,
136127
defaultCPUSet: testCase.stDefaultCPUSet,
137128
}
138-
policy.Start(st)
129+
err := policy.Start(st)
130+
if !reflect.DeepEqual(err, testCase.expErr) {
131+
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
132+
testCase.description, testCase.expErr, err)
133+
}
134+
if err != nil {
135+
return
136+
}
139137

140138
if !testCase.stDefaultCPUSet.IsEmpty() {
141139
for cpuid := 1; cpuid < policy.topology.NumCPUs; cpuid++ {
@@ -438,7 +436,7 @@ func TestStaticPolicyAdd(t *testing.T) {
438436
}
439437

440438
for _, testCase := range testCases {
441-
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
439+
policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
442440

443441
st := &mockState{
444442
assignments: testCase.stAssignments,
@@ -539,7 +537,7 @@ func TestStaticPolicyRemove(t *testing.T) {
539537
}
540538

541539
for _, testCase := range testCases {
542-
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
540+
policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
543541

544542
st := &mockState{
545543
assignments: testCase.stAssignments,
@@ -629,12 +627,17 @@ func TestTopologyAwareAllocateCPUs(t *testing.T) {
629627
},
630628
}
631629
for _, tc := range testCases {
632-
policy := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager()).(*staticPolicy)
630+
p, _ := NewStaticPolicy(tc.topo, 0, cpuset.NewCPUSet(), topologymanager.NewFakeManager())
631+
policy := p.(*staticPolicy)
633632
st := &mockState{
634633
assignments: tc.stAssignments,
635634
defaultCPUSet: tc.stDefaultCPUSet,
636635
}
637-
policy.Start(st)
636+
err := policy.Start(st)
637+
if err != nil {
638+
t.Errorf("StaticPolicy Start() error (%v)", err)
639+
continue
640+
}
638641

639642
cset, err := policy.allocateCPUs(st, tc.numRequested, tc.socketMask)
640643
if err != nil {
@@ -661,9 +664,9 @@ type staticPolicyTestWithResvList struct {
661664
stDefaultCPUSet cpuset.CPUSet
662665
pod *v1.Pod
663666
expErr error
667+
expNewErr error
664668
expCPUAlloc bool
665669
expCSet cpuset.CPUSet
666-
expPanic bool
667670
}
668671

669672
func TestStaticPolicyStartWithResvList(t *testing.T) {
@@ -684,7 +687,7 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
684687
reserved: cpuset.NewCPUSet(0, 1),
685688
stAssignments: state.ContainerCPUAssignments{},
686689
stDefaultCPUSet: cpuset.NewCPUSet(2, 3, 4, 5),
687-
expPanic: true,
690+
expErr: fmt.Errorf("not all reserved cpus: \"0-1\" are present in defaultCpuSet: \"2-5\""),
688691
},
689692
{
690693
description: "inconsistency between numReservedCPUs and reserved",
@@ -693,26 +696,32 @@ func TestStaticPolicyStartWithResvList(t *testing.T) {
693696
reserved: cpuset.NewCPUSet(0, 1),
694697
stAssignments: state.ContainerCPUAssignments{},
695698
stDefaultCPUSet: cpuset.NewCPUSet(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
696-
expPanic: true,
699+
expNewErr: fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of 0-1 did not equal 1)"),
697700
},
698701
}
699702
for _, testCase := range testCases {
700703
t.Run(testCase.description, func(t *testing.T) {
701-
defer func() {
702-
if err := recover(); err != nil {
703-
if !testCase.expPanic {
704-
t.Errorf("unexpected panic occurred: %q", err)
705-
}
706-
} else if testCase.expPanic {
707-
t.Error("expected panic doesn't occurred")
708-
}
709-
}()
710-
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager()).(*staticPolicy)
704+
p, err := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager())
705+
if !reflect.DeepEqual(err, testCase.expNewErr) {
706+
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
707+
testCase.description, testCase.expNewErr, err)
708+
}
709+
if err != nil {
710+
return
711+
}
712+
policy := p.(*staticPolicy)
711713
st := &mockState{
712714
assignments: testCase.stAssignments,
713715
defaultCPUSet: testCase.stDefaultCPUSet,
714716
}
715-
policy.Start(st)
717+
err = policy.Start(st)
718+
if !reflect.DeepEqual(err, testCase.expErr) {
719+
t.Errorf("StaticPolicy Start() error (%v). expected error: %v but got: %v",
720+
testCase.description, testCase.expErr, err)
721+
}
722+
if err != nil {
723+
return
724+
}
716725

717726
if !st.GetDefaultCPUSet().Equals(testCase.expCSet) {
718727
t.Errorf("State CPUSet is different than expected. Have %q wants: %q", st.GetDefaultCPUSet(),
@@ -769,7 +778,7 @@ func TestStaticPolicyAddWithResvList(t *testing.T) {
769778
}
770779

771780
for _, testCase := range testCases {
772-
policy := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager())
781+
policy, _ := NewStaticPolicy(testCase.topo, testCase.numReservedCPUs, testCase.reserved, topologymanager.NewFakeManager())
773782

774783
st := &mockState{
775784
assignments: testCase.stAssignments,

0 commit comments

Comments (0)