@@ -192,6 +192,8 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
192
192
if m .policy .Name () == string (PolicyNone ) {
193
193
return
194
194
}
195
+ // Periodically call m.reconcileState() to continue to keep the CPU sets of
196
+ // all pods in sync with the guaranteed CPUs handed out among them.
195
197
go wait .Until (func () { m .reconcileState () }, m .reconcilePeriod , wait .NeverStop )
196
198
}
197
199
@@ -208,19 +210,24 @@ func (m *manager) AddContainer(p *v1.Pod, c *v1.Container, containerID string) e
208
210
}
209
211
}
210
212
}
213
+
214
+ // Call down into the policy to assign this container CPUs if required.
211
215
err := m .policyAddContainer (p , c , containerID )
212
216
if err != nil {
213
217
klog .Errorf ("[cpumanager] AddContainer error: %v" , err )
214
218
m .Unlock ()
215
219
return err
216
220
}
221
+
222
+ // Get the CPUs just assigned to the container (or fall back to the default
223
+ // CPUSet if none were assigned).
217
224
cpus := m .state .GetCPUSetOrDefault (string (p .UID ), c .Name )
218
225
m .Unlock ()
219
226
220
227
if ! cpus .IsEmpty () {
221
228
err = m .updateContainerCPUSet (containerID , cpus )
222
229
if err != nil {
223
- klog .Errorf ("[cpumanager] AddContainer error: %v" , err )
230
+ klog .Errorf ("[cpumanager] AddContainer error: error updating CPUSet for container (pod: %s, container: %s, container id: %s, err: %v)" , p . Name , c . Name , containerID , err )
224
231
m .Lock ()
225
232
err := m .policyRemoveContainerByID (containerID )
226
233
if err != nil {
@@ -376,7 +383,7 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
376
383
377
384
if cstatus .State .Waiting != nil ||
378
385
(cstatus .State .Waiting == nil && cstatus .State .Running == nil && cstatus .State .Terminated == nil ) {
379
- klog .Warningf ("[cpumanager] reconcileState: skipping container; container still in the waiting state (pod: %s, container: %s)" , pod .Name , container .Name )
386
+ klog .Warningf ("[cpumanager] reconcileState: skipping container; container still in the waiting state (pod: %s, container: %s, error: %v )" , pod .Name , container .Name , err )
380
387
failure = append (failure , reconciledContainer {pod .Name , container .Name , "" })
381
388
continue
382
389
}
@@ -398,6 +405,9 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
398
405
continue
399
406
}
400
407
408
+ // Once we make it here we know we have a running container.
409
+ // Idempotently add it to the containerMap in case it is missing.
410
+ // This can happen after a kubelet restart, for example.
401
411
m .containerMap .Add (string (pod .UID ), container .Name , containerID )
402
412
403
413
cset := m .state .GetCPUSetOrDefault (string (pod .UID ), container .Name )
0 commit comments