@@ -4,8 +4,10 @@ package checkpoint
44
55import (
66 "fmt"
7+ "os"
78 "time"
89
10+ "github.com/golang/glog"
911 "k8s.io/client-go/kubernetes"
1012 restclient "k8s.io/client-go/rest"
1113)
@@ -23,12 +25,14 @@ const (
2325 shouldCheckpoint = "true"
2426 podSourceFile = "file"
2527
26- defaultPollingFrequency = 3 * time .Second
27- defaultCheckpointTimeout = 1 * time .Minute
28+ defaultPollingFrequency = 5 * time .Second
29+ defaultCheckpointTimeout = 1 * time .Minute
30+ defaultCheckpointGracePeriod = 1 * time .Minute
2831)
2932
3033var (
31- lastCheckpoint time.Time
34+ lastCheckpoint time.Time
35+ checkpointGracePeriod = defaultCheckpointGracePeriod
3236)
3337
3438// Options defines the parameters that are required to start the checkpointer.
@@ -59,6 +63,7 @@ type checkpointer struct {
5963 kubelet * kubeletClient
6064 cri * remoteRuntimeService
6165 checkpointerPod CheckpointerPod
66+ checkpoints checkpoints
6267}
6368
6469// Run instantiates and starts a new checkpointer. Returns error if there was a problem creating
@@ -90,6 +95,11 @@ func Run(opts Options) error {
9095
9196// run is the main checkpointing loop.
9297func (c * checkpointer ) run () {
98+ // Make sure the inactive checkpoint path exists.
99+ if err := os .MkdirAll (inactiveCheckpointPath , 0700 ); err != nil {
100+ glog .Fatalf ("Could not create inactive checkpoint path: %v" , err )
101+ }
102+
93103 for {
94104 time .Sleep (defaultPollingFrequency )
95105
@@ -101,20 +111,24 @@ func (c *checkpointer) run() {
101111 localParentPods := c .kubelet .localParentPods ()
102112 localRunningPods := c .cri .localRunningPods ()
103113
104- c .createCheckpointsForValidParents (localParentPods )
105-
106114 // Try to get scheduled pods from the apiserver.
107115 // These will be used to GC checkpoints for parents no longer scheduled to this node.
108- // A return value of nil is assumed to be "could not contact apiserver"
109116 // TODO(aaron): only check this every 30 seconds or so
110- apiParentPods := c .getAPIParentPods (c .checkpointerPod .NodeName )
117+ apiAvailable , apiParentPods := c .getAPIParentPods (c .checkpointerPod .NodeName )
111118
112119 // Get on disk copies of (in)active checkpoints
113120 // TODO(aaron): Loading from disk each time could be racy, but it is much easier than trying to keep in-memory state in sync.
114121 activeCheckpoints := getFileCheckpoints (activeCheckpointPath )
115122 inactiveCheckpoints := getFileCheckpoints (inactiveCheckpointPath )
116123
117- start , stop , remove := process (localRunningPods , localParentPods , apiParentPods , activeCheckpoints , inactiveCheckpoints , c .checkpointerPod )
124+ // Update checkpoints using the latest information from the APIs.
125+ c .checkpoints .update (localRunningPods , localParentPods , apiParentPods , activeCheckpoints , inactiveCheckpoints , c .checkpointerPod )
126+
127+ // Update on-disk manifests based on updated checkpoint state.
128+ c .createCheckpointsForValidParents ()
129+
130+ // Update checkpoint states and determine which checkpoints to start, stop, or remove.
131+ start , stop , remove := c .checkpoints .process (time .Now (), apiAvailable , localRunningPods , localParentPods , apiParentPods )
118132
119133 // Handle removals last because we may still have some work to do
120134 // before removing the checkpointer itself.
0 commit comments