@@ -10,13 +10,13 @@ import (
10
10
"math/rand"
11
11
"os"
12
12
"os/signal"
13
- "sync"
14
13
"syscall"
15
14
"time"
16
15
17
16
"github.com/google/uuid"
18
17
v1 "k8s.io/api/core/v1"
19
18
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19
+ utilruntime "k8s.io/apimachinery/pkg/util/runtime"
20
20
"k8s.io/client-go/informers"
21
21
"k8s.io/client-go/kubernetes"
22
22
"k8s.io/client-go/kubernetes/scheme"
@@ -77,6 +77,11 @@ type Options struct {
77
77
ResyncInterval time.Duration
78
78
}
79
79
80
+ type asyncResult struct {
81
+ name string
82
+ error error
83
+ }
84
+
80
85
func defaultEnv (name , defaultValue string ) string {
81
86
env , ok := os .LookupEnv (name )
82
87
if ! ok {
@@ -101,7 +106,7 @@ func NewOptions() *Options {
101
106
}
102
107
}
103
108
104
- func (o * Options ) Run () error {
109
+ func (o * Options ) Run (ctx context. Context ) error {
105
110
if o .NodeName == "" {
106
111
return fmt .Errorf ("node-name is required" )
107
112
}
@@ -137,29 +142,6 @@ func (o *Options) Run() error {
137
142
return err
138
143
}
139
144
140
- // TODO: Kube 1.14 will contain a ReleaseOnCancel boolean on
141
- // LeaderElectionConfig that allows us to have the lock code
142
- // release the lease when this context is cancelled. At that
143
- // time we can remove our changes to OnStartedLeading.
144
- ctx , cancel := context .WithCancel (context .Background ())
145
- defer cancel ()
146
- ch := make (chan os.Signal , 1 )
147
- defer func () { signal .Stop (ch ) }()
148
- signal .Notify (ch , os .Interrupt , syscall .SIGTERM )
149
- go func () {
150
- sig := <- ch
151
- klog .Infof ("Shutting down due to %s" , sig )
152
- cancel ()
153
-
154
- // exit after 2s no matter what
155
- select {
156
- case <- time .After (5 * time .Second ):
157
- klog .Fatalf ("Exiting" )
158
- case <- ch :
159
- klog .Fatalf ("Received shutdown signal twice, exiting" )
160
- }
161
- }()
162
-
163
145
o .run (ctx , controllerCtx , lock )
164
146
return nil
165
147
}
@@ -186,13 +168,33 @@ func (o *Options) makeTLSConfig() (*tls.Config, error) {
186
168
}), nil
187
169
}
188
170
171
+ // run launches a number of goroutines to handle manifest application,
172
+ // metrics serving, etc. It continues operating until ctx.Done(),
173
+ // and then attempts a clean shutdown limited by an internal context
174
+ // with a two-minute cap. It returns after it successfully collects all
175
+ // launched goroutines.
189
176
func (o * Options ) run (ctx context.Context , controllerCtx * Context , lock * resourcelock.ConfigMapLock ) {
190
- runContext , runCancel := context .WithCancel (ctx )
177
+ runContext , runCancel := context .WithCancel (ctx ) // so we can cancel internally on errors or TERM
191
178
defer runCancel ()
192
- shutdownContext , shutdownCancel := context .WithCancel (ctx )
179
+ shutdownContext , shutdownCancel := context .WithCancel (context . Background ()) // extends beyond ctx
193
180
defer shutdownCancel ()
194
- errorChannel := make (chan error , 1 )
195
- errorChannelCount := 0
181
+ postMainContext , postMainCancel := context .WithCancel (context .Background ()) // extends beyond ctx
182
+ defer postMainCancel ()
183
+
184
+ ch := make (chan os.Signal , 1 )
185
+ defer func () { signal .Stop (ch ) }()
186
+ signal .Notify (ch , os .Interrupt , syscall .SIGTERM )
187
+ go func () {
188
+ defer utilruntime .HandleCrash ()
189
+ sig := <- ch
190
+ klog .Infof ("Shutting down due to %s" , sig )
191
+ runCancel ()
192
+ sig = <- ch
193
+ klog .Fatalf ("Received shutdown signal twice, exiting: %s" , sig )
194
+ }()
195
+
196
+ resultChannel := make (chan asyncResult , 1 )
197
+ resultChannelCount := 0
196
198
if o .ListenAddr != "" {
197
199
var tlsConfig * tls.Config
198
200
if o .ServingCertFile != "" || o .ServingKeyFile != "" {
@@ -202,85 +204,96 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
202
204
klog .Fatalf ("Failed to create TLS config: %v" , err )
203
205
}
204
206
}
205
- errorChannelCount ++
207
+ resultChannelCount ++
206
208
go func () {
207
- errorChannel <- cvo .RunMetrics (runContext , shutdownContext , o .ListenAddr , tlsConfig )
209
+ defer utilruntime .HandleCrash ()
210
+ err := cvo .RunMetrics (postMainContext , shutdownContext , o .ListenAddr , tlsConfig )
211
+ resultChannel <- asyncResult {name : "metrics server" , error : err }
208
212
}()
209
213
}
210
214
211
- exit := make (chan struct {})
212
- exitClose := sync.Once {}
213
-
214
- // TODO: when we switch to graceful lock shutdown, this can be
215
- // moved back inside RunOrDie
216
- // TODO: properly wire ctx here
217
- go leaderelection .RunOrDie (context .TODO (), leaderelection.LeaderElectionConfig {
218
- Lock : lock ,
219
- LeaseDuration : leaseDuration ,
220
- RenewDeadline : renewDeadline ,
221
- RetryPeriod : retryPeriod ,
222
- Callbacks : leaderelection.LeaderCallbacks {
223
- OnStartedLeading : func (localCtx context.Context ) {
224
- controllerCtx .Start (runContext )
225
- select {
226
- case <- runContext .Done ():
227
- // WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel
228
- // and client-go ContextCancelable, which allows us to block new API requests before
229
- // we step down. However, the CVO isn't that sensitive to races and can tolerate
230
- // brief overlap.
231
- klog .Infof ("Stepping down as leader" )
232
- // give the controllers some time to shut down
233
- time .Sleep (100 * time .Millisecond )
234
- // if we still hold the leader lease, clear the owner identity (other lease watchers
235
- // still have to wait for expiration) like the new ReleaseOnCancel code will do.
236
- if err := lock .Update (localCtx , resourcelock.LeaderElectionRecord {}); err == nil {
237
- // if we successfully clear the owner identity, we can safely delete the record
238
- if err := lock .Client .ConfigMaps (lock .ConfigMapMeta .Namespace ).Delete (localCtx , lock .ConfigMapMeta .Name , metav1.DeleteOptions {}); err != nil {
239
- klog .Warningf ("Unable to step down cleanly: %v" , err )
240
- }
215
+ informersDone := postMainContext .Done ()
216
+ // FIXME: would be nice if there was a way to collect these.
217
+ controllerCtx .CVInformerFactory .Start (informersDone )
218
+ controllerCtx .OpenshiftConfigInformerFactory .Start (informersDone )
219
+ controllerCtx .OpenshiftConfigManagedInformerFactory .Start (informersDone )
220
+ controllerCtx .InformerFactory .Start (informersDone )
221
+
222
+ resultChannelCount ++
223
+ go func () {
224
+ defer utilruntime .HandleCrash ()
225
+ leaderelection .RunOrDie (postMainContext , leaderelection.LeaderElectionConfig {
226
+ Lock : lock ,
227
+ ReleaseOnCancel : true ,
228
+ LeaseDuration : leaseDuration ,
229
+ RenewDeadline : renewDeadline ,
230
+ RetryPeriod : retryPeriod ,
231
+ Callbacks : leaderelection.LeaderCallbacks {
232
+ OnStartedLeading : func (_ context.Context ) { // no need for this passed-through postMainContext, because goroutines we launch inside will use runContext
233
+ resultChannelCount ++
234
+ go func () {
235
+ defer utilruntime .HandleCrash ()
236
+ err := controllerCtx .CVO .Run (runContext , 2 )
237
+ resultChannel <- asyncResult {name : "main operator" , error : err }
238
+ }()
239
+
240
+ if controllerCtx .AutoUpdate != nil {
241
+ resultChannelCount ++
242
+ go func () {
243
+ defer utilruntime .HandleCrash ()
244
+ err := controllerCtx .AutoUpdate .Run (runContext , 2 )
245
+ resultChannel <- asyncResult {name : "auto-update controller" , error : err }
246
+ }()
241
247
}
242
- klog .Infof ("Finished shutdown" )
243
- exitClose .Do (func () { close (exit ) })
244
- case <- localCtx .Done ():
245
- // we will exit in OnStoppedLeading
246
- }
247
- },
248
- OnStoppedLeading : func () {
249
- klog .Warning ("leaderelection lost" )
250
- exitClose .Do (func () { close (exit ) })
248
+ },
249
+ OnStoppedLeading : func () {
250
+ klog .Info ("Stopped leading; shutting down." )
251
+ runCancel ()
252
+ },
251
253
},
252
- },
253
- })
254
+ })
255
+ resultChannel <- asyncResult {name : "leader controller" , error : nil }
256
+ }()
254
257
255
- for errorChannelCount > 0 {
256
- var shutdownTimer * time.Timer
258
+ var shutdownTimer * time.Timer
259
+ for resultChannelCount > 0 {
260
+ klog .Infof ("Waiting on %d outstanding goroutines." , resultChannelCount )
257
261
if shutdownTimer == nil { // running
258
262
select {
259
263
case <- runContext .Done ():
264
+ klog .Info ("Run context completed; beginning two-minute graceful shutdown period." )
260
265
shutdownTimer = time .NewTimer (2 * time .Minute )
261
- case err := <- errorChannel :
262
- errorChannelCount --
263
- if err != nil {
264
- klog .Error (err )
266
+ case result := <- resultChannel :
267
+ resultChannelCount --
268
+ if result .error == nil {
269
+ klog .Infof ("Collected %s goroutine." , result .name )
270
+ } else {
271
+ klog .Errorf ("Collected %s goroutine: %v" , result .name , result .error )
265
272
runCancel () // this will cause shutdownTimer initialization in the next loop
266
273
}
274
+ if result .name == "main operator" {
275
+ postMainCancel ()
276
+ }
267
277
}
268
278
} else { // shutting down
269
279
select {
270
280
case <- shutdownTimer .C : // never triggers after the channel is stopped, although it would not matter much if it did because subsequent cancel calls do nothing.
271
281
shutdownCancel ()
272
282
shutdownTimer .Stop ()
273
- case err := <- errorChannel :
274
- errorChannelCount --
275
- if err != nil {
276
- klog .Error (err )
277
- runCancel ()
283
+ case result := <- resultChannel :
284
+ resultChannelCount --
285
+ if result .error == nil {
286
+ klog .Infof ("Collected %s goroutine." , result .name )
287
+ } else {
288
+ klog .Errorf ("Collected %s goroutine: %v" , result .name , result .error )
289
+ }
290
+ if result .name == "main operator" {
291
+ postMainCancel ()
278
292
}
279
293
}
280
294
}
281
295
}
282
-
283
- <- exit
296
+ klog .Info ("Finished collecting operator goroutines." )
284
297
}
285
298
286
299
// createResourceLock initializes the lock.
@@ -440,17 +453,3 @@ func (o *Options) NewControllerContext(cb *ClientBuilder) *Context {
440
453
}
441
454
return ctx
442
455
}
443
-
444
- // Start launches the controllers in the provided context and any supporting
445
- // infrastructure. When ch is closed the controllers will be shut down.
446
- func (c * Context ) Start (ctx context.Context ) {
447
- ch := ctx .Done ()
448
- go c .CVO .Run (ctx , 2 )
449
- if c .AutoUpdate != nil {
450
- go c .AutoUpdate .Run (ctx , 2 )
451
- }
452
- c .CVInformerFactory .Start (ch )
453
- c .OpenshiftConfigInformerFactory .Start (ch )
454
- c .OpenshiftConfigManagedInformerFactory .Start (ch )
455
- c .InformerFactory .Start (ch )
456
- }
0 commit comments