Skip to content

Commit b30aa0e

Browse files
committed
pkg/cvo/metrics: Graceful server shutdown
Somewhat like the example in [1]. This pushes the server management down into a new RunMetrics method, which we then run in its own goroutine. This is initial groundwork; I expect we will port more of our child goroutines to this framework in follow-up work. [1]: https://golang.org/pkg/net/http/#Server.Shutdown
1 parent 07e5809 commit b30aa0e

File tree

2 files changed

+130
-51
lines changed

2 files changed

+130
-51
lines changed

pkg/cvo/metrics.go

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
package cvo
22

33
import (
4+
"context"
5+
"crypto/tls"
6+
"net"
7+
"net/http"
48
"time"
59

10+
"github.com/cockroachdb/cmux"
611
"github.com/prometheus/client_golang/prometheus"
12+
"github.com/prometheus/client_golang/prometheus/promhttp"
713
corev1 "k8s.io/api/core/v1"
814
apierrors "k8s.io/apimachinery/pkg/api/errors"
915
"k8s.io/apimachinery/pkg/labels"
1016
"k8s.io/apimachinery/pkg/util/sets"
1117
"k8s.io/client-go/tools/cache"
18+
"k8s.io/klog"
1219

1320
configv1 "github.com/openshift/api/config/v1"
1421
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
@@ -86,6 +93,88 @@ version for 'cluster', or empty for 'initial'.
8693
}
8794
}
8895

96+
// RunMetrics launches an server bound to listenAddress serving
97+
// Prometheus metrics at /metrics over HTTP, and, if tlsConfig is
98+
// non-nil, also over HTTPS. Continues serving until runContext.Done()
99+
// and then attempts a clean shutdown limited by shutdownContext.Done().
100+
// Assumes runContext.Done() occurs before or simultaneously with
101+
// shutdownContext.Done().
102+
func RunMetrics(runContext context.Context, shutdownContext context.Context, listenAddress string, tlsConfig *tls.Config) error {
103+
handler := http.NewServeMux()
104+
handler.Handle("/metrics", promhttp.Handler())
105+
server := &http.Server{
106+
Handler: handler,
107+
}
108+
109+
tcpListener, err := net.Listen("tcp", listenAddress)
110+
if err != nil {
111+
return err
112+
}
113+
114+
// if a TLS connection was requested, set up a connection mux that will send TLS requests to
115+
// the TLS server but send HTTP requests to the HTTP server. Preserves the ability for legacy
116+
// HTTP, needed during upgrade, while still allowing TLS certs and end to end metrics protection.
117+
mux := cmux.New(tcpListener)
118+
119+
errorChannel := make(chan error, 1)
120+
errorChannelCount := 1
121+
122+
go func() {
123+
// match HTTP first
124+
httpListener := mux.Match(cmux.HTTP1())
125+
klog.Infof("Metrics port listening for HTTP on %v", listenAddress)
126+
errorChannel <- server.Serve(httpListener)
127+
}()
128+
129+
if tlsConfig != nil {
130+
errorChannelCount++
131+
go func() {
132+
tlsListener := tls.NewListener(mux.Match(cmux.Any()), tlsConfig)
133+
klog.Infof("Metrics port listening for HTTPS on %v", listenAddress)
134+
errorChannel <- server.Serve(tlsListener)
135+
}()
136+
}
137+
138+
errorChannelCount++
139+
go func() {
140+
errorChannel <- mux.Serve()
141+
}()
142+
143+
shutdown := false
144+
var loopError error
145+
for errorChannelCount > 0 {
146+
if shutdown {
147+
err := <-errorChannel
148+
errorChannelCount--
149+
if err != nil && err != http.ErrServerClosed && err != cmux.ErrListenerClosed {
150+
if loopError == nil {
151+
loopError = err
152+
} else if err != nil { // log the error we are discarding
153+
klog.Errorf("Failed to gracefully shut down metrics server: %s", err)
154+
}
155+
}
156+
} else {
157+
select {
158+
case <-runContext.Done(): // clean shutdown
159+
case err := <-errorChannel: // crashed before a shutdown was requested
160+
errorChannelCount--
161+
if err != nil && err != http.ErrServerClosed && err != cmux.ErrListenerClosed {
162+
loopError = err
163+
}
164+
}
165+
shutdown = true
166+
shutdownError := server.Shutdown(shutdownContext)
167+
if loopError == nil {
168+
loopError = shutdownError
169+
} else if shutdownError != nil { // log the error we are discarding
170+
klog.Errorf("Failed to gracefully shut down metrics server: %s", shutdownError)
171+
}
172+
}
173+
}
174+
175+
return loopError
176+
}
177+
89178
type conditionKey struct {
90179
Name string
91180
Type string

pkg/start/start.go

Lines changed: 41 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,13 @@ import (
88
"fmt"
99
"io/ioutil"
1010
"math/rand"
11-
"net"
12-
"net/http"
1311
"os"
1412
"os/signal"
1513
"sync"
1614
"syscall"
1715
"time"
1816

19-
"github.com/cockroachdb/cmux"
20-
2117
"github.com/google/uuid"
22-
"github.com/prometheus/client_golang/prometheus/promhttp"
2318
v1 "k8s.io/api/core/v1"
2419
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2520
"k8s.io/client-go/informers"
@@ -188,56 +183,23 @@ func (o *Options) makeTLSConfig() (*tls.Config, error) {
188183
}
189184

190185
func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) {
191-
// listen on metrics
186+
runContext, runCancel := context.WithCancel(ctx)
187+
shutdownContext, shutdownCancel := context.WithCancel(ctx)
188+
errorChannel := make(chan error, 1)
189+
errorChannelCount := 0
192190
if o.ListenAddr != "" {
193-
handler := http.NewServeMux()
194-
handler.Handle("/metrics", promhttp.Handler())
195-
tcpl, err := net.Listen("tcp", o.ListenAddr)
196-
if err != nil {
197-
klog.Fatalf("Listen error: %v", err)
198-
}
199-
200-
// if a TLS connection was requested, set up a connection mux that will send TLS requests to
201-
// the TLS server but send HTTP requests to the HTTP server. Preserves the ability for legacy
202-
// HTTP, needed during upgrade, while still allowing TLS certs and end to end metrics protection.
203-
m := cmux.New(tcpl)
204-
205-
// match HTTP first
206-
httpl := m.Match(cmux.HTTP1())
207-
go func() {
208-
s := &http.Server{
209-
Handler: handler,
210-
}
211-
if err := s.Serve(httpl); err != cmux.ErrListenerClosed {
212-
klog.Fatalf("HTTP serve error: %v", err)
213-
}
214-
}()
215-
191+
var tlsConfig *tls.Config
216192
if o.ServingCertFile != "" || o.ServingKeyFile != "" {
217-
tlsConfig, err := o.makeTLSConfig()
193+
var err error
194+
tlsConfig, err = o.makeTLSConfig()
218195
if err != nil {
219196
klog.Fatalf("Failed to create TLS config: %v", err)
220197
}
221-
222-
tlsListener := tls.NewListener(m.Match(cmux.Any()), tlsConfig)
223-
klog.Infof("Metrics port listening for HTTP and HTTPS on %v", o.ListenAddr)
224-
go func() {
225-
s := &http.Server{
226-
Handler: handler,
227-
}
228-
if err := s.Serve(tlsListener); err != cmux.ErrListenerClosed {
229-
klog.Fatalf("HTTPS serve error: %v", err)
230-
}
231-
}()
232-
233-
go func() {
234-
if err := m.Serve(); err != nil {
235-
klog.Errorf("CMUX serve error: %v", err)
236-
}
237-
}()
238-
} else {
239-
klog.Infof("Metrics port listening for HTTP on %v", o.ListenAddr)
240198
}
199+
errorChannelCount++
200+
go func() {
201+
errorChannel <- cvo.RunMetrics(runContext, shutdownContext, o.ListenAddr, tlsConfig)
202+
}()
241203
}
242204

243205
exit := make(chan struct{})
@@ -253,9 +215,9 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
253215
RetryPeriod: retryPeriod,
254216
Callbacks: leaderelection.LeaderCallbacks{
255217
OnStartedLeading: func(localCtx context.Context) {
256-
controllerCtx.Start(ctx)
218+
controllerCtx.Start(runContext)
257219
select {
258-
case <-ctx.Done():
220+
case <-runContext.Done():
259221
// WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel
260222
// and client-go ContextCancelable, which allows us to block new API requests before
261223
// we step down. However, the CVO isn't that sensitive to races and can tolerate
@@ -284,6 +246,34 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
284246
},
285247
})
286248

249+
for errorChannelCount > 0 {
250+
var shutdownTimer *time.Timer
251+
if shutdownTimer == nil { // running
252+
select {
253+
case <-runContext.Done():
254+
shutdownTimer = time.NewTimer(2 * time.Minute)
255+
case err := <-errorChannel:
256+
errorChannelCount--
257+
if err != nil {
258+
klog.Error(err)
259+
runCancel() // this will cause shutdownTimer initialization in the next loop
260+
}
261+
}
262+
} else { // shutting down
263+
select {
264+
case <-shutdownTimer.C: // never triggers after the channel is stopped, although it would not matter much if it did because subsequent cancel calls do nothing.
265+
shutdownCancel()
266+
shutdownTimer.Stop()
267+
case err := <-errorChannel:
268+
errorChannelCount--
269+
if err != nil {
270+
klog.Error(err)
271+
runCancel()
272+
}
273+
}
274+
}
275+
}
276+
287277
<-exit
288278
}
289279

0 commit comments

Comments
 (0)