Skip to content

Commit 4f16ee5

Browse files
committed
pkg/cvo/metrics: Abandon child goroutines after shutdownContext expires
Apparently there's something in the HTTPS server goroutine that can hang up even if we've called Shutdown() on the server [1]. Defend against that with a safety valve to abandon stuck goroutines if shutdownContext expires. Also pivot to resultChannel and asyncResult, so we can get names for the collected channels (and more easily identify the stuck channels by elimination), following the pattern set by cc1921d (pkg/start: Release leader lease on graceful shutdown, 2020-08-03, #424). [1]: https://bugzilla.redhat.com/show_bug.cgi?id=1891143#c1
1 parent 964a46f commit 4f16ee5

File tree

1 file changed

+34
-19
lines changed

1 file changed

+34
-19
lines changed

pkg/cvo/metrics.go

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,11 @@ version for 'cluster', or empty for 'initial'.
100100
}
101101
}
102102

103+
type asyncResult struct {
104+
name string
105+
error error
106+
}
107+
103108
// RunMetrics launches an server bound to listenAddress serving
104109
// Prometheus metrics at /metrics over HTTP, and, if tlsConfig is
105110
// non-nil, also over HTTPS. Continues serving until runContext.Done()
@@ -123,50 +128,60 @@ func RunMetrics(runContext context.Context, shutdownContext context.Context, lis
123128
// HTTP, needed during upgrade, while still allowing TLS certs and end to end metrics protection.
124129
mux := cmux.New(tcpListener)
125130

126-
errorChannel := make(chan error, 1)
127-
errorChannelCount := 1
131+
resultChannel := make(chan asyncResult, 1)
132+
resultChannelCount := 1
128133

129134
go func() {
130135
// match HTTP first
131136
httpListener := mux.Match(cmux.HTTP1())
132137
klog.Infof("Metrics port listening for HTTP on %v", listenAddress)
133-
errorChannel <- server.Serve(httpListener)
138+
err := server.Serve(httpListener)
139+
resultChannel <- asyncResult{name: "HTTP server", error: err}
134140
}()
135141

136142
if tlsConfig != nil {
137-
errorChannelCount++
143+
resultChannelCount++
138144
go func() {
139145
tlsListener := tls.NewListener(mux.Match(cmux.Any()), tlsConfig)
140146
klog.Infof("Metrics port listening for HTTPS on %v", listenAddress)
141-
errorChannel <- server.Serve(tlsListener)
147+
err := server.Serve(tlsListener)
148+
resultChannel <- asyncResult{name: "HTTPS server", error: err}
142149
}()
143150
}
144151

145-
errorChannelCount++
152+
resultChannelCount++
146153
go func() {
147-
errorChannel <- mux.Serve()
154+
err := mux.Serve()
155+
resultChannel <- asyncResult{name: "TCP muxer", error: err}
148156
}()
149157

150158
shutdown := false
151159
var loopError error
152-
for errorChannelCount > 0 {
160+
for resultChannelCount > 0 {
153161
if shutdown {
154-
err := <-errorChannel
155-
errorChannelCount--
156-
if err != nil && err != http.ErrServerClosed && err != cmux.ErrListenerClosed {
157-
if loopError == nil {
158-
loopError = err
159-
} else if err != nil { // log the error we are discarding
160-
klog.Errorf("Failed to gracefully shut down metrics server: %s", err)
162+
select {
163+
case result := <-resultChannel:
164+
resultChannelCount--
165+
if result.error == nil {
166+
klog.Infof("Collected metrics %s goroutine.", result.name)
167+
} else {
168+
klog.Errorf("Collected metrics %s goroutine: %v", result.name, result.error)
169+
loopError = result.error
161170
}
171+
case <-shutdownContext.Done(): // out of time
172+
klog.Errorf("Abandoning %d uncollected metrics goroutines", resultChannelCount)
173+
return shutdownContext.Err()
162174
}
163175
} else {
164176
select {
165177
case <-runContext.Done(): // clean shutdown
166-
case err := <-errorChannel: // crashed before a shutdown was requested
167-
errorChannelCount--
168-
if err != nil && err != http.ErrServerClosed && err != cmux.ErrListenerClosed {
169-
loopError = err
178+
case result := <-resultChannel: // crashed before a shutdown was requested
179+
resultChannelCount--
180+
if result.error == nil {
181+
klog.Infof("Collected metrics %s goroutine.", result.name)
182+
} else {
183+
klog.Errorf("Collected metrics %s goroutine: %v", result.name, result.error)
184+
loopError = result.error
170185
}
171186
}
172187
shutdown = true

0 commit comments

Comments
 (0)