Skip to content

Commit 693d13c

Browse files
committed
Improve shutdown logic: wait until no requests are made
Pods in Kubernetes endpoints are expected to shut down 'gracefully' after receiving SIGTERM: we should keep accepting new connections for a while. This is because Kubernetes updates Service endpoints and sends SIGTERM to pods *in parallel*, so traffic may still be routed to a pod after it has received SIGTERM. See kubernetes/kubernetes#106476 for more detail.
1 parent 68d381e commit 693d13c

File tree

3 files changed

+151
-1
lines changed

3 files changed

+151
-1
lines changed

internal/ingress/controller/nginx.go

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"errors"
2424
"fmt"
2525
"io/fs"
26+
"k8s.io/ingress-nginx/internal/ingress/metric/collectors"
2627
"net"
2728
"net/http"
2829
"os"
@@ -377,6 +378,63 @@ func (n *NGINXController) Start() {
377378
}
378379
}
379380

381+
// stopWait waits until no more connections are made to nginx.
382+
//
383+
// This waits until all of following conditions are met:
384+
// - No more requests are made to nginx for the last 5 seconds.
385+
// - 'shutdown-grace-period' seconds have passed after calling this method.
386+
//
387+
// Pods in Kubernetes endpoints are expected to shut-down 'gracefully' after receiving SIGTERM -
388+
// we should keep accepting new connections for a while. This is because Kubernetes updates Service endpoints
389+
// and sends SIGTERM to pods *in parallel*.
390+
// If we don't see new requests for 5 seconds, then we assume that this pod was removed from the upstream endpoints
391+
// (AWS ALB endpoints for example), and proceed with shutdown.
392+
//
393+
// See https://github.com/kubernetes/kubernetes/issues/106476 for more detail on this issue.
394+
func (n *NGINXController) stopWait() {
395+
const checkFrequency = time.Second
396+
const waitUntilNoConnectionsFor = int((5 * time.Second) / checkFrequency)
397+
waitAtLeastUntil := time.Now().Add(time.Duration(n.cfg.ShutdownGracePeriod) * time.Second)
398+
399+
var scraper collectors.NginxStatusScraper
400+
lastRequests := 0
401+
noChangeTimes := 0
402+
403+
for ; ; time.Sleep(checkFrequency) {
404+
st, err := scraper.Scrape()
405+
if err != nil {
406+
klog.Warningf("failed to scrape nginx status: %v", err)
407+
noChangeTimes = 0
408+
continue
409+
}
410+
411+
diff := st.Requests - lastRequests
412+
// We assume that there were no client requests to nginx, if and only if
413+
// there were 0 to 2 increase in handled requests from the last scrape.
414+
// 1 is to account for our own stub_status request from this method,
415+
// and the other 1 is to account for the readinessProbe.
416+
// Note that readinessProbe DO happen even when the pod is terminating.
417+
// See: https://github.com/kubernetes/kubernetes/issues/122824#issuecomment-1899224434
418+
noChange := 0 <= diff && diff <= 2
419+
if noChange {
420+
noChangeTimes++
421+
if noChangeTimes >= waitUntilNoConnectionsFor {
422+
// Safe to proceed shutdown, we are seeing no more client request.
423+
break
424+
}
425+
} else {
426+
noChangeTimes = 0
427+
}
428+
lastRequests = st.Requests
429+
}
430+
431+
// Wait at least for the configured duration, if any
432+
delay := waitAtLeastUntil.Sub(time.Now())
433+
if delay > 0 {
434+
time.Sleep(delay)
435+
}
436+
}
437+
380438
// Stop gracefully stops the NGINX master process.
381439
func (n *NGINXController) Stop() error {
382440
n.isShuttingDown = true
@@ -388,7 +446,8 @@ func (n *NGINXController) Stop() error {
388446
return fmt.Errorf("shutdown already in progress")
389447
}
390448

391-
time.Sleep(time.Duration(n.cfg.ShutdownGracePeriod) * time.Second)
449+
klog.InfoS("Graceful shutdown - waiting until no more requests are made")
450+
n.stopWait()
392451

393452
klog.InfoS("Shutting down controller queues")
394453
close(n.stopCh)

test/e2e/framework/deployment.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,10 @@ func (f *Framework) ScaleDeploymentToZero(name string) {
624624
assert.Nil(ginkgo.GinkgoT(), err, "getting deployment")
625625
assert.NotNil(ginkgo.GinkgoT(), d, "expected a deployment but none returned")
626626

627+
err = waitForPodsDeleted(f.KubeClientSet, 2*time.Minute, f.Namespace, &metav1.ListOptions{
628+
LabelSelector: labelSelectorToString(d.Spec.Selector.MatchLabels),
629+
})
630+
assert.Nil(ginkgo.GinkgoT(), err, "waiting for no pods")
627631
err = WaitForEndpoints(f.KubeClientSet, DefaultTimeout, name, f.Namespace, 0)
628632
assert.Nil(ginkgo.GinkgoT(), err, "waiting for no endpoints")
629633
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
Copyright 2020 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package gracefulshutdown
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"github.com/onsi/ginkgo/v2"
23+
"github.com/stretchr/testify/assert"
24+
appsv1 "k8s.io/api/apps/v1"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/ingress-nginx/test/e2e/framework"
27+
"net/http"
28+
"strings"
29+
"time"
30+
)
31+
32+
var _ = framework.IngressNginxDescribe("[Shutdown] Asynchronous shutdown", func() {
33+
f := framework.NewDefaultFramework("k8s-async-shutdown", func(f *framework.Framework) {
34+
f.Namespace = "k8s-async-shutdown"
35+
})
36+
37+
host := "async-shutdown"
38+
39+
ginkgo.BeforeEach(func() {
40+
f.NewSlowEchoDeployment()
41+
})
42+
43+
ginkgo.It("should not shut down while still receiving traffic", func() {
44+
defer ginkgo.GinkgoRecover()
45+
46+
err := f.UpdateIngressControllerDeployment(func(deployment *appsv1.Deployment) error {
47+
// Note: e2e's default terminationGracePeriodSeconds is 1 for some reason, so extend it
48+
grace := int64(300)
49+
deployment.Spec.Template.Spec.TerminationGracePeriodSeconds = &grace
50+
_, err := f.KubeClientSet.AppsV1().Deployments(f.Namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{})
51+
return err
52+
})
53+
assert.Nil(ginkgo.GinkgoT(), err, "updating ingress controller deployment")
54+
55+
f.EnsureIngress(framework.NewSingleIngress(host, "/", host, f.Namespace, framework.SlowEchoService, 80, nil))
56+
57+
f.WaitForNginxServer(host,
58+
func(server string) bool {
59+
return strings.Contains(server, "server_name "+host)
60+
})
61+
62+
// We need to get pod IP first because after the pod becomes terminating,
63+
// it is removed from Service endpoints, and becomes unable to be discovered by "f.HTTPTestClient()".
64+
ip := f.GetNginxPodIP()
65+
66+
// Assume that the upstream takes 30 seconds to update its endpoints,
67+
// therefore we are still receiving traffic while shutting down
68+
go func() {
69+
defer ginkgo.GinkgoRecover()
70+
for i := 0; i < 120; i++ {
71+
f.HTTPDumbTestClient().
72+
GET("/").
73+
WithURL(fmt.Sprintf("http://%s/", ip)).
74+
WithHeader("Host", host).
75+
Expect().
76+
Status(http.StatusOK)
77+
78+
framework.Sleep(250 * time.Millisecond)
79+
}
80+
}()
81+
82+
start := time.Now()
83+
f.ScaleDeploymentToZero("nginx-ingress-controller")
84+
assert.GreaterOrEqualf(ginkgo.GinkgoT(), int(time.Since(start).Seconds()), 35,
85+
"should take more than 30 + 5 seconds for graceful shutdown")
86+
})
87+
})

0 commit comments

Comments
 (0)