Skip to content

Commit 29b45e3

Browse files
pleshakovsjberman
andauthored
Implement delayed termination to achieve zero downtime upgrades (#1159)
Problem: During an upgrade of NGF, external clients can experience downtime. Solution: - Introduce configurable delayed termination. - Add sleep subcommand to gateway binary - Add lifecycle paramaters to helm to both nginx-gateway and nginx containers. - Add terminationGracePeriodSeconds parameter to helm. - Add affinity parameter to helm (primary needed for testing to prevent pods running on the same node). - Rerun zero downtime non-functional tests. Testing: - Manual testing SOLVES #1155 Co-authored-by: Saylor Berman <[email protected]>
1 parent ee0c491 commit 29b45e3

File tree

16 files changed

+1622
-35
lines changed

16 files changed

+1622
-35
lines changed

cmd/gateway/commands.go

Lines changed: 81 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"errors"
55
"fmt"
66
"os"
7+
"time"
78

89
"github.com/spf13/cobra"
910
"go.uber.org/zap"
@@ -22,23 +23,11 @@ const (
2223
gatewayClassFlag = "gatewayclass"
2324
gatewayClassNameUsage = `The name of the GatewayClass resource. ` +
2425
`Every NGINX Gateway Fabric must have a unique corresponding GatewayClass resource.`
25-
gatewayCtrlNameFlag = "gateway-ctlr-name"
26-
gatewayCtrlNameUsageFmt = `The name of the Gateway controller. ` +
26+
gatewayCtlrNameFlag = "gateway-ctlr-name"
27+
gatewayCtlrNameUsageFmt = `The name of the Gateway controller. ` +
2728
`The controller name must be of the form: DOMAIN/PATH. The controller's domain is '%s'`
2829
)
2930

30-
var (
31-
// Backing values for common cli flags shared among all subcommands
32-
// The values are managed by the Root command.
33-
gatewayCtlrName = stringValidatingValue{
34-
validator: validateGatewayControllerName,
35-
}
36-
37-
gatewayClassName = stringValidatingValue{
38-
validator: validateResourceName,
39-
}
40-
)
41-
4231
func createRootCommand() *cobra.Command {
4332
rootCmd := &cobra.Command{
4433
Use: "gateway",
@@ -47,20 +36,6 @@ func createRootCommand() *cobra.Command {
4736
},
4837
}
4938

50-
rootCmd.PersistentFlags().Var(
51-
&gatewayCtlrName,
52-
gatewayCtrlNameFlag,
53-
fmt.Sprintf(gatewayCtrlNameUsageFmt, domain),
54-
)
55-
utilruntime.Must(rootCmd.MarkPersistentFlagRequired(gatewayCtrlNameFlag))
56-
57-
rootCmd.PersistentFlags().Var(
58-
&gatewayClassName,
59-
gatewayClassFlag,
60-
gatewayClassNameUsage,
61-
)
62-
utilruntime.Must(rootCmd.MarkPersistentFlagRequired(gatewayClassFlag))
63-
6439
return rootCmd
6540
}
6641

@@ -82,6 +57,14 @@ func createStaticModeCommand() *cobra.Command {
8257

8358
// flag values
8459
var (
60+
gatewayCtlrName = stringValidatingValue{
61+
validator: validateGatewayControllerName,
62+
}
63+
64+
gatewayClassName = stringValidatingValue{
65+
validator: validateResourceName,
66+
}
67+
8568
updateGCStatus bool
8669
gateway = namespacedNameValue{}
8770
configName = stringValidatingValue{
@@ -185,6 +168,20 @@ func createStaticModeCommand() *cobra.Command {
185168
},
186169
}
187170

171+
cmd.Flags().Var(
172+
&gatewayCtlrName,
173+
gatewayCtlrNameFlag,
174+
fmt.Sprintf(gatewayCtlrNameUsageFmt, domain),
175+
)
176+
utilruntime.Must(cmd.MarkFlagRequired(gatewayCtlrNameFlag))
177+
178+
cmd.Flags().Var(
179+
&gatewayClassName,
180+
gatewayClassFlag,
181+
gatewayClassNameUsage,
182+
)
183+
utilruntime.Must(cmd.MarkFlagRequired(gatewayClassFlag))
184+
188185
cmd.Flags().Var(
189186
&gateway,
190187
gatewayFlag,
@@ -271,7 +268,16 @@ func createStaticModeCommand() *cobra.Command {
271268
}
272269

273270
func createProvisionerModeCommand() *cobra.Command {
274-
return &cobra.Command{
271+
var (
272+
gatewayCtlrName = stringValidatingValue{
273+
validator: validateGatewayControllerName,
274+
}
275+
gatewayClassName = stringValidatingValue{
276+
validator: validateResourceName,
277+
}
278+
)
279+
280+
cmd := &cobra.Command{
275281
Use: "provisioner-mode",
276282
Short: "Provision a static-mode NGINX Gateway Fabric Deployment per Gateway resource",
277283
Hidden: true,
@@ -291,4 +297,50 @@ func createProvisionerModeCommand() *cobra.Command {
291297
})
292298
},
293299
}
300+
301+
cmd.Flags().Var(
302+
&gatewayCtlrName,
303+
gatewayCtlrNameFlag,
304+
fmt.Sprintf(gatewayCtlrNameUsageFmt, domain),
305+
)
306+
utilruntime.Must(cmd.MarkFlagRequired(gatewayCtlrNameFlag))
307+
308+
cmd.Flags().Var(
309+
&gatewayClassName,
310+
gatewayClassFlag,
311+
gatewayClassNameUsage,
312+
)
313+
utilruntime.Must(cmd.MarkFlagRequired(gatewayClassFlag))
314+
315+
return cmd
316+
}
317+
318+
// FIXME(pleshakov): Remove this command once NGF min supported Kubernetes version supports sleep action in
319+
// preStop hook.
320+
// nolint:lll
321+
// See https://github.com/kubernetes/enhancements/tree/4ec371d92dcd4f56a2ab18c8ba20bb85d8d20efe/keps/sig-node/3960-pod-lifecycle-sleep-action
322+
func createSleepCommand() *cobra.Command {
323+
// flag names
324+
const durationFlag = "duration"
325+
// flag values
326+
var duration time.Duration
327+
328+
cmd := &cobra.Command{
329+
Use: "sleep",
330+
Short: "Sleep for specified duration and exit",
331+
Run: func(cmd *cobra.Command, args []string) {
332+
// It is expected that this command is run from lifecycle hook.
333+
// Because logs from hooks are not visible in the container logs, we don't log here at all.
334+
time.Sleep(duration)
335+
},
336+
}
337+
338+
cmd.Flags().DurationVar(
339+
&duration,
340+
durationFlag,
341+
30*time.Second,
342+
"Set the duration of sleep. Must be parsable by https://pkg.go.dev/time#ParseDuration",
343+
)
344+
345+
return cmd
294346
}

cmd/gateway/commands_test.go

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,17 @@ func testFlag(t *testing.T, cmd *cobra.Command, test flagTestCase) {
3737
}
3838
}
3939

40-
func TestRootCmdFlagValidation(t *testing.T) {
40+
func TestRootCmd(t *testing.T) {
41+
testCase := flagTestCase{
42+
name: "no flags",
43+
args: nil,
44+
wantErr: false,
45+
}
46+
47+
testFlag(t, createRootCommand(), testCase)
48+
}
49+
50+
func TestCommonFlagsValidation(t *testing.T) {
4151
tests := []flagTestCase{
4252
{
4353
name: "valid flags",
@@ -103,9 +113,11 @@ func TestRootCmdFlagValidation(t *testing.T) {
103113
}
104114

105115
for _, test := range tests {
106-
t.Run(test.name, func(t *testing.T) {
107-
rootCmd := createRootCommand()
108-
testFlag(t, rootCmd, test)
116+
t.Run(test.name+"_static_mode", func(t *testing.T) {
117+
testFlag(t, createStaticModeCommand(), test)
118+
})
119+
t.Run(test.name+"_provisioner_mode", func(t *testing.T) {
120+
testFlag(t, createProvisionerModeCommand(), test)
109121
})
110122
}
111123
}
@@ -115,6 +127,8 @@ func TestStaticModeCmdFlagValidation(t *testing.T) {
115127
{
116128
name: "valid flags",
117129
args: []string{
130+
"--gateway-ctlr-name=gateway.nginx.org/nginx-gateway", // common and required flag
131+
"--gatewayclass=nginx", // common and required flag
118132
"--gateway=nginx-gateway/nginx",
119133
"--config=nginx-gateway-config",
120134
"--service=nginx-gateway",
@@ -130,8 +144,11 @@ func TestStaticModeCmdFlagValidation(t *testing.T) {
130144
wantErr: false,
131145
},
132146
{
133-
name: "valid flags, not set",
134-
args: nil,
147+
name: "valid flags, non-required not set",
148+
args: []string{
149+
"--gateway-ctlr-name=gateway.nginx.org/nginx-gateway", // common and required flag
150+
"--gatewayclass=nginx", // common and required flag,
151+
},
135152
wantErr: false,
136153
},
137154
{
@@ -280,10 +297,67 @@ func TestStaticModeCmdFlagValidation(t *testing.T) {
280297
},
281298
}
282299

300+
// common flags validation is tested separately
301+
283302
for _, test := range tests {
284303
t.Run(test.name, func(t *testing.T) {
285304
cmd := createStaticModeCommand()
286305
testFlag(t, cmd, test)
287306
})
288307
}
289308
}
309+
310+
func TestProvisionerModeCmdFlagValidation(t *testing.T) {
311+
testCase := flagTestCase{
312+
name: "valid flags",
313+
args: []string{
314+
"--gateway-ctlr-name=gateway.nginx.org/nginx-gateway", // common and required flag
315+
"--gatewayclass=nginx", // common and required flag
316+
},
317+
wantErr: false,
318+
}
319+
320+
// common flags validation is tested separately
321+
322+
testFlag(t, createProvisionerModeCommand(), testCase)
323+
}
324+
325+
func TestSleepCmdFlagValidation(t *testing.T) {
326+
tests := []flagTestCase{
327+
{
328+
name: "valid flags",
329+
args: []string{
330+
"--duration=1s",
331+
},
332+
wantErr: false,
333+
},
334+
{
335+
name: "omitted flags",
336+
args: nil,
337+
wantErr: false,
338+
},
339+
{
340+
name: "duration is set to empty string",
341+
args: []string{
342+
"--duration=",
343+
},
344+
wantErr: true,
345+
expectedErrPrefix: `invalid argument "" for "--duration" flag: time: invalid duration ""`,
346+
},
347+
{
348+
name: "duration is invalid",
349+
args: []string{
350+
"--duration=invalid",
351+
},
352+
wantErr: true,
353+
expectedErrPrefix: `invalid argument "invalid" for "--duration" flag: time: invalid duration "invalid"`,
354+
},
355+
}
356+
357+
for _, test := range tests {
358+
t.Run(test.name, func(t *testing.T) {
359+
cmd := createSleepCommand()
360+
testFlag(t, cmd, test)
361+
})
362+
}
363+
}

cmd/gateway/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ func main() {
1818
rootCmd.AddCommand(
1919
createStaticModeCommand(),
2020
createProvisionerModeCommand(),
21+
createSleepCommand(),
2122
)
2223

2324
if err := rootCmd.Execute(); err != nil {

conformance/provisioner/static-deployment.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ spec:
101101
mountPath: /var/cache/nginx
102102
- name: nginx-lib
103103
mountPath: /var/lib/nginx
104+
terminationGracePeriodSeconds: 30
104105
serviceAccountName: nginx-gateway
105106
shareProcessNamespace: true
106107
securityContext:

deploy/helm-chart/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ The following tables lists the configurable parameters of the NGINX Gateway Fabr
138138
| `nginxGateway.image.repository` | The repository for the NGINX Gateway Fabric image. | ghcr.io/nginxinc/nginx-gateway-fabric |
139139
| `nginxGateway.image.tag` | The tag for the NGINX Gateway Fabric image. | edge |
140140
| `nginxGateway.image.pullPolicy` | The `imagePullPolicy` for the NGINX Gateway Fabric image. | Always |
141+
| `nginxGateway.lifecycle` | The `lifecycle` of the nginx-gateway container. | {} |
141142
| `nginxGateway.gatewayClassName` | The name of the GatewayClass for the NGINX Gateway Fabric deployment. | nginx |
142143
| `nginxGateway.gatewayControllerName` | The name of the Gateway controller. The controller name must be of the form: DOMAIN/PATH. The controller's domain is gateway.nginx.org. | gateway.nginx.org/nginx-gateway-controller |
143144
| `nginxGateway.kind` | The kind of the NGINX Gateway Fabric installation - currently, only Deployment is supported. | deployment |
@@ -151,6 +152,9 @@ The following tables lists the configurable parameters of the NGINX Gateway Fabr
151152
| `nginx.image.repository` | The repository for the NGINX image. | ghcr.io/nginxinc/nginx-gateway-fabric/nginx |
152153
| `nginx.image.tag` | The tag for the NGINX image. | edge |
153154
| `nginx.image.pullPolicy` | The `imagePullPolicy` for the NGINX image. | Always |
155+
| `nginx.lifecycle` | The `lifecycle` of the nginx container. | {} |
156+
| `terminationGracePeriodSeconds` | The termination grace period of the NGINX Gateway Fabric pod. | 30 |
157+
| `affinity` | The `affinity` of the NGINX Gateway Fabric pod. | {} |
154158
| `serviceAccount.annotations` | The `annotations` for the ServiceAccount used by the NGINX Gateway Fabric deployment. | {} |
155159
| `serviceAccount.name` | Name of the ServiceAccount used by the NGINX Gateway Fabric deployment. | Autogenerated |
156160
| `service.create` | Creates a service to expose the NGINX Gateway Fabric pods. | true |

deploy/helm-chart/templates/deployment.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ spec:
6565
image: {{ .Values.nginxGateway.image.repository }}:{{ .Values.nginxGateway.image.tag | default .Chart.AppVersion }}
6666
imagePullPolicy: {{ .Values.nginxGateway.image.pullPolicy }}
6767
name: nginx-gateway
68+
{{- if .Values.nginxGateway.lifecycle }}
69+
lifecycle:
70+
{{- toYaml .Values.nginxGateway.lifecycle | nindent 10 }}
71+
{{- end }}
6872
ports:
6973
{{- if .Values.metrics.enable }}
7074
- name: metrics
@@ -100,6 +104,10 @@ spec:
100104
- image: {{ .Values.nginx.image.repository }}:{{ .Values.nginx.image.tag | default .Chart.AppVersion }}
101105
imagePullPolicy: {{ .Values.nginx.image.pullPolicy }}
102106
name: nginx
107+
{{- if .Values.nginx.lifecycle }}
108+
lifecycle:
109+
{{- toYaml .Values.nginx.lifecycle | nindent 10 }}
110+
{{- end }}
103111
ports:
104112
- containerPort: 80
105113
name: http
@@ -125,6 +133,11 @@ spec:
125133
mountPath: /var/cache/nginx
126134
- name: nginx-lib
127135
mountPath: /var/lib/nginx
136+
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
137+
{{- if .Values.affinity }}
138+
affinity:
139+
{{- toYaml .Values.affinity | nindent 8 }}
140+
{{- end }}
128141
serviceAccountName: {{ include "nginx-gateway.serviceAccountName" . }}
129142
shareProcessNamespace: true
130143
securityContext:

deploy/helm-chart/values.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,25 @@ nginxGateway:
4545
## Some environments may need this set to true in order for the control plane to successfully reload NGINX.
4646
allowPrivilegeEscalation: false
4747

48+
## The lifecycle of the nginx-gateway container.
49+
lifecycle: {}
50+
4851
nginx:
4952
## The NGINX image to use
5053
image:
5154
repository: ghcr.io/nginxinc/nginx-gateway-fabric/nginx
5255
tag: edge
5356
pullPolicy: Always
5457

58+
## The lifecycle of the nginx container.
59+
lifecycle: {}
60+
61+
## The termination grace period of the NGINX Gateway Fabric pod.
62+
terminationGracePeriodSeconds: 30
63+
64+
## The affinity of the NGINX Gateway Fabric pod.
65+
affinity: {}
66+
5567
serviceAccount:
5668
annotations: {}
5769
## The name of the service account of the NGINX Gateway Fabric pods. Used for RBAC.

deploy/manifests/nginx-gateway.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ spec:
215215
mountPath: /var/cache/nginx
216216
- name: nginx-lib
217217
mountPath: /var/lib/nginx
218+
terminationGracePeriodSeconds: 30
218219
serviceAccountName: nginx-gateway
219220
shareProcessNamespace: true
220221
securityContext:

0 commit comments

Comments
 (0)