Skip to content

Commit 76184a9

Browse files
authored
Add nginx image version validation during agent connections (#3928)
Problem: During an NGF upgrade, the new version of the control plane will send a configuration to the old version of the nginx data plane, before the nginx data plane is updated to the new version. This can cause incompatibility issues for a brief amount of time, which could cause disruptions. Solution: Implement version validation by ensuring the pod image matches the image in the current deployment/ daemonset spec to prevent configuration from being sent to nginx data plane pods still running the previous image version during upgrades.
1 parent 3e21104 commit 76184a9

File tree

5 files changed

+185
-83
lines changed

5 files changed

+185
-83
lines changed

internal/controller/handler.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,13 @@ func (h *eventHandlerImpl) sendNginxConfig(ctx context.Context, logger logr.Logg
209209
panic("expected deployment, got nil")
210210
}
211211

212+
nginxImage, _ := provisioner.DetermineNginxImageName(
213+
gw.EffectiveNginxProxy,
214+
h.cfg.plus,
215+
h.cfg.gatewayPodConfig.Version,
216+
)
217+
deployment.SetImageVersion(nginxImage)
218+
212219
cfg := dataplane.BuildConfiguration(ctx, logger, gr, gw, h.cfg.serviceResolver, h.cfg.plus)
213220
depCtx, getErr := h.getDeploymentContext(ctx)
214221
if getErr != nil {

internal/controller/nginx/agent/command.go

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ func (cs *commandService) CreateConnection(
8686
podName := resource.GetContainerInfo().GetHostname()
8787
cs.logger.Info(fmt.Sprintf("Creating connection for nginx pod: %s", podName))
8888

89-
owner, err := cs.getPodOwner(podName)
89+
owner, _, err := cs.getPodOwner(podName)
9090
if err != nil {
9191
response := &pb.CreateConnectionResponse{
9292
Response: &pb.CommandResponse{
@@ -281,6 +281,17 @@ func (cs *commandService) setInitialConfig(
281281
deployment.FileLock.Lock()
282282
defer deployment.FileLock.Unlock()
283283

284+
_, pod, err := cs.getPodOwner(conn.PodName)
285+
if err != nil {
286+
cs.logAndSendErrorStatus(deployment, conn, err)
287+
288+
return grpcStatus.Error(codes.Internal, err.Error())
289+
}
290+
if err := cs.validatePodImageVersion(pod, deployment.imageVersion); err != nil {
291+
cs.logAndSendErrorStatus(deployment, conn, err)
292+
return grpcStatus.Errorf(codes.FailedPrecondition, "nginx image version validation failed: %s", err.Error())
293+
}
294+
284295
fileOverviews, configVersion := deployment.GetFileOverviews()
285296

286297
cs.logger.Info("Sending initial configuration to agent", "pod", conn.PodName, "configVersion", configVersion)
@@ -443,7 +454,7 @@ func buildPlusAPIRequest(action *pb.NGINXPlusAction, instanceID string) *pb.Mana
443454
}
444455
}
445456

446-
func (cs *commandService) getPodOwner(podName string) (types.NamespacedName, error) {
457+
func (cs *commandService) getPodOwner(podName string) (types.NamespacedName, *v1.Pod, error) {
447458
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
448459
defer cancel()
449460

@@ -452,30 +463,31 @@ func (cs *commandService) getPodOwner(podName string) (types.NamespacedName, err
452463
FieldSelector: fields.SelectorFromSet(fields.Set{"metadata.name": podName}),
453464
}
454465
if err := cs.k8sReader.List(ctx, &pods, listOpts); err != nil {
455-
return types.NamespacedName{}, fmt.Errorf("error listing pods: %w", err)
466+
return types.NamespacedName{}, nil, fmt.Errorf("error listing pods: %w", err)
456467
}
457468

458469
if len(pods.Items) == 0 {
459-
return types.NamespacedName{}, fmt.Errorf("no pods found with name %q", podName)
470+
return types.NamespacedName{}, nil, fmt.Errorf("no pods found with name %q", podName)
460471
}
461472

462473
if len(pods.Items) > 1 {
463-
return types.NamespacedName{}, fmt.Errorf("should only be one pod with name %q", podName)
474+
return types.NamespacedName{}, nil, fmt.Errorf("should only be one pod with name %q", podName)
464475
}
465-
pod := pods.Items[0]
476+
pod := &pods.Items[0]
466477

467478
podOwnerRefs := pod.GetOwnerReferences()
468479
if len(podOwnerRefs) != 1 {
469-
return types.NamespacedName{}, fmt.Errorf("expected one owner reference of the nginx Pod, got %d", len(podOwnerRefs))
480+
tooManyOwnersError := "expected one owner reference of the nginx Pod, got %d"
481+
return types.NamespacedName{}, nil, fmt.Errorf(tooManyOwnersError, len(podOwnerRefs))
470482
}
471483

472484
if podOwnerRefs[0].Kind != "ReplicaSet" && podOwnerRefs[0].Kind != "DaemonSet" {
473485
err := fmt.Errorf("expected pod owner reference to be ReplicaSet or DaemonSet, got %s", podOwnerRefs[0].Kind)
474-
return types.NamespacedName{}, err
486+
return types.NamespacedName{}, nil, err
475487
}
476488

477489
if podOwnerRefs[0].Kind == "DaemonSet" {
478-
return types.NamespacedName{Namespace: pod.Namespace, Name: podOwnerRefs[0].Name}, nil
490+
return types.NamespacedName{Namespace: pod.Namespace, Name: podOwnerRefs[0].Name}, pod, nil
479491
}
480492

481493
var replicaSet appsv1.ReplicaSet
@@ -497,16 +509,46 @@ func (cs *commandService) getPodOwner(podName string) (types.NamespacedName, err
497509
return true, nil
498510
},
499511
); err != nil {
500-
return types.NamespacedName{}, fmt.Errorf("failed to get nginx Pod's ReplicaSet: %w", replicaSetErr)
512+
return types.NamespacedName{}, nil, fmt.Errorf("failed to get nginx Pod's ReplicaSet: %w", replicaSetErr)
501513
}
502514

503515
replicaOwnerRefs := replicaSet.GetOwnerReferences()
504516
if len(replicaOwnerRefs) != 1 {
505517
err := fmt.Errorf("expected one owner reference of the nginx ReplicaSet, got %d", len(replicaOwnerRefs))
506-
return types.NamespacedName{}, err
518+
return types.NamespacedName{}, nil, err
519+
}
520+
521+
return types.NamespacedName{Namespace: pod.Namespace, Name: replicaOwnerRefs[0].Name}, pod, nil
522+
}
523+
524+
// validatePodImageVersion checks if the pod's nginx container image version matches the expected version
525+
// from its deployment. Returns an error if versions don't match.
526+
func (cs *commandService) validatePodImageVersion(
527+
pod *v1.Pod,
528+
expectedImage string,
529+
) error {
530+
var podNginxImage string
531+
532+
for _, container := range pod.Spec.Containers {
533+
if container.Name == "nginx" {
534+
podNginxImage = container.Image
535+
break
536+
}
537+
}
538+
if podNginxImage == "" {
539+
return fmt.Errorf("nginx container not found in pod %q", pod.Name)
540+
}
541+
542+
// Compare images
543+
if podNginxImage != expectedImage {
544+
return fmt.Errorf("nginx image version mismatch: pod has %q but expected %q", podNginxImage, expectedImage)
507545
}
508546

509-
return types.NamespacedName{Namespace: pod.Namespace, Name: replicaOwnerRefs[0].Name}, nil
547+
cs.logger.V(1).Info("Pod nginx image version validated successfully",
548+
"podName", pod.Name,
549+
"image", podNginxImage)
550+
551+
return nil
510552
}
511553

512554
// UpdateDataPlaneStatus is called by agent on startup and upon any change in agent metadata,

internal/controller/nginx/agent/command_test.go

Lines changed: 75 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,48 @@ func createGrpcContextWithCancel() (context.Context, context.CancelFunc) {
8585
}), cancel
8686
}
8787

88+
func getDefaultPodList() []runtime.Object {
89+
pod := &v1.PodList{
90+
Items: []v1.Pod{
91+
{
92+
ObjectMeta: metav1.ObjectMeta{
93+
Name: "nginx-pod",
94+
Namespace: "test",
95+
OwnerReferences: []metav1.OwnerReference{
96+
{
97+
Kind: "ReplicaSet",
98+
Name: "nginx-replicaset",
99+
},
100+
},
101+
},
102+
Spec: v1.PodSpec{
103+
Containers: []v1.Container{
104+
{
105+
Image: "nginx:v1.0.0",
106+
Name: "nginx",
107+
},
108+
},
109+
},
110+
},
111+
},
112+
}
113+
114+
replicaSet := &appsv1.ReplicaSet{
115+
ObjectMeta: metav1.ObjectMeta{
116+
Name: "nginx-replicaset",
117+
Namespace: "test",
118+
OwnerReferences: []metav1.OwnerReference{
119+
{
120+
Kind: "Deployment",
121+
Name: "nginx-deployment",
122+
},
123+
},
124+
},
125+
}
126+
127+
return []runtime.Object{pod, replicaSet}
128+
}
129+
88130
func TestCreateConnection(t *testing.T) {
89131
t.Parallel()
90132

@@ -165,40 +207,9 @@ func TestCreateConnection(t *testing.T) {
165207
connTracker := agentgrpcfakes.FakeConnectionsTracker{}
166208

167209
var objs []runtime.Object
168-
if test.errString == "" {
169-
pod := &v1.PodList{
170-
Items: []v1.Pod{
171-
{
172-
ObjectMeta: metav1.ObjectMeta{
173-
Name: "nginx-pod",
174-
Namespace: "test",
175-
OwnerReferences: []metav1.OwnerReference{
176-
{
177-
Kind: "ReplicaSet",
178-
Name: "nginx-replicaset",
179-
},
180-
},
181-
},
182-
},
183-
},
184-
}
185-
186-
replicaSet := &appsv1.ReplicaSet{
187-
ObjectMeta: metav1.ObjectMeta{
188-
Name: "nginx-replicaset",
189-
Namespace: "test",
190-
OwnerReferences: []metav1.OwnerReference{
191-
{
192-
Kind: "Deployment",
193-
Name: "nginx-deployment",
194-
},
195-
},
196-
},
197-
}
198-
199-
objs = []runtime.Object{pod, replicaSet}
210+
if test.errString != "error getting pod owner" {
211+
objs = getDefaultPodList()
200212
}
201-
202213
fakeClient, err := createFakeK8sClient(objs...)
203214
g.Expect(err).ToNot(HaveOccurred())
204215

@@ -298,10 +309,13 @@ func TestSubscribe(t *testing.T) {
298309
}
299310
connTracker.GetConnectionReturns(conn)
300311

312+
fakeClient, err := createFakeK8sClient(getDefaultPodList()...)
313+
g.Expect(err).ToNot(HaveOccurred())
314+
301315
store := NewDeploymentStore(&connTracker)
302316
cs := newCommandService(
303317
logr.Discard(),
304-
fake.NewFakeClient(),
318+
fakeClient,
305319
store,
306320
&connTracker,
307321
status.NewQueue(),
@@ -329,6 +343,7 @@ func TestSubscribe(t *testing.T) {
329343
},
330344
}
331345
deployment.SetFiles(files)
346+
deployment.SetImageVersion("nginx:v1.0.0")
332347

333348
initialAction := &pb.NGINXPlusAction{
334349
Action: &pb.NGINXPlusAction_UpdateHttpUpstreamServers{},
@@ -439,11 +454,14 @@ func TestSubscribe_Reset(t *testing.T) {
439454
}
440455
connTracker.GetConnectionReturns(conn)
441456

457+
fakeClient, err := createFakeK8sClient(getDefaultPodList()...)
458+
g.Expect(err).ToNot(HaveOccurred())
459+
442460
store := NewDeploymentStore(&connTracker)
443461
resetChan := make(chan struct{})
444462
cs := newCommandService(
445463
logr.Discard(),
446-
fake.NewFakeClient(),
464+
fakeClient,
447465
store,
448466
&connTracker,
449467
status.NewQueue(),
@@ -471,6 +489,7 @@ func TestSubscribe_Reset(t *testing.T) {
471489
},
472490
}
473491
deployment.SetFiles(files)
492+
deployment.SetImageVersion("nginx:v1.0.0")
474493

475494
ctx, cancel := createGrpcContextWithCancel()
476495
defer cancel()
@@ -590,9 +609,10 @@ func TestSetInitialConfig_Errors(t *testing.T) {
590609
t.Parallel()
591610

592611
tests := []struct {
593-
name string
594612
setup func(msgr *messengerfakes.FakeMessenger, deployment *Deployment)
613+
name string
595614
errString string
615+
podList []runtime.Object
596616
}{
597617
{
598618
name: "error sending initial config",
@@ -652,6 +672,13 @@ func TestSetInitialConfig_Errors(t *testing.T) {
652672
},
653673
errString: "api apply error",
654674
},
675+
{
676+
name: "error validating nginx version",
677+
setup: func(_ *messengerfakes.FakeMessenger, deployment *Deployment) {
678+
deployment.SetImageVersion("nginx:v2.0.0")
679+
},
680+
errString: "nginx image version mismatch: pod has \"nginx:v1.0.0\" but expected \"nginx:v2.0.0\"",
681+
},
655682
}
656683

657684
for _, test := range tests {
@@ -662,9 +689,17 @@ func TestSetInitialConfig_Errors(t *testing.T) {
662689
connTracker := agentgrpcfakes.FakeConnectionsTracker{}
663690
msgr := &messengerfakes.FakeMessenger{}
664691

692+
podList := test.podList
693+
if len(podList) == 0 {
694+
podList = getDefaultPodList()
695+
}
696+
697+
fakeClient, err := createFakeK8sClient(podList...)
698+
g.Expect(err).ToNot(HaveOccurred())
699+
665700
cs := newCommandService(
666701
logr.Discard(),
667-
fake.NewFakeClient(),
702+
fakeClient,
668703
NewDeploymentStore(&connTracker),
669704
&connTracker,
670705
status.NewQueue(),
@@ -678,12 +713,13 @@ func TestSetInitialConfig_Errors(t *testing.T) {
678713
}
679714

680715
deployment := newDeployment(&broadcastfakes.FakeBroadcaster{})
716+
deployment.SetImageVersion("nginx:v1.0.0")
681717

682718
if test.setup != nil {
683719
test.setup(msgr, deployment)
684720
}
685721

686-
err := cs.setInitialConfig(context.Background(), deployment, conn, msgr)
722+
err = cs.setInitialConfig(context.Background(), deployment, conn, msgr)
687723

688724
g.Expect(err).To(HaveOccurred())
689725
g.Expect(err.Error()).To(ContainSubstring(test.errString))
@@ -882,7 +918,7 @@ func TestGetPodOwner(t *testing.T) {
882918
nil,
883919
)
884920

885-
owner, err := cs.getPodOwner(test.podName)
921+
owner, _, err := cs.getPodOwner(test.podName)
886922

887923
if test.errString != "" {
888924
g.Expect(err).To(HaveOccurred())

internal/controller/nginx/agent/deployment.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ type Deployment struct {
4040

4141
broadcaster broadcast.Broadcaster
4242

43+
imageVersion string
44+
4345
configVersion string
4446
// error that is set if a ConfigApply call failed for a Pod. This is needed
4547
// because if subsequent upstream API calls are made within the same update event,
@@ -73,6 +75,14 @@ func (d *Deployment) GetBroadcaster() broadcast.Broadcaster {
7375
return d.broadcaster
7476
}
7577

78+
// SetImageVersion sets the deployment's image version.
79+
func (d *Deployment) SetImageVersion(imageVersion string) {
80+
d.FileLock.Lock()
81+
defer d.FileLock.Unlock()
82+
83+
d.imageVersion = imageVersion
84+
}
85+
7686
// SetLatestConfigError sets the latest config apply error for the deployment.
7787
func (d *Deployment) SetLatestConfigError(err error) {
7888
d.errLock.Lock()

0 commit comments

Comments
 (0)