Skip to content

Commit b4c1d9e

Browse files
authored
fix(trafficrouting): Do not block the switch of service selectors for single pod failures (argoproj#2441)
* fix(traficrouter): WIP on not setting weight if not available Signed-off-by: zachaller <[email protected]> * fix tests Signed-off-by: zachaller <[email protected]> * try bailing vs setting weight Signed-off-by: zachaller <[email protected]> * work with expirments that do not set any weights Signed-off-by: zachaller <[email protected]> * fix test by commenting out code Signed-off-by: zachaller <[email protected]> * lint Signed-off-by: zachaller <[email protected]> * simplify logic Signed-off-by: zachaller <[email protected]> * switch logic Signed-off-by: zachaller <[email protected]> * add more comments Signed-off-by: zachaller <[email protected]> * add more comments Signed-off-by: zachaller <[email protected]> * add more test Signed-off-by: zachaller <[email protected]> * refactor test Signed-off-by: zachaller <[email protected]> * refactor code to reduce duplication Signed-off-by: zachaller <[email protected]> * change comments a bit Signed-off-by: zachaller <[email protected]> * remove else Signed-off-by: zachaller <[email protected]> Signed-off-by: zachaller <[email protected]>
1 parent f4a8c55 commit b4c1d9e

File tree

7 files changed

+149
-13
lines changed

7 files changed

+149
-13
lines changed

rollout/service.go

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,6 @@ import (
44
"context"
55
"fmt"
66

7-
appsv1 "k8s.io/api/apps/v1"
8-
corev1 "k8s.io/api/core/v1"
9-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10-
patchtypes "k8s.io/apimachinery/pkg/types"
11-
"k8s.io/utils/pointer"
12-
137
"github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1"
148
"github.com/argoproj/argo-rollouts/rollout/trafficrouting"
159
"github.com/argoproj/argo-rollouts/utils/annotations"
@@ -21,6 +15,11 @@ import (
2115
replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
2216
rolloututils "github.com/argoproj/argo-rollouts/utils/rollout"
2317
serviceutil "github.com/argoproj/argo-rollouts/utils/service"
18+
appsv1 "k8s.io/api/apps/v1"
19+
corev1 "k8s.io/api/core/v1"
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
patchtypes "k8s.io/apimachinery/pkg/types"
22+
"k8s.io/utils/pointer"
2423
)
2524

2625
const (
@@ -266,7 +265,10 @@ func (c *rolloutContext) reconcileStableAndCanaryService() error {
266265
}
267266

268267
// ensureSVCTargets updates the service with the given name to point to the given ReplicaSet,
269-
// but only if that ReplicaSet has full availability.
268+
// but only if that ReplicaSet has proper availability. There is still an edge case with this function if
269+
// in the small window of time between a rollout being completed, and we try to update the service selector, we lose 100%
270+
// of the pods availability. We will not switch service selector but still go and reconcile the traffic router, setting the
271+
// stable weight to zero. This really only affects dynamic stable scale.
270272
func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet, checkRsAvailability bool) error {
271273
if rs == nil || svcName == "" {
272274
return nil
@@ -277,13 +279,31 @@ func (c *rolloutContext) ensureSVCTargets(svcName string, rs *appsv1.ReplicaSet,
277279
}
278280
currSelector := svc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
279281
desiredSelector := rs.Labels[v1alpha1.DefaultRolloutUniqueLabelKey]
282+
logCtx := c.log.WithField(logutil.ServiceKey, svc.Name)
283+
280284
if currSelector != desiredSelector {
281-
// ensure ReplicaSet is fully available, otherwise we will point the service to nothing or an underprovisioned ReplicaSet
282-
if checkRsAvailability && !replicasetutil.IsReplicaSetAvailable(rs) {
283-
logCtx := c.log.WithField(logutil.ServiceKey, svc.Name)
284-
logCtx.Infof("delaying service switch from %s to %s: ReplicaSet not fully available", currSelector, desiredSelector)
285+
if _, ok := svc.Annotations[v1alpha1.ManagedByRolloutsKey]; !ok {
286+
// This block will be entered only when adopting a service that already exists, because the current annotation
287+
// will be empty at that point. When we are adopting a service, we want to make sure that the replicaset is fully
288+
// available before we start routing traffic to it, so we do not overload it.
289+
// See PR: https://github.com/argoproj/argo-rollouts/pull/1777
290+
291+
// ensure ReplicaSet is fully available, otherwise we will point the service to nothing or an underprovisioned ReplicaSet
292+
if checkRsAvailability && !replicasetutil.IsReplicaSetAvailable(rs) {
293+
logCtx.Infof("delaying service switch from %s to %s: ReplicaSet not fully available", currSelector, desiredSelector)
294+
return nil
295+
}
296+
logCtx.Infof("adopting service %s", svc.Name)
297+
}
298+
299+
// When we are at the end of a rollout we generally will have enough capacity to handle the traffic, so we do not
300+
// need to check the full availability of the ReplicaSet. We do still want to make sure we have at least one pod
301+
// available, so we do not point the service to nothing, but losing a pod or two should be tolerable to still switch service selectors.
302+
if checkRsAvailability && !replicasetutil.IsReplicaSetPartiallyAvailable(rs) {
303+
logCtx.Infof("delaying service switch from %s to %s: ReplicaSet has zero availability", currSelector, desiredSelector)
285304
return nil
286305
}
306+
287307
err = c.switchServiceSelector(svc, desiredSelector, c.rollout)
288308
if err != nil {
289309
return err

rollout/service_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,22 @@ func TestDelayCanaryStableServiceLabelInjection(t *testing.T) {
787787
_, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
788788
assert.False(t, stableInjected)
789789
}
790+
{
791+
// ensure we don't update service because new/stable are both partially available on an adoption of service reconcile
792+
ctrl, _, _ := f.newController(noResyncPeriodFunc)
793+
roCtx, err := ctrl.newRolloutContext(ro1)
794+
assert.NoError(t, err)
795+
796+
roCtx.newRS = newReplicaSetWithStatus(ro1, 3, 1)
797+
roCtx.stableRS = newReplicaSetWithStatus(ro2, 3, 1)
798+
799+
err = roCtx.reconcileStableAndCanaryService()
800+
assert.NoError(t, err)
801+
_, canaryInjected := canarySvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
802+
assert.False(t, canaryInjected)
803+
_, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
804+
assert.False(t, stableInjected)
805+
}
790806
{
791807
// next ensure we do update service because new/stable are now available
792808
ctrl, _, _ := f.newController(noResyncPeriodFunc)
@@ -805,3 +821,58 @@ func TestDelayCanaryStableServiceLabelInjection(t *testing.T) {
805821
}
806822

807823
}
824+
825+
// TestDelayCanaryStableServiceDelayOnAdoptedService verifies allow partial readiness of pods when switching labels
826+
// on an adopted services, but that if there is zero readiness we will not switch
827+
func TestDelayCanaryStableServiceDelayOnAdoptedService(t *testing.T) {
828+
ro1 := newCanaryRollout("foo", 3, nil, nil, nil, intstr.FromInt(1), intstr.FromInt(1))
829+
ro1.Spec.Strategy.Canary.CanaryService = "canary"
830+
ro1.Spec.Strategy.Canary.StableService = "stable"
831+
//Setup services that are already adopted by rollouts
832+
stableSvc := newService("stable", 80, ro1.Spec.Selector.MatchLabels, ro1)
833+
ro2 := bumpVersion(ro1)
834+
canarySvc := newService("canary", 80, ro1.Spec.Selector.MatchLabels, ro2)
835+
836+
f := newFixture(t)
837+
defer f.Close()
838+
f.kubeobjects = append(f.kubeobjects, canarySvc, stableSvc)
839+
f.serviceLister = append(f.serviceLister, canarySvc, stableSvc)
840+
841+
t.Run("AdoptedService No Availability", func(t *testing.T) {
842+
// first ensure we don't update service because new/stable are both not available
843+
ctrl, _, _ := f.newController(noResyncPeriodFunc)
844+
roCtx, err := ctrl.newRolloutContext(ro1)
845+
assert.NoError(t, err)
846+
847+
roCtx.newRS = newReplicaSetWithStatus(ro1, 3, 0)
848+
roCtx.stableRS = newReplicaSetWithStatus(ro2, 3, 0)
849+
850+
err = roCtx.reconcileStableAndCanaryService()
851+
assert.NoError(t, err)
852+
canaryHash2, canaryInjected := canarySvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
853+
assert.False(t, canaryInjected)
854+
fmt.Println(canaryHash2)
855+
stableHash2, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
856+
assert.False(t, stableInjected)
857+
fmt.Println(stableHash2)
858+
})
859+
t.Run("AdoptedService Partial Availability", func(t *testing.T) {
860+
// ensure we do change selector on partially available replica sets
861+
ctrl, _, _ := f.newController(noResyncPeriodFunc)
862+
roCtx, err := ctrl.newRolloutContext(ro1)
863+
assert.NoError(t, err)
864+
865+
roCtx.newRS = newReplicaSetWithStatus(ro1, 3, 1)
866+
roCtx.stableRS = newReplicaSetWithStatus(ro2, 3, 2)
867+
868+
err = roCtx.reconcileStableAndCanaryService()
869+
assert.NoError(t, err)
870+
canaryHash2, canaryInjected := canarySvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
871+
assert.True(t, canaryInjected)
872+
fmt.Println(canaryHash2)
873+
stableHash2, stableInjected := stableSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]
874+
assert.True(t, stableInjected)
875+
fmt.Println(stableHash2)
876+
})
877+
878+
}

test/e2e/alb/rollout-alb-experiment-no-setweight.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ spec:
9393
servicePort: 80
9494
steps:
9595
- experiment:
96+
duration: 15s
9697
templates:
9798
- name: experiment-alb-canary
9899
specRef: canary

test/e2e/aws_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ func (s *AWSSuite) TestALBExperimentStepNoSetWeight() {
168168
When().
169169
PromoteRollout().
170170
WaitForRolloutStatus("Healthy").
171-
Sleep(1 * time.Second). // stable is currently set first, and then changes made to VirtualServices/DestinationRules
171+
Sleep(2 * time.Second). // stable is currently set first, and then changes made to VirtualServices/DestinationRules
172172
Then().
173173
Assert(assertWeights(s, "alb-rollout-canary", "alb-rollout-stable", 0, 100))
174174
}

test/e2e/functional_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ func (s *FunctionalSuite) TestRolloutPDBRestart() {
288288
s.Given().
289289
HealthyRollout(`
290290
---
291-
apiVersion: policy/v1beta1
291+
apiVersion: policy/v1
292292
kind: PodDisruptionBudget
293293
metadata:
294294
name: rollout-pdb-restart

utils/replicaset/replicaset.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,3 +646,8 @@ func IsReplicaSetAvailable(rs *appsv1.ReplicaSet) bool {
646646
availableReplicas := rs.Status.AvailableReplicas
647647
return replicas != nil && *replicas != 0 && availableReplicas != 0 && *replicas <= availableReplicas
648648
}
649+
650+
// IsReplicaSetPartiallyAvailable returns if a ReplicaSet is scaled up and has at least 1 pod available
651+
func IsReplicaSetPartiallyAvailable(rs *appsv1.ReplicaSet) bool {
652+
return rs.Status.AvailableReplicas > 0
653+
}

utils/replicaset/replicaset_test.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1336,3 +1336,42 @@ func TestIsReplicaSetAvailable(t *testing.T) {
13361336
assert.False(t, IsReplicaSetAvailable(&rs))
13371337
}
13381338
}
1339+
1340+
func TestIsReplicaSetPartiallyAvailable(t *testing.T) {
1341+
t.Run("No Availability", func(t *testing.T) {
1342+
rs := appsv1.ReplicaSet{
1343+
Spec: appsv1.ReplicaSetSpec{
1344+
Replicas: pointer.Int32Ptr(2),
1345+
},
1346+
Status: appsv1.ReplicaSetStatus{
1347+
ReadyReplicas: 0,
1348+
AvailableReplicas: 0,
1349+
},
1350+
}
1351+
assert.False(t, IsReplicaSetPartiallyAvailable(&rs))
1352+
})
1353+
t.Run("Partial Availability", func(t *testing.T) {
1354+
rs := appsv1.ReplicaSet{
1355+
Spec: appsv1.ReplicaSetSpec{
1356+
Replicas: pointer.Int32Ptr(2),
1357+
},
1358+
Status: appsv1.ReplicaSetStatus{
1359+
ReadyReplicas: 2,
1360+
AvailableReplicas: 1,
1361+
},
1362+
}
1363+
assert.True(t, IsReplicaSetPartiallyAvailable(&rs))
1364+
})
1365+
t.Run("Full Availability", func(t *testing.T) {
1366+
rs := appsv1.ReplicaSet{
1367+
Spec: appsv1.ReplicaSetSpec{
1368+
Replicas: pointer.Int32Ptr(2),
1369+
},
1370+
Status: appsv1.ReplicaSetStatus{
1371+
ReadyReplicas: 2,
1372+
AvailableReplicas: 2,
1373+
},
1374+
}
1375+
assert.True(t, IsReplicaSetPartiallyAvailable(&rs))
1376+
})
1377+
}

0 commit comments

Comments
 (0)