Skip to content

Commit a630637

Browse files
authored
fix(conformance): Reduce flakiness by using service selector modification in EppUnAvailableFailOpen (#1265)
* refactor restructure the conformance tests. * refactor merge helper functions and strict the test cases. * refactor restructure the conformance tests. * Use modify service selector to mimic EPP unavailabl and add a cleanup function.
1 parent 71b8ada commit a630637

File tree

4 files changed

+76
-32
lines changed

4 files changed

+76
-32
lines changed

conformance/resources/resourcename.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ const (
3232
PrimaryModelServerDeploymentName = "primary-inference-model-server-deployment"
3333
SecondaryModelServerDeploymentName = "secondary-inference-model-server-deployment"
3434

35-
ModelServerPodReplicas = 3
35+
ModelServerPodReplicas = 3
36+
EndPointPickerPodReplicas = 1
3637
)
3738

3839
var (
@@ -44,4 +45,7 @@ var (
4445

4546
PrimaryEppDeploymentNN = types.NamespacedName{Name: "primary-app-endpoint-picker", Namespace: AppBackendNamespace}
4647
SecondaryEppDeploymentNN = types.NamespacedName{Name: "secondary-app-endpoint-picker", Namespace: AppBackendNamespace}
48+
49+
PrimaryEppServiceNN = types.NamespacedName{Name: "primary-endpoint-picker-svc", Namespace: AppBackendNamespace}
50+
SecondaryEppServiceNN = types.NamespacedName{Name: "secondary-endpoint-picker-svc", Namespace: AppBackendNamespace}
4751
)

conformance/tests/epp_unavailable_fail_open.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"sigs.k8s.io/gateway-api/pkg/features"
2727

2828
"sigs.k8s.io/gateway-api-inference-extension/conformance/resources"
29+
"sigs.k8s.io/gateway-api-inference-extension/conformance/utils/config"
2930
k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes"
3031
trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic"
3132
testfilter "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/test/filter"
@@ -86,9 +87,11 @@ var EppUnAvailableFailOpen = suite.ConformanceTest{
8687
})
8788

8889
t.Run("Phase 2: Verify fail-open behavior after EPP becomes unavailable", func(t *testing.T) {
89-
t.Log("Simulating an EPP failure by deleting its deployment...")
90-
deleteErr := k8sutils.DeleteDeployment(t, s.Client, s.TimeoutConfig, resources.SecondaryEppDeploymentNN)
91-
require.NoError(t, deleteErr, "Failed to delete the EPP deployment")
90+
t.Logf("Making EPP service %v unavailable...", resources.PrimaryEppServiceNN)
91+
timeconfig := config.DefaultInferenceExtensionTimeoutConfig()
92+
restore, err := k8sutils.MakeServiceUnavailable(t, s.Client, resources.PrimaryEppServiceNN, timeconfig.ServiceUpdateTimeout)
93+
t.Cleanup(restore)
94+
require.NoError(t, err, "Failed to make the EPP service %v unavailable", resources.PrimaryEppServiceNN)
9295

9396
t.Log("Sending request again, expecting success to verify fail-open...")
9497
trafficutils.MakeRequestAndExpectSuccess(

conformance/utils/config/timing.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ type InferenceExtensionTimeoutConfig struct {
3939

4040
// HTTPRouteConditionTimeout represents the maximum time to wait for an HTTPRoute to have a specific condition.
4141
HTTPRouteDeletionReconciliationTimeout time.Duration
42+
43+
// ServiceUpdateTimeout represents the maximum time to wait for a service to be updated.
44+
ServiceUpdateTimeout time.Duration
4245
}
4346

4447
// DefaultInferenceExtensionTimeoutConfig returns a new InferenceExtensionTimeoutConfig with default values.
@@ -54,5 +57,6 @@ func DefaultInferenceExtensionTimeoutConfig() InferenceExtensionTimeoutConfig {
5457
InferencePoolMustHaveConditionInterval: 10 * time.Second,
5558
GatewayObjectPollInterval: 5 * time.Second,
5659
HTTPRouteDeletionReconciliationTimeout: 5 * time.Second,
60+
ServiceUpdateTimeout: 10 * time.Second,
5761
}
5862
}

conformance/utils/kubernetes/helpers.go

Lines changed: 61 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ import (
2626
"time"
2727

2828
"github.com/stretchr/testify/require"
29-
appsv1 "k8s.io/api/apps/v1"
3029
corev1 "k8s.io/api/core/v1"
30+
discoveryv1 "k8s.io/api/discovery/v1"
3131
apierrors "k8s.io/apimachinery/pkg/api/errors"
3232
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3333
"k8s.io/apimachinery/pkg/types"
@@ -38,6 +38,7 @@ import (
3838
gatewayk8sutils "sigs.k8s.io/gateway-api/conformance/utils/kubernetes"
3939

4040
inferenceapi "sigs.k8s.io/gateway-api-inference-extension/api/v1"
41+
"sigs.k8s.io/gateway-api-inference-extension/conformance/resources"
4142
"sigs.k8s.io/gateway-api-inference-extension/conformance/utils/config"
4243
)
4344

@@ -328,42 +329,74 @@ func GetPodsWithLabel(t *testing.T, c client.Reader, namespace string, labels ma
328329
return pods.Items, waitErr
329330
}
330331

331-
// DeleteDeployment deletes the specified Deployment and waits until it is no longer
332-
// present in the cluster.
333-
func DeleteDeployment(t *testing.T, c client.Client, timeoutConfig gatewayapiconfig.TimeoutConfig, deploymentRef types.NamespacedName) error {
332+
// MakeServiceUnavailable modifies a Service's selector to make it temporarily unavailable.
333+
// It returns a cleanup function to restore the original selector and ensure the test state is clean.
334+
func MakeServiceUnavailable(t *testing.T, c client.Client, serviceRef types.NamespacedName, timeout time.Duration) (func(), error) {
334335
t.Helper()
335336

336-
deploymentToDelete := &appsv1.Deployment{
337-
ObjectMeta: metav1.ObjectMeta{
338-
Name: deploymentRef.Name,
339-
Namespace: deploymentRef.Namespace,
340-
},
337+
ctx := context.Background()
338+
svc := &corev1.Service{}
339+
340+
t.Logf("Making Service %s/%s unavailable by modifying its selector...", serviceRef.Namespace, serviceRef.Name)
341+
if err := c.Get(ctx, serviceRef, svc); err != nil {
342+
return nil, fmt.Errorf("failed to get Service %s/%s: %w", serviceRef.Namespace, serviceRef.Name, err)
343+
}
344+
originalSelector := svc.Spec.Selector
345+
svc.Spec.Selector = map[string]string{"app": "do-not-match-for-testing"}
346+
if err := c.Update(ctx, svc); err != nil {
347+
return nil, fmt.Errorf("failed to update selector for Service %s/%s: %w", serviceRef.Namespace, serviceRef.Name, err)
341348
}
342349

343-
t.Logf("Deleting Deployment %s/%s...", deploymentRef.Namespace, deploymentRef.Name)
344-
if err := c.Delete(context.Background(), deploymentToDelete); err != nil {
345-
// If the resource is already gone, we don't consider it an error.
346-
if !apierrors.IsNotFound(err) {
347-
return fmt.Errorf("failed to delete Deployment %s/%s: %w", deploymentRef.Namespace, deploymentRef.Name, err)
348-
}
350+
t.Logf("Waiting for EndpointSlices of Service %s/%s to become empty...", serviceRef.Namespace, serviceRef.Name)
351+
err := waitNumberOfEndpointsForService(ctx, c, serviceRef, 0, timeout)
352+
if err != nil {
353+
return nil, fmt.Errorf("timed out waiting for EndpointSlices of Service %s/%s to become empty: %w", serviceRef.Namespace, serviceRef.Name, err)
349354
}
355+
t.Logf("Successfully modified selector for Service %s/%s", serviceRef.Namespace, serviceRef.Name)
350356

351-
// Wait for the Deployment to be fully removed.
352-
waitErr := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, timeoutConfig.DeleteTimeout, true, func(ctx context.Context) (bool, error) {
353-
var dep appsv1.Deployment
354-
err := c.Get(ctx, deploymentRef, &dep)
355-
if apierrors.IsNotFound(err) {
356-
return true, nil
357+
cleanupFunc := func() {
358+
t.Helper()
359+
t.Logf("Restoring original selector for Service %s/%s...", serviceRef.Namespace, serviceRef.Name)
360+
361+
restorationCtx := context.Background()
362+
svcToRestore := &corev1.Service{}
363+
364+
if err := c.Get(restorationCtx, serviceRef, svcToRestore); err != nil {
365+
t.Fatalf("Cleanup failed: could not get Service %s/%s for restoration: %v", serviceRef.Namespace, serviceRef.Name, err)
357366
}
367+
368+
svcToRestore.Spec.Selector = originalSelector
369+
if err := c.Update(restorationCtx, svcToRestore); err != nil {
370+
t.Fatalf("Cleanup failed: could not restore original selector for Service %s/%s: %v", serviceRef.Namespace, serviceRef.Name, err)
371+
}
372+
373+
t.Logf("Waiting for EndpointSlices of Service %s/%s to be restored...", serviceRef.Namespace, serviceRef.Name)
374+
err := waitNumberOfEndpointsForService(restorationCtx, c, serviceRef, resources.EndPointPickerPodReplicas, timeout)
358375
if err != nil {
359-
return false, fmt.Errorf("error waiting for Deployment %s/%s to be deleted: %w", deploymentRef.Namespace, deploymentRef.Name, err)
376+
t.Fatalf("Cleanup failed: timed out waiting for EndpointSlices of Service %s/%s to be restored: %v", serviceRef.Namespace, serviceRef.Name, err)
377+
}
378+
t.Logf("Successfully restored selector for Service %s/%s", serviceRef.Namespace, serviceRef.Name)
379+
}
380+
return cleanupFunc, nil
381+
}
382+
383+
func waitNumberOfEndpointsForService(ctx context.Context, c client.Client, serviceRef types.NamespacedName, wantNum int, timeout time.Duration) error {
384+
err := wait.PollUntilContextTimeout(ctx, 1*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
385+
endpointSliceList := &discoveryv1.EndpointSliceList{}
386+
if err := c.List(ctx, endpointSliceList,
387+
client.InNamespace(serviceRef.Namespace),
388+
client.MatchingLabels{discoveryv1.LabelServiceName: serviceRef.Name},
389+
); err != nil {
390+
return false, fmt.Errorf("failed to list EndpointSlices for Service %s: %w", serviceRef.Name, err)
391+
}
392+
totalEndpoints := 0
393+
for _, slice := range endpointSliceList.Items {
394+
totalEndpoints += len(slice.Endpoints)
395+
}
396+
if totalEndpoints == wantNum {
397+
return true, nil
360398
}
361399
return false, nil
362400
})
363-
364-
if waitErr != nil {
365-
return fmt.Errorf("timed out waiting for Deployment %s/%s to be deleted: %w", deploymentRef.Namespace, deploymentRef.Name, waitErr)
366-
}
367-
t.Logf("Successfully deleted Deployment %s/%s", deploymentRef.Namespace, deploymentRef.Name)
368-
return nil
401+
return err
369402
}

0 commit comments

Comments
 (0)