Skip to content

Commit 5afe58b

Browse files
authored
feat(conformance) add EPP unavailable fail-open test (#999)
* Add a test for the EPP becoming unavailable when extensionRef.failureMode is set to FailOpen. * Resolve minor comments. * Format. * Import format.
1 parent 64c62ad commit 5afe58b

File tree

4 files changed

+196
-1
lines changed

4 files changed

+196
-1
lines changed

conformance/resources/manifests/manifests.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ metadata:
110110
labels:
111111
app: secondary-inference-model-server
112112
spec:
113+
replicas: 3
113114
selector:
114115
matchLabels:
115116
app: secondary-inference-model-server
@@ -245,6 +246,7 @@ spec:
245246
targetPortNumber: 3000
246247
extensionRef:
247248
name: secondary-endpoint-picker-svc
249+
failureMode: FailOpen
248250
---
249251
# --- Secondary Conformance EPP service Definition ---
250252
apiVersion: v1
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package basic
18+
19+
import (
20+
"net/http"
21+
"testing"
22+
23+
"github.com/stretchr/testify/require"
24+
"k8s.io/apimachinery/pkg/types"
25+
"sigs.k8s.io/gateway-api/conformance/utils/suite"
26+
"sigs.k8s.io/gateway-api/pkg/features"
27+
28+
"sigs.k8s.io/gateway-api-inference-extension/conformance/tests"
29+
k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes"
30+
trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic"
31+
)
32+
33+
// init appends EppUnAvailableFailOpen to tests.ConformanceTests.
func init() {
34+
tests.ConformanceTests = append(tests.ConformanceTests, EppUnAvailableFailOpen)
35+
}
36+
37+
var EppUnAvailableFailOpen = suite.ConformanceTest{
38+
ShortName: "EppUnAvailableFailOpen",
39+
Description: "Inference gateway should send traffic to backends even when the EPP is unavailable (fail-open)",
40+
Manifests: []string{"tests/basic/epp_unavailable_fail_open.yaml"},
41+
Features: []features.FeatureName{
42+
features.FeatureName("SupportInferencePool"),
43+
features.SupportGateway,
44+
},
45+
Test: func(t *testing.T, s *suite.ConformanceTestSuite) {
46+
const (
47+
appBackendNamespace = "gateway-conformance-app-backend"
48+
infraNamespace = "gateway-conformance-infra"
49+
hostname = "secondary.example.com"
50+
path = "/failopen-pool-test"
51+
expectedPodReplicas = 3
52+
eppSelectionHeaderName = "test-epp-endpoint-selection"
53+
appPodBackendPrefix = "secondary-inference-model-server"
54+
requestBody = `{
55+
"model": "conformance-fake-model",
56+
"prompt": "Write as if you were a critic: San Francisco"
57+
}`
58+
)
59+
60+
httpRouteNN := types.NamespacedName{Name: "httproute-for-failopen-pool-gw", Namespace: appBackendNamespace}
61+
gatewayNN := types.NamespacedName{Name: "conformance-secondary-gateway", Namespace: infraNamespace}
62+
poolNN := types.NamespacedName{Name: "secondary-inference-pool", Namespace: appBackendNamespace}
63+
eppDeploymentNN := types.NamespacedName{Name: "secondary-app-endpoint-picker", Namespace: appBackendNamespace}
64+
backendPodLabels := map[string]string{"app": "secondary-inference-model-server"}
65+
66+
k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, httpRouteNN, gatewayNN)
67+
k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN)
68+
gwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, gatewayNN)
69+
70+
pods, err := k8sutils.GetPodsWithLabel(t, s.Client, appBackendNamespace, backendPodLabels)
71+
require.NoError(t, err, "Failed to get backend pods")
72+
require.Len(t, pods, expectedPodReplicas, "Expected to find %d backend pod, but found %d.", expectedPodReplicas, len(pods))
73+
74+
targetPodIP := pods[0].Status.PodIP
75+
t.Run("Phase 1: Verify baseline connectivity with EPP available", func(t *testing.T) {
76+
t.Log("Sending request to ensure the Gateway and EPP are working correctly...")
77+
trafficutils.MakeRequestWithRequestParamAndExpectSuccess(
78+
t,
79+
s.RoundTripper,
80+
s.TimeoutConfig,
81+
gwAddr,
82+
trafficutils.Request{
83+
Host: hostname,
84+
Path: path,
85+
Headers: map[string]string{eppSelectionHeaderName: targetPodIP},
86+
Method: http.MethodPost,
87+
Body: requestBody,
88+
Backend: pods[0].Name, // Make sure the request is from the targetPod when the EPP is alive.
89+
Namespace: appBackendNamespace,
90+
},
91+
)
92+
})
93+
94+
t.Run("Phase 2: Verify fail-open behavior after EPP becomes unavailable", func(t *testing.T) {
95+
t.Log("Simulating an EPP failure by deleting its deployment...")
96+
deleteErr := k8sutils.DeleteDeployment(t, s.Client, s.TimeoutConfig, eppDeploymentNN)
97+
require.NoError(t, deleteErr, "Failed to delete the EPP deployment")
98+
99+
t.Log("Sending request again, expecting success to verify fail-open...")
100+
trafficutils.MakeRequestWithRequestParamAndExpectSuccess(
101+
t,
102+
s.RoundTripper,
103+
s.TimeoutConfig,
104+
gwAddr,
105+
trafficutils.Request{
106+
Host: hostname,
107+
Path: path,
108+
Headers: map[string]string{eppSelectionHeaderName: targetPodIP},
109+
Method: http.MethodPost,
110+
Body: requestBody,
111+
Backend: appPodBackendPrefix, // Only checks the prefix since the EPP is not alive and the response can return from any Pod.
112+
Namespace: appBackendNamespace,
113+
},
114+
)
115+
})
116+
},
117+
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# --- InferenceModel Definition ---
2+
# TODO: remove inferenceModel dependency https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1002
3+
apiVersion: inference.networking.x-k8s.io/v1alpha2
4+
kind: InferenceModel
5+
metadata:
6+
name: conformance-fake-model-server
7+
namespace: gateway-conformance-app-backend
8+
spec:
9+
modelName: conformance-fake-model
10+
criticality: Critical # Mark it as critical to bypass the saturation check, since the model server is fake and doesn't have such metrics.
11+
poolRef:
12+
name: secondary-inference-pool
13+
---
14+
apiVersion: gateway.networking.k8s.io/v1
15+
kind: HTTPRoute
16+
metadata:
17+
name: httproute-for-failopen-pool-gw
18+
namespace: gateway-conformance-app-backend
19+
spec:
20+
parentRefs:
21+
- group: gateway.networking.k8s.io
22+
kind: Gateway
23+
name: conformance-secondary-gateway
24+
namespace: gateway-conformance-infra
25+
sectionName: http
26+
hostnames:
27+
- "secondary.example.com"
28+
rules:
29+
- backendRefs:
30+
- group: inference.networking.x-k8s.io
31+
kind: InferencePool
32+
name: secondary-inference-pool # Use secondary-inference-pool because it has failureMode set to FailOpen
33+
matches:
34+
- path:
35+
type: PathPrefix
36+
value: /failopen-pool-test

conformance/utils/kubernetes/helpers.go

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"time"
2727

2828
"github.com/stretchr/testify/require"
29+
appsv1 "k8s.io/api/apps/v1"
2930
corev1 "k8s.io/api/core/v1"
3031
apierrors "k8s.io/apimachinery/pkg/api/errors"
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -361,11 +362,50 @@ func GetPod(t *testing.T, c client.Client, namespace string, selector labels.Sel
361362
}
362363
return false, nil
363364
})
364-
365365
require.NoErrorf(t, waitErr, "timed out waiting for Pod with selector %s in namespace %s to be ready", selector.String(), namespace)
366366
require.NotEmpty(t, pods.Items, "expected at least one pod for selector %s in namespace %s, but found none", selector.String(), namespace)
367367

368368
pod := &pods.Items[0]
369369
t.Logf("Successfully found ready Pod %s with IP %s for selector %s", pod.Name, pod.Status.PodIP, selector.String())
370370
return pod
371371
}
372+
373+
// DeleteDeployment deletes the specified Deployment and waits until it is no longer
374+
// present in the cluster.
375+
func DeleteDeployment(t *testing.T, c client.Client, timeoutConfig gatewayapiconfig.TimeoutConfig, deploymentRef types.NamespacedName) error {
376+
t.Helper()
377+
378+
deploymentToDelete := &appsv1.Deployment{
379+
ObjectMeta: metav1.ObjectMeta{
380+
Name: deploymentRef.Name,
381+
Namespace: deploymentRef.Namespace,
382+
},
383+
}
384+
385+
t.Logf("Deleting Deployment %s/%s...", deploymentRef.Namespace, deploymentRef.Name)
386+
if err := c.Delete(context.Background(), deploymentToDelete); err != nil {
387+
// If the resource is already gone, we don't consider it an error.
388+
if !apierrors.IsNotFound(err) {
389+
return fmt.Errorf("failed to delete Deployment %s/%s: %w", deploymentRef.Namespace, deploymentRef.Name, err)
390+
}
391+
}
392+
393+
// Wait for the Deployment to be fully removed.
394+
waitErr := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, timeoutConfig.DeleteTimeout, true, func(ctx context.Context) (bool, error) {
395+
var dep appsv1.Deployment
396+
err := c.Get(ctx, deploymentRef, &dep)
397+
if apierrors.IsNotFound(err) {
398+
return true, nil
399+
}
400+
if err != nil {
401+
return false, fmt.Errorf("error waiting for Deployment %s/%s to be deleted: %w", deploymentRef.Namespace, deploymentRef.Name, err)
402+
}
403+
return false, nil
404+
})
405+
406+
if waitErr != nil {
407+
return fmt.Errorf("timed out waiting for Deployment %s/%s to be deleted: %w", deploymentRef.Namespace, deploymentRef.Name, waitErr)
408+
}
409+
t.Logf("Successfully deleted Deployment %s/%s", deploymentRef.Namespace, deploymentRef.Name)
410+
return nil
411+
}

0 commit comments

Comments
 (0)