Skip to content

Commit 7ca36bf

Browse files
authored
feat: add health check for epp cluster (#966)
* feat: add health check for epp cluster
  Signed-off-by: zhengkezhou1 <[email protected]>
* remove tls
  Signed-off-by: zhengkezhou1 <[email protected]>
* don't use tls
  Signed-off-by: zhengkezhou1 <[email protected]>
* health checking flag
  Signed-off-by: zhengkezhou1 <[email protected]>
* fix import
  Signed-off-by: zhengkezhou1 <[email protected]>
* add tls options
  Signed-off-by: zhengkezhou1 <[email protected]>
---------
Signed-off-by: zhengkezhou1 <[email protected]>
1 parent ed25ed3 commit 7ca36bf

File tree

4 files changed

+30
-3
lines changed

4 files changed

+30
-3
lines changed

cmd/epp/runner/runner.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@ var (
9393
logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity")
9494
secureServing = flag.Bool(
9595
"secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.")
96-
certPath = flag.String(
96+
healthChecking = flag.Bool("healthChecking", runserver.DefaultHealthChecking, "Enables health checking")
97+
certPath = flag.String(
9798
"certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+
9899
"are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+
99100
"then a self-signed certificate is used.")
@@ -229,6 +230,7 @@ func (r *Runner) Run(ctx context.Context) error {
229230
PoolNamespacedName: poolNamespacedName,
230231
Datastore: datastore,
231232
SecureServing: *secureServing,
233+
HealthChecking: *healthChecking,
232234
CertPath: *certPath,
233235
RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval,
234236
Director: director,

pkg/epp/server/runserver.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,15 @@ import (
2020
"context"
2121
"crypto/tls"
2222
"fmt"
23+
2324
"time"
2425

2526
extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
2627
"github.com/go-logr/logr"
2728
"google.golang.org/grpc"
2829
"google.golang.org/grpc/credentials"
30+
"google.golang.org/grpc/health"
31+
healthgrpc "google.golang.org/grpc/health/grpc_health_v1"
2932
"k8s.io/apimachinery/pkg/types"
3033
ctrl "sigs.k8s.io/controller-runtime"
3134
"sigs.k8s.io/controller-runtime/pkg/manager"
@@ -46,6 +49,7 @@ type ExtProcServerRunner struct {
4649
PoolNamespacedName types.NamespacedName
4750
Datastore datastore.Datastore
4851
SecureServing bool
52+
HealthChecking bool
4953
CertPath string
5054
RefreshPrometheusMetricsInterval time.Duration
5155
Director *requestcontrol.Director
@@ -66,6 +70,7 @@ const (
6670
DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval
6771
DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval
6872
DefaultSecureServing = true // default for --secureServing
73+
DefaultHealthChecking = false // default for --healthChecking
6974
)
7075

7176
// NewDefaultExtProcServerRunner creates a runner with default values.
@@ -77,6 +82,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner {
7782
DestinationEndpointHintMetadataNamespace: DefaultDestinationEndpointHintMetadataNamespace,
7883
PoolNamespacedName: types.NamespacedName{Name: DefaultPoolName, Namespace: DefaultPoolNamespace},
7984
SecureServing: DefaultSecureServing,
85+
HealthChecking: DefaultHealthChecking,
8086
RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval,
8187
// Dependencies can be assigned later.
8288
}
@@ -152,6 +158,16 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable {
152158
extProcServer,
153159
)
154160

161+
if r.HealthChecking {
162+
healthcheck := health.NewServer()
163+
healthgrpc.RegisterHealthServer(srv,
164+
healthcheck,
165+
)
166+
svcName := extProcPb.ExternalProcessor_ServiceDesc.ServiceName
167+
logger.Info("Setting ExternalProcessor service status to SERVING", "serviceName", svcName)
168+
healthcheck.SetServingStatus(svcName, healthgrpc.HealthCheckResponse_SERVING)
169+
}
170+
155171
// Forward to the gRPC runnable.
156172
return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx)
157173
}))

test/testdata/envoy.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,16 @@ data:
170170
max_pending_requests: 40000
171171
max_requests: 40000
172172
max_retries: 1024
173+
health_checks:
174+
- timeout: 2s
175+
interval: 10s
176+
unhealthy_threshold: 3
177+
healthy_threshold: 2
178+
reuse_connection: true
179+
grpc_health_check:
180+
service_name: "envoy.service.ext_proc.v3.ExternalProcessor"
181+
tls_options:
182+
alpn_protocols: ["h2"]
173183
# This ensures that envoy accepts untrusted certificates. We tried to explicitly
174184
# set TrustChainVerification to ACCEPT_UNTRUSTED, but that actually didn't work
175185
# and what worked is setting the common_tls_context to empty.
@@ -197,7 +207,6 @@ data:
197207
socket_address:
198208
address: vllm-llama3-8b-instruct-epp.$E2E_NS
199209
port_value: 9002
200-
health_status: HEALTHY
201210
load_balancing_weight: 1
202211
---
203212
apiVersion: apps/v1

test/testdata/inferencepool-e2e.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ rules:
109109
- subjectaccessreviews
110110
verbs:
111111
- create
112-
---
112+
---
113113
kind: ClusterRoleBinding
114114
apiVersion: rbac.authorization.k8s.io/v1
115115
metadata:

0 commit comments

Comments
 (0)