Skip to content

Commit d22652d

Browse files
feat(aigw): consolidate admin server into a single port (envoyproxy#1236)
**Description** This consolidates the extproc admin server from two separate ports (metricsPort and healthPort) into a single adminPort (default 1064) serving both /metrics and /health endpoints. The `aigw run` and the new `aigw healthcheck` commands now take --admin-port flags, propagating this to extproc. Previously, extproc exposed metrics on port 1064 and health checks on port 1065. Now both are served from a single HTTP server on port 1064, simplifying configuration and Docker HEALTHCHECK setup. The Docker HEALTHCHECK is now functional, using the new healthcheck subcommand which polls the admin server's /health endpoint. This /health endpoint proxies to the extproc gRPC health check, returning HTTP 200 when SERVING. A new subcommand (`aigw healthcheck`) was needed as the image is distroless (has no curl etc). When aigw run starts Envoy, it polls for readiness using either the Envoy admin address (when configured in EnvoyProxy bootstrap) or by checking if the Gateway listener port accepts TCP connections. This ensures startup completes reliably with minimal Gateway configurations. Gateway validation now requires at least one listener to be configured. --------- Signed-off-by: Adrian Cole <[email protected]>
1 parent 8d3310e commit d22652d

File tree

23 files changed

+830
-415
lines changed

23 files changed

+830
-415
lines changed

Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,10 @@ COPY --from=envoy-downloader /tmp/envoy-gateway /tmp/envoy-gateway
3535
COPY ./out/${COMMAND_NAME}-${TARGETOS}-${TARGETARCH} /app
3636

3737
USER nonroot:nonroot
38+
39+
# The healthcheck subcommand performs an HTTP GET to localhost:1064/health for "aigw run".
40+
# NOTE: This is only for aigw in practice since this is ignored by Kubernetes.
41+
HEALTHCHECK --interval=10s --timeout=5s --start-period=5s --retries=3 \
42+
CMD ["/app", "healthcheck"]
43+
3844
ENTRYPOINT ["/app"]

cmd/aigw/docker-compose-otel.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ services:
6262
- OPENAI_API_KEY=unused
6363
ports:
6464
- "1975:1975" # OpenAI compatible endpoint at /v1
65-
- "1064:1064" # Prometheus endpoint at /metrics
65+
- "1064:1064" # Admin server: /metrics (Prometheus) and /health endpoints
6666
extra_hosts: # localhost:host-gateway trick doesn't work with aigw
6767
- "host.docker.internal:host-gateway"
6868
command: ["run"]

cmd/aigw/docker-compose.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ services:
3838
- OPENAI_API_KEY=unused
3939
ports:
4040
- "1975:1975" # OpenAI compatible endpoint at /v1
41-
- "1064:1064" # Prometheus endpoint at /metrics
41+
- "1064:1064" # Admin server: /metrics (Prometheus) and /health endpoints
4242
extra_hosts: # localhost:host-gateway trick doesn't work with aigw
4343
- "host.docker.internal:host-gateway"
4444
command: ["run"]

cmd/aigw/healthcheck.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright Envoy AI Gateway Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
// The full text of the Apache license is available in the LICENSE file at
4+
// the root of the repo.
5+
6+
package main
7+
8+
import (
9+
"context"
10+
"fmt"
11+
"io"
12+
"net/http"
13+
"time"
14+
)
15+
16+
// healthcheck performs an HTTP GET against the admin server's /health endpoint
// on localhost at the given port. It backs the `aigw healthcheck` subcommand
// that the Docker HEALTHCHECK invokes.
//
// On HTTP 200 the response body is written to stdout and nil is returned. Any
// other outcome (request construction failure, connection failure, non-200
// status, or failure to read the body) is returned as an error, leaving the
// caller to translate it into a process exit status. The second writer
// (stderr) is accepted for signature parity with other subcommands but unused.
func healthcheck(ctx context.Context, port int, stdout, _ io.Writer) error {
	url := fmt.Sprintf("http://localhost:%d/health", port)

	// Bound the whole exchange so an unresponsive admin server cannot stall
	// the probe indefinitely.
	client := &http.Client{
		Timeout: 5 * time.Second,
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}

	resp, err := client.Do(req)
	if err != nil {
		// Wrap the cause so operators can tell a refused connection from a
		// timeout or cancellation, and so callers can use errors.Is/As.
		return fmt.Errorf("failed to connect to admin server: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the message, so a read failure
		// here is deliberately ignored and the status alone is reported.
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("unhealthy: status %d, body: %s", resp.StatusCode, string(body))
	}

	// Echo the health response to stdout to aid debugging.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("failed to read response: %w", err)
	}

	// A failed write to stdout does not make the server unhealthy, so the
	// write error is intentionally discarded.
	_, _ = fmt.Fprintf(stdout, "%s", body)
	return nil
}

cmd/aigw/healthcheck_test.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright Envoy AI Gateway Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
// The full text of the Apache license is available in the LICENSE file at
4+
// the root of the repo.
5+
6+
package main
7+
8+
import (
9+
"bytes"
10+
"net/http"
11+
"net/http/httptest"
12+
"net/url"
13+
"strconv"
14+
"testing"
15+
16+
"github.com/stretchr/testify/require"
17+
)
18+
19+
func Test_healthcheck(t *testing.T) {
20+
tests := []struct {
21+
name string
22+
closeServer bool
23+
statusCode int
24+
respBody string
25+
expOut string
26+
expErr string
27+
}{
28+
{
29+
name: "success",
30+
statusCode: http.StatusOK,
31+
respBody: "OK",
32+
expOut: "OK",
33+
},
34+
{
35+
name: "unhealthy status",
36+
statusCode: http.StatusServiceUnavailable,
37+
respBody: "not ready",
38+
expErr: "unhealthy: status 503, body: not ready",
39+
},
40+
{
41+
name: "internal error",
42+
statusCode: http.StatusInternalServerError,
43+
respBody: "server error",
44+
expErr: "unhealthy: status 500, body: server error",
45+
},
46+
{
47+
name: "connection failure",
48+
closeServer: true,
49+
expErr: "failed to connect to admin server",
50+
},
51+
}
52+
for _, tt := range tests {
53+
t.Run(tt.name, func(t *testing.T) {
54+
s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
55+
w.WriteHeader(tt.statusCode)
56+
_, _ = w.Write([]byte(tt.respBody))
57+
}))
58+
t.Cleanup(s.Close)
59+
60+
u, err := url.Parse(s.URL)
61+
require.NoError(t, err)
62+
port, err := strconv.Atoi(u.Port())
63+
require.NoError(t, err)
64+
65+
if tt.closeServer {
66+
s.Close()
67+
}
68+
69+
stdout := &bytes.Buffer{}
70+
err = healthcheck(t.Context(), port, stdout, nil)
71+
72+
if tt.expErr != "" {
73+
require.Equal(t, tt.expErr, err.Error())
74+
require.Empty(t, stdout.String())
75+
} else {
76+
require.NoError(t, err)
77+
require.Equal(t, tt.expOut, stdout.String())
78+
}
79+
})
80+
}
81+
}

cmd/aigw/main.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ type (
2828
Translate cmdTranslate `cmd:"" help:"Translate yaml files containing AI Gateway resources to Envoy Gateway and Kubernetes resources. The translated resources are written to stdout."`
2929
// Run is the sub-command parsed by the `cmdRun` struct.
3030
Run cmdRun `cmd:"" help:"Run the AI Gateway locally for given configuration."`
31+
// Healthcheck is the sub-command to check if the aigw server is healthy.
32+
Healthcheck cmdHealthcheck `cmd:"" help:"Docker HEALTHCHECK command."`
3133
}
3234
// cmdTranslate corresponds to `aigw translate` command.
3335
cmdTranslate struct {
@@ -36,8 +38,13 @@ type (
3638
}
3739
// cmdRun corresponds to `aigw run` command.
3840
cmdRun struct {
39-
Debug bool `help:"Enable debug logging emitted to stderr."`
40-
Path string `arg:"" name:"path" optional:"" help:"Path to the AI Gateway configuration yaml file. Optional when at least OPENAI_API_KEY is set." type:"path"`
41+
Debug bool `help:"Enable debug logging emitted to stderr."`
42+
Path string `arg:"" name:"path" optional:"" help:"Path to the AI Gateway configuration yaml file. Optional when at least OPENAI_API_KEY is set." type:"path"`
43+
AdminPort int `help:"HTTP port for the admin server (serves /metrics and /health endpoints)." default:"1064"`
44+
}
45+
// cmdHealthcheck corresponds to `aigw healthcheck` command.
46+
cmdHealthcheck struct {
47+
AdminPort int `help:"HTTP port for the admin server (serves /metrics and /health endpoints)." default:"1064"`
4148
}
4249
)
4350

@@ -53,10 +60,11 @@ type (
5360
subCmdFn[T any] func(context.Context, T, io.Writer, io.Writer) error
5461
translateFn subCmdFn[cmdTranslate]
5562
runFn func(context.Context, cmdRun, runOpts, io.Writer, io.Writer) error
63+
healthcheckFn func(context.Context, int, io.Writer, io.Writer) error
5664
)
5765

5866
func main() {
59-
doMain(ctrl.SetupSignalHandler(), os.Stdout, os.Stderr, os.Args[1:], os.Exit, translate, run)
67+
doMain(ctrl.SetupSignalHandler(), os.Stdout, os.Stderr, os.Args[1:], os.Exit, translate, run, healthcheck)
6068
}
6169

6270
// doMain is the main entry point for the CLI. It parses the command line arguments and executes the appropriate command.
@@ -70,6 +78,7 @@ func main() {
7078
func doMain(ctx context.Context, stdout, stderr io.Writer, args []string, exitFn func(int),
7179
tf translateFn,
7280
rf runFn,
81+
hf healthcheckFn,
7382
) {
7483
var c cmd
7584
parser, err := kong.New(&c,
@@ -96,6 +105,11 @@ func doMain(ctx context.Context, stdout, stderr io.Writer, args []string, exitFn
96105
if err != nil {
97106
log.Fatalf("Error running: %v", err)
98107
}
108+
case "healthcheck":
109+
err = hf(ctx, c.Healthcheck.AdminPort, stdout, stderr)
110+
if err != nil {
111+
log.Fatalf("Health check failed: %v", err)
112+
}
99113
default:
100114
panic("unreachable")
101115
}

cmd/aigw/main_test.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ func Test_doMain(t *testing.T) {
2424
env map[string]string
2525
tf translateFn
2626
rf runFn
27+
hf healthcheckFn
2728
expOut string
2829
expPanicCode *int
2930
}{
@@ -48,6 +49,9 @@ Commands:
4849
run [<path>] [flags]
4950
Run the AI Gateway locally for given configuration.
5051
52+
healthcheck [flags]
53+
Docker HEALTHCHECK command.
54+
5155
Run "aigw <command> --help" for more information on a command.
5256
`,
5357
expPanicCode: ptr.To(0),
@@ -130,9 +134,11 @@ Arguments:
130134
least OPENAI_API_KEY is set.
131135
132136
Flags:
133-
-h, --help Show context-sensitive help.
137+
-h, --help Show context-sensitive help.
134138
135-
--debug Enable debug logging emitted to stderr.
139+
--debug Enable debug logging emitted to stderr.
140+
--admin-port=1064 HTTP port for the admin server (serves /metrics and
141+
/health endpoints).
136142
`,
137143
expPanicCode: ptr.To(0),
138144
},
@@ -155,10 +161,10 @@ Flags:
155161
out := &bytes.Buffer{}
156162
if tt.expPanicCode != nil {
157163
require.PanicsWithValue(t, *tt.expPanicCode, func() {
158-
doMain(t.Context(), out, os.Stderr, tt.args, func(code int) { panic(code) }, tt.tf, tt.rf)
164+
doMain(t.Context(), out, os.Stderr, tt.args, func(code int) { panic(code) }, tt.tf, tt.rf, tt.hf)
159165
})
160166
} else {
161-
doMain(t.Context(), out, os.Stderr, tt.args, nil, tt.tf, tt.rf)
167+
doMain(t.Context(), out, os.Stderr, tt.args, nil, tt.tf, tt.rf, tt.hf)
162168
}
163169
require.Equal(t, tt.expOut, out.String())
164170
})

0 commit comments

Comments
 (0)