Skip to content

Commit 2044ddc

Browse files
authored
cli: fail and exit if ext-proc fails to start (envoyproxy#1134)
**Description**

When using the `aigw run` command, if the extproc fails to start, the command does not return and hangs forever without giving any feedback to the user unless the `--debug` flag is used.

This PR captures the extproc start errors and terminates the CLI process to give proper feedback to the user, printing the error and exiting with an error code:

```
$ ./out/aigw-darwin-arm64 run
looking up the latest patch for Envoy version 1.35
1.35.0 is already downloaded
starting: /tmp/envoy-gateway/versions/1.35.0/bin/envoy in run directory /tmp/envoy-gateway/runs/1756378634894657000
[2025-08-28 12:57:15.197][3950822][warning][config] [source/server/options_impl_platform_default.cc:9] CPU number provided by HW thread count (instead of cpuset).
2025/08/28 12:57:18 Error running: external processor run error: failed to start config watcher: failed to load initial config: failed to load config: cannot create backend auth handler: cannot retrieve AWS credentials: failed to refresh cached credentials, no EC2 IMDS role found, operation error ec2imds: GetMetadata, exceeded maximum number of attempts, 3, request send failed, Get "http://169.254.169.254/latest/meta-data/iam/security-credentials/": dial tcp 169.254.169.254:80: connect: host is down
$ echo $?
1
```

**Related Issues/PRs (if applicable)**

N/A

**Special notes for reviewers (if applicable)**

N/A

---------

Signed-off-by: Ignasi Barrera <[email protected]>
1 parent 9b1d867 commit 2044ddc

File tree

4 files changed

+88
-40
lines changed

4 files changed

+88
-40
lines changed

cmd/aigw/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ type (
4444
type (
4545
subCmdFn[T any] func(context.Context, T, io.Writer, io.Writer) error
4646
translateFn subCmdFn[cmdTranslate]
47-
runFn subCmdFn[cmdRun]
47+
runFn func(context.Context, cmdRun, runOpts, io.Writer, io.Writer) error
4848
)
4949

5050
func main() {
@@ -84,7 +84,7 @@ func doMain(ctx context.Context, stdout, stderr io.Writer, args []string, exitFn
8484
log.Fatalf("Error translating: %v", err)
8585
}
8686
case "run", "run <path>":
87-
err = rf(ctx, c.Run, stdout, stderr)
87+
err = rf(ctx, c.Run, runOpts{}, stdout, stderr)
8888
if err != nil {
8989
log.Fatalf("Error running: %v", err)
9090
}

cmd/aigw/main_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,12 @@ Flags:
107107
{
108108
name: "run no arg",
109109
args: []string{"run"},
110-
rf: func(_ context.Context, _ cmdRun, _, _ io.Writer) error { return nil },
110+
rf: func(context.Context, cmdRun, runOpts, io.Writer, io.Writer) error { return nil },
111111
},
112112
{
113113
name: "run help",
114114
args: []string{"run", "--help"},
115-
rf: func(_ context.Context, _ cmdRun, _, _ io.Writer) error { return nil },
115+
rf: func(context.Context, cmdRun, runOpts, io.Writer, io.Writer) error { return nil },
116116
expOut: `Usage: aigw run [<path>] [flags]
117117
118118
Run the AI Gateway locally for given configuration.
@@ -133,15 +133,15 @@ Flags:
133133
{
134134
name: "run show default",
135135
args: []string{"run", "--show-default"},
136-
rf: func(_ context.Context, c cmdRun, _, _ io.Writer) error {
136+
rf: func(_ context.Context, c cmdRun, _ runOpts, _, _ io.Writer) error {
137137
require.True(t, c.ShowDefault)
138138
return nil
139139
},
140140
},
141141
{
142142
name: "run with path",
143143
args: []string{"run", "./path"},
144-
rf: func(_ context.Context, c cmdRun, _, _ io.Writer) error {
144+
rf: func(_ context.Context, c cmdRun, _ runOpts, _, _ io.Writer) error {
145145
abs, err := filepath.Abs("./path")
146146
require.NoError(t, err)
147147
require.Equal(t, abs, c.Path)

cmd/aigw/run.go

Lines changed: 56 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -35,22 +35,27 @@ import (
3535
"github.com/envoyproxy/ai-gateway/internal/extensionserver"
3636
)
3737

38-
// This is the default configuration for the AI Gateway when <path> parameter is not given.
39-
//
40-
//go:embed ai-gateway-default-resources.yaml
41-
var aiGatewayDefaultResources string
38+
var (
39+
// This is the default configuration for the AI Gateway when <path> parameter is not given.
40+
//
41+
//go:embed ai-gateway-default-resources.yaml
42+
aiGatewayDefaultResources string
4243

43-
// This is the template for the Envoy Gateway configuration where PLACEHOLDER_TMPDIR will be replaced with the temporary
44-
// directory where the resources are written to.
45-
//
46-
//go:embed envoy-gateway-config.yaml
47-
var envoyGatewayConfigTemplate string
44+
// This is the template for the Envoy Gateway configuration where PLACEHOLDER_TMPDIR will be replaced with the temporary
45+
// directory where the resources are written to.
46+
//
47+
//go:embed envoy-gateway-config.yaml
48+
envoyGatewayConfigTemplate string
49+
)
4850

4951
const (
5052
substitutionEnvAnnotationPrefix = "substitution.aigw.run/env/"
5153
substitutionFileAnnotationPrefix = "substitution.aigw.run/file/"
5254
)
5355

56+
// errExtProcRun is returned when the external processor fails to run.
57+
var errExtProcRun = fmt.Errorf("external processor run error")
58+
5459
type runCmdContext struct {
5560
// isDebug is true if the original `aigw run` command is run with debug mode. Using this to
5661
// set the log level of the external process currently. TODO: maybe simply expose the external process log level
@@ -69,12 +74,18 @@ type runCmdContext struct {
6974
fakeClientSet *fake.Clientset
7075
}
7176

77+
// runOpts are the options for the run command.
78+
type runOpts struct {
79+
// udsPath is the path to the UDS socket used by the AI Gateway extproc.
80+
udsPath string
81+
}
82+
7283
// run starts the AI Gateway locally for a given configuration.
7384
//
7485
// This will create a temporary directory and a file:
7586
// 1. ${os.TempDir}/envoy-gateway-config.yaml: This contains the configuration for the Envoy Gateway agent to run, derived from envoyGatewayConfigTemplate.
7687
// 2. ${os.TempDir}/envoy-ai-gateway-resources: This will contain the EG resource generated by the translation and deployed by EG.
77-
func run(ctx context.Context, c cmdRun, stdout, stderr io.Writer) error {
88+
func run(ctx context.Context, c cmdRun, o runOpts, stdout, stderr io.Writer) error {
7889
if !c.Debug {
7990
stderr = io.Discard
8091
}
@@ -123,8 +134,11 @@ func run(ctx context.Context, c cmdRun, stdout, stderr io.Writer) error {
123134
// Write the Envoy Gateway resources into a file under resourcesTmpdir.
124135
resourceYamlPath := filepath.Join(resourcesTmpdir, "config.yaml")
125136
stderrLogger.Info("Creating Envoy Gateway resource file", "path", resourceYamlPath)
126-
udsPath := filepath.Join(tmpdir, "uds.sock")
127-
_ = os.Remove(udsPath)
137+
udsPath := o.udsPath
138+
if udsPath == "" {
139+
udsPath = filepath.Join(tmpdir, "uds.sock")
140+
_ = os.Remove(udsPath)
141+
}
128142

129143
// Do the translation of the given AI Gateway resources Yaml into Envoy Gateway resources and write them to the file.
130144
resourcesBuf := &bytes.Buffer{}
@@ -133,7 +147,7 @@ func run(ctx context.Context, c cmdRun, stdout, stderr io.Writer) error {
133147
if err != nil {
134148
return err
135149
}
136-
fakeClient, err := runCtx.writeEnvoyResourcesAndRunExtProc(ctx, aiGatewayResourcesYaml)
150+
fakeClient, extProxDone, err := runCtx.writeEnvoyResourcesAndRunExtProc(ctx, aiGatewayResourcesYaml)
137151
if err != nil {
138152
return fmt.Errorf("failed to write envoy resources and run extproc: %w", err)
139153
}
@@ -150,9 +164,17 @@ func run(ctx context.Context, c cmdRun, stdout, stderr io.Writer) error {
150164
extSrv := extensionserver.New(fakeClient, ctrl.Log, udsPath, true)
151165
egextension.RegisterEnvoyGatewayExtensionServer(s, extSrv)
152166
grpc_health_v1.RegisterHealthServer(s, extSrv)
167+
168+
serverCtx, serverCancel := context.WithCancel(ctx)
169+
170+
var extProcErr error
153171
go func() {
154-
<-ctx.Done()
172+
select {
173+
case <-ctx.Done():
174+
case extProcErr = <-extProxDone:
175+
}
155176
s.GracefulStop()
177+
serverCancel()
156178
}()
157179
go func() {
158180
if err := s.Serve(lis); err != nil {
@@ -177,10 +199,11 @@ func run(ctx context.Context, c cmdRun, stdout, stderr io.Writer) error {
177199
server.SetErr(io.Discard)
178200
}
179201
server.SetArgs([]string{"server", "--config-path", egConfigPath})
180-
if err := server.ExecuteContext(ctx); err != nil {
202+
if err := server.ExecuteContext(serverCtx); err != nil {
181203
return fmt.Errorf("failed to execute server: %w", err)
182204
}
183-
return nil
205+
206+
return extProcErr
184207
}
185208

186209
// readConfig returns config from the given path, substituting ENV variables
@@ -212,32 +235,32 @@ func recreateDir(path string) error {
212235

213236
// writeEnvoyResourcesAndRunExtProc reads all resources from the given string, writes them to the output file, and runs
214237
// external processes for EnvoyExtensionPolicy resources.
215-
func (runCtx *runCmdContext) writeEnvoyResourcesAndRunExtProc(ctx context.Context, original string) (client.Client, error) {
238+
func (runCtx *runCmdContext) writeEnvoyResourcesAndRunExtProc(ctx context.Context, original string) (client.Client, <-chan error, error) {
216239
aigwRoutes, aigwBackends, backendSecurityPolicies, gateways, secrets, err := collectObjects(original, runCtx.envoyGatewayResourcesOut, runCtx.stderrLogger)
217240
if err != nil {
218-
return nil, fmt.Errorf("error collecting: %w", err)
241+
return nil, nil, fmt.Errorf("error collecting: %w", err)
219242
}
220243
if len(gateways) > 1 {
221-
return nil, fmt.Errorf("multiple gateways are not supported: %s", gateways[0].Name)
244+
return nil, nil, fmt.Errorf("multiple gateways are not supported: %s", gateways[0].Name)
222245
}
223246
for _, bsp := range backendSecurityPolicies {
224247
spec := bsp.Spec
225248
if spec.AWSCredentials != nil && spec.AWSCredentials.OIDCExchangeToken != nil {
226249
// TODO: We can make it work by generalizing the rotation logic.
227-
return nil, fmt.Errorf("OIDC exchange token is not supported: %s", bsp.Name)
250+
return nil, nil, fmt.Errorf("OIDC exchange token is not supported: %s", bsp.Name)
228251
}
229252
}
230253

231254
// Do the substitution for the secrets.
232255
for _, s := range secrets {
233256
if err = runCtx.rewriteSecretWithAnnotatedLocation(s); err != nil {
234-
return nil, fmt.Errorf("failed to rewrite secret %s: %w", s.Name, err)
257+
return nil, nil, fmt.Errorf("failed to rewrite secret %s: %w", s.Name, err)
235258
}
236259
}
237260

238261
fakeClient, _fakeClientSet, httpRoutes, eps, httpRouteFilter, backends, _, err := translateCustomResourceObjects(ctx, aigwRoutes, aigwBackends, backendSecurityPolicies, gateways, secrets, runCtx.stderrLogger)
239262
if err != nil {
240-
return nil, fmt.Errorf("error translating: %w", err)
263+
return nil, nil, fmt.Errorf("error translating: %w", err)
241264
}
242265
runCtx.fakeClientSet = _fakeClientSet
243266

@@ -260,27 +283,27 @@ func (runCtx *runCmdContext) writeEnvoyResourcesAndRunExtProc(ctx context.Contex
260283
Secrets("").Get(ctx,
261284
controller.FilterConfigSecretPerGatewayName(gw.Name, gw.Namespace), metav1.GetOptions{})
262285
if err != nil {
263-
return nil, fmt.Errorf("failed to get filter config secret: %w", err)
286+
return nil, nil, fmt.Errorf("failed to get filter config secret: %w", err)
264287
}
265288

266289
rawConfig, ok := filterConfigSecret.StringData[controller.FilterConfigKeyInSecret]
267290
if !ok {
268-
return nil, fmt.Errorf("failed to get filter config from secret: %w", err)
291+
return nil, nil, fmt.Errorf("failed to get filter config from secret: %w", err)
269292
}
270293
var fc filterapi.Config
271294
if err = yaml.Unmarshal([]byte(rawConfig), &fc); err != nil {
272-
return nil, fmt.Errorf("failed to unmarshal filter config: %w", err)
295+
return nil, nil, fmt.Errorf("failed to unmarshal filter config: %w", err)
273296
}
274297
runCtx.stderrLogger.Info("Running external process", "config", fc)
275-
runCtx.mustStartExtProc(ctx, &fc)
276-
return fakeClient, nil
298+
done := runCtx.mustStartExtProc(ctx, &fc)
299+
return fakeClient, done, nil
277300
}
278301

279302
// mustStartExtProc starts the external process with the given working directory, port, and filter configuration.
280303
func (runCtx *runCmdContext) mustStartExtProc(
281304
ctx context.Context,
282305
filterCfg *filterapi.Config,
283-
) {
306+
) <-chan error {
284307
marshaled, err := yaml.Marshal(filterCfg)
285308
if err != nil {
286309
panic(fmt.Sprintf("BUG: failed to marshal filter config: %v", err))
@@ -300,11 +323,16 @@ func (runCtx *runCmdContext) mustStartExtProc(
300323
} else {
301324
args = append(args, "--logLevel", "warn")
302325
}
326+
327+
done := make(chan error)
303328
go func() {
304329
if err := mainlib.Main(ctx, args, os.Stderr); err != nil {
305330
runCtx.stderrLogger.Error("Failed to run external processor", "error", err)
331+
done <- fmt.Errorf("%w: %w", errExtProcRun, err)
306332
}
333+
close(done)
307334
}()
335+
return done
308336
}
309337

310338
// mustClearSetOwnerReferencesAndStatusAndWriteObj clears the owner references and status of the given object, marshals it

cmd/aigw/run_test.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ func TestRun(t *testing.T) {
4747
defer cancel()
4848
done := make(chan struct{})
4949
go func() {
50-
require.NoError(t, run(ctx, cmdRun{Debug: true, Path: resourcePath}, os.Stdout, os.Stderr))
50+
require.NoError(t, run(ctx, cmdRun{Debug: true, Path: resourcePath}, runOpts{}, os.Stdout, os.Stderr))
5151
close(done)
5252
}()
5353
defer func() {
@@ -157,6 +157,26 @@ func TestRun(t *testing.T) {
157157
})
158158
}
159159

160+
func TestRunExtprocStartFailure(t *testing.T) {
161+
var (
162+
resourcePath, _ = setupDefaultAIGatewayResourcesWithAvailableCredentials(t)
163+
errChan = make(chan error)
164+
)
165+
166+
go func() {
167+
errChan <- run(t.Context(), cmdRun{Debug: true, Path: resourcePath}, runOpts{
168+
udsPath: "/dev/null", // This will cause the external processor to fail to start.
169+
}, os.Stdout, os.Stderr)
170+
}()
171+
172+
select {
173+
case <-time.After(10 * time.Second):
174+
t.Fatalf("expected extproc start process to fail and return")
175+
case err := <-errChan:
176+
require.ErrorIs(t, err, errExtProcRun)
177+
}
178+
}
179+
160180
func TestRunCmdContext_writeEnvoyResourcesAndRunExtProc(t *testing.T) {
161181
resourcePath, _ := setupDefaultAIGatewayResourcesWithAvailableCredentials(t)
162182
runCtx := &runCmdContext{
@@ -170,29 +190,29 @@ func TestRunCmdContext_writeEnvoyResourcesAndRunExtProc(t *testing.T) {
170190
content, err := os.ReadFile(resourcePath)
171191
require.NoError(t, err)
172192
ctx, cancel := context.WithCancel(t.Context())
173-
_, err = runCtx.writeEnvoyResourcesAndRunExtProc(ctx, string(content))
193+
_, done, err := runCtx.writeEnvoyResourcesAndRunExtProc(ctx, string(content))
174194
require.NoError(t, err)
175195
time.Sleep(1 * time.Second)
176196
cancel()
177197
// Wait for the external processor to stop.
178-
time.Sleep(1 * time.Second)
198+
require.NoError(t, <-done)
179199
}
180200

181201
func Test_mustStartExtProc(t *testing.T) {
182202
ctx, cancel := context.WithCancel(t.Context())
183-
defer cancel()
203+
t.Cleanup(cancel)
184204
runCtx := &runCmdContext{
185205
tmpdir: t.TempDir(),
186206
// UNIX doesn't like a long UDS path, so we use a short one.
187207
// https://unix.stackexchange.com/questions/367008/why-is-socket-path-length-limited-to-a-hundred-chars
188208
udsPath: filepath.Join("/tmp", "run.sock"),
189209
stderrLogger: slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{})),
190210
}
191-
runCtx.mustStartExtProc(ctx, filterapi.MustLoadDefaultConfig())
211+
done := runCtx.mustStartExtProc(ctx, filterapi.MustLoadDefaultConfig())
192212
time.Sleep(1 * time.Second)
193213
cancel()
194214
// Wait for the external processor to stop.
195-
time.Sleep(1 * time.Second)
215+
require.NoError(t, <-done)
196216
}
197217

198218
// checkIfOllamaReady checks if the Ollama server is ready and if the specified model is available.

0 commit comments

Comments
 (0)