@@ -241,8 +241,6 @@ func (p *Profile) kubectlApply(ctx context.Context, kubeconfig, manifestPath str
241241}
242242
243243func (p * Profile ) verifyEnvironment (ctx context.Context , opts * framework.SetupOptions ) error {
244- p .log ("Verifying all deployments are healthy..." )
245-
246244 // Create Kubernetes client
247245 config , err := clientcmd .BuildConfigFromFlags ("" , opts .KubeConfig )
248246 if err != nil {
@@ -251,9 +249,56 @@ func (p *Profile) verifyEnvironment(ctx context.Context, opts *framework.SetupOp
251249
252250 client , err := kubernetes .NewForConfig (config )
253251 if err != nil {
254- return fmt .Errorf ("failed to create kubernetes client: %w" , err )
252+ return fmt .Errorf ("failed to create kube client: %w" , err )
253+ }
254+
255+ // Wait for Envoy Gateway service to be ready with retry
256+ retryTimeout := 10 * time .Minute
257+ retryInterval := 5 * time .Second
258+ startTime := time .Now ()
259+
260+ p .log ("Waiting for Envoy Gateway service to be ready..." )
261+
262+ // Label selector for the semantic-router gateway service
263+ labelSelector := "gateway.envoyproxy.io/owning-gateway-namespace=default,gateway.envoyproxy.io/owning-gateway-name=semantic-router"
264+
265+ var envoyService string
266+ for {
267+ // Try to get Envoy service name
268+ envoyService , err = helpers .GetEnvoyServiceName (ctx , client , labelSelector , p .verbose )
269+ if err == nil {
270+ // Verify that the service has exactly 1 pod running with all containers ready
271+ podErr := helpers .VerifyServicePodsRunning (ctx , client , "envoy-gateway-system" , envoyService , p .verbose )
272+ if podErr == nil {
273+ p .log ("Envoy Gateway service is ready: %s" , envoyService )
274+ break
275+ }
276+ if p .verbose {
277+ p .log ("Envoy service found but pods not ready: %v" , podErr )
278+ }
279+ err = fmt .Errorf ("service pods not ready: %w" , podErr )
280+ }
281+
282+ if time .Since (startTime ) >= retryTimeout {
283+ return fmt .Errorf ("failed to get Envoy service with running pods after %v: %w" , retryTimeout , err )
284+ }
285+
286+ if p .verbose {
287+ p .log ("Envoy service not ready, retrying in %v... (elapsed: %v)" ,
288+ retryInterval , time .Since (startTime ).Round (time .Second ))
289+ }
290+
291+ select {
292+ case <- ctx .Done ():
293+ return ctx .Err ()
294+ case <- time .After (retryInterval ):
295+ // Continue retry
296+ }
255297 }
256298
299+ // Check all deployments are healthy
300+ p .log ("Verifying all deployments are healthy..." )
301+
257302 // Check semantic-router deployment
258303 if err := helpers .CheckDeployment (ctx , client , "vllm-semantic-router-system" , "semantic-router" , p .verbose ); err != nil {
259304 return fmt .Errorf ("semantic-router deployment not healthy: %w" , err )
0 commit comments