From f3998d59aae3e689210bff02f433b1299398ccff Mon Sep 17 00:00:00 2001 From: samzong Date: Fri, 21 Nov 2025 00:40:00 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(e2e/llm-d):=20add=20LLM-D=20pr?= =?UTF-8?q?ofile=20and=20test=20cases?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: samzong --- .github/workflows/integration-test-k8s.yml | 5 +- e2e/README.md | 2 +- e2e/cmd/e2e/main.go | 4 + .../llm-d/manifests/httproute-services.yaml | 51 ++ .../llm-d/manifests/inference-sim.yaml | 101 ++++ e2e/profiles/llm-d/manifests/rbac.yaml | 27 + e2e/profiles/llm-d/profile.go | 562 ++++++++++++++++++ e2e/profiles/llm-d/values.yaml | 69 +++ e2e/testcases/llmd_auto_routing.go | 61 ++ e2e/testcases/llmd_distributed_inference.go | 97 +++ e2e/testcases/llmd_failover_recovery.go | 99 +++ e2e/testcases/llmd_health_check.go | 90 +++ e2e/testcases/llmd_helpers.go | 71 +++ e2e/testcases/llmd_performance_baseline.go | 109 ++++ tools/make/e2e.mk | 2 +- 15 files changed, 1345 insertions(+), 5 deletions(-) create mode 100644 e2e/profiles/llm-d/manifests/httproute-services.yaml create mode 100644 e2e/profiles/llm-d/manifests/inference-sim.yaml create mode 100644 e2e/profiles/llm-d/manifests/rbac.yaml create mode 100644 e2e/profiles/llm-d/profile.go create mode 100644 e2e/profiles/llm-d/values.yaml create mode 100644 e2e/testcases/llmd_auto_routing.go create mode 100644 e2e/testcases/llmd_distributed_inference.go create mode 100644 e2e/testcases/llmd_failover_recovery.go create mode 100644 e2e/testcases/llmd_health_check.go create mode 100644 e2e/testcases/llmd_helpers.go create mode 100644 e2e/testcases/llmd_performance_baseline.go diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index aa1d14109..947f8c397 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -12,11 +12,11 @@ on: jobs: integration-test: runs-on: ubuntu-latest - timeout-minutes: 60 + timeout-minutes: 75 strategy: fail-fast: false # Continue testing other profiles even if one fails matrix: - profile: [ai-gateway, aibrix] + profile: [ai-gateway, aibrix, llm-d] steps: - name: Check out the repo @@ -159,4 +159,3 @@ jobs: if: always() run: | make e2e-cleanup || true - diff --git a/e2e/README.md b/e2e/README.md index 6e977e5d6..ec5cbf6fd 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -16,7 +16,7 @@ The framework follows a **separation of concerns** design: - **aibrix**: Tests Semantic Router with vLLM AIBrix integration - **istio**: Tests Semantic Router with Istio Gateway (future) - **production-stack**: Tests vLLM Production Stack configurations (future) -- **llm-d**: Tests with LLM-D (future) +- **llm-d**: Tests Semantic Router with LLM-D distributed inference - **dynamo**: Tests with Nvidia Dynamo (future) ## Directory Structure diff --git a/e2e/cmd/e2e/main.go b/e2e/cmd/e2e/main.go index 54ff691f0..5dca46c7a 100644 --- a/e2e/cmd/e2e/main.go +++ b/e2e/cmd/e2e/main.go @@ -12,10 +12,12 @@ import ( aigateway "github.com/vllm-project/semantic-router/e2e/profiles/ai-gateway" aibrix "github.com/vllm-project/semantic-router/e2e/profiles/aibrix" dynamicconfig "github.com/vllm-project/semantic-router/e2e/profiles/dynamic-config" + llmd "github.com/vllm-project/semantic-router/e2e/profiles/llm-d" // Import profiles to register test cases _ "github.com/vllm-project/semantic-router/e2e/profiles/ai-gateway" _ "github.com/vllm-project/semantic-router/e2e/profiles/aibrix" + _ 
"github.com/vllm-project/semantic-router/e2e/profiles/llm-d" ) const version = "v1.0.0" @@ -103,6 +105,8 @@ func getProfile(name string) (framework.Profile, error) { return dynamicconfig.NewProfile(), nil case "aibrix": return aibrix.NewProfile(), nil + case "llm-d": + return llmd.NewProfile(), nil // Add more profiles here as they are implemented // case "istio": // return istio.NewProfile(), nil diff --git a/e2e/profiles/llm-d/manifests/httproute-services.yaml b/e2e/profiles/llm-d/manifests/httproute-services.yaml new file mode 100644 index 000000000..8eed2015b --- /dev/null +++ b/e2e/profiles/llm-d/manifests/httproute-services.yaml @@ -0,0 +1,51 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: vsr-llama8b-svc + namespace: default +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-llama3-8b-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: x-selected-model + value: llama3-8b + timeouts: + request: 300s +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: vsr-phi4-mini-svc + namespace: default +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-phi4-mini + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: x-selected-model + value: phi4-mini + timeouts: + request: 300s diff --git a/e2e/profiles/llm-d/manifests/inference-sim.yaml b/e2e/profiles/llm-d/manifests/inference-sim.yaml new file mode 100644 index 000000000..91c8e221a --- /dev/null +++ b/e2e/profiles/llm-d/manifests/inference-sim.yaml @@ -0,0 +1,101 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + containers: + - name: sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.6.1 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + args: + - --model + - llama3-8b + - --port + - "8000" + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama3-8b-instruct + namespace: default + labels: + app: vllm-llama3-8b-instruct +spec: + type: ClusterIP + selector: + app: vllm-llama3-8b-instruct + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi4-mini + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: phi4-mini + template: + metadata: + labels: + app: phi4-mini + spec: + containers: + - name: sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.6.1 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + args: + - --model + - phi4-mini + - --port + - "8000" + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: phi4-mini + namespace: default + labels: + app: phi4-mini +spec: + type: ClusterIP + selector: + app: phi4-mini + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP diff --git a/e2e/profiles/llm-d/manifests/rbac.yaml 
b/e2e/profiles/llm-d/manifests/rbac.yaml new file mode 100644 index 000000000..60e4d6774 --- /dev/null +++ b/e2e/profiles/llm-d/manifests/rbac.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: llmd-epp-access +rules: + - apiGroups: ["inference.networking.k8s.io", "inference.networking.x-k8s.io"] + resources: ["inferencepools", "inferenceobjectives"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: llmd-epp-access-binding +subjects: + - kind: ServiceAccount + name: vllm-llama3-8b-instruct-epp + namespace: default + - kind: ServiceAccount + name: vllm-phi4-mini-epp + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: llmd-epp-access diff --git a/e2e/profiles/llm-d/profile.go b/e2e/profiles/llm-d/profile.go new file mode 100644 index 000000000..277719244 --- /dev/null +++ b/e2e/profiles/llm-d/profile.go @@ -0,0 +1,562 @@ +package llmd + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "time" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + + "github.com/vllm-project/semantic-router/e2e/pkg/framework" + "github.com/vllm-project/semantic-router/e2e/pkg/helm" + "github.com/vllm-project/semantic-router/e2e/pkg/helpers" + + _ "github.com/vllm-project/semantic-router/e2e/testcases" +) + +const ( + kindNamespace = "default" + semanticNamespace = "vllm-semantic-router-system" + gatewayNamespace = "istio-system" + inferenceGatewayName = "inference-gateway" + istioVersion = "1.28.0" + gatewayCRDURL = "https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.0/standard-install.yaml" + inferenceCRDURL = "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.1.0/manifests.yaml" +) + +type Profile struct { + verbose bool + useExisting bool + skipSetup bool + versions struct { + istio string + gateway string + inference string + inferenceController string + } +} + +func NewProfile() *Profile { + p := &Profile{} + p.versions.istio = istioVersion + p.versions.gateway = gatewayCRDURL + p.versions.inference = inferenceCRDURL + ctrlURL := os.Getenv("GAIE_CONTROLLER_URL") + if ctrlURL == "" { + ctrlURL = inferenceCRDURL + } + p.versions.inferenceController = ctrlURL + return p +} + +func (p *Profile) Name() string { + return "llm-d" +} + +func (p *Profile) Description() string { + return "Tests Semantic Router with LLM-D distributed inference" +} + +func (p *Profile) Setup(ctx context.Context, opts *framework.SetupOptions) error { + p.verbose = opts.Verbose + p.useExisting = strings.EqualFold(os.Getenv("LLMD_USE_EXISTING"), "true") || os.Getenv("LLMD_USE_EXISTING") == "1" + p.skipSetup = strings.EqualFold(os.Getenv("LLMD_SKIP_SETUP"), "true") || os.Getenv("LLMD_SKIP_SETUP") == "1" + + fmt.Printf("[Profile] llm-d setup start (istio=%s, gatewayCRD=%s, inferenceCRD=%s, controller=%s, useExisting=%v, skipSetup=%v)\n", + p.versions.istio, p.versions.gateway, p.versions.inference, p.versions.inferenceController, p.useExisting, p.skipSetup) + + if p.skipSetup { + fmt.Println("[Profile] LLMD_SKIP_SETUP set; skipping deploy steps, running verification only") + return p.verifyEnvironment(ctx, opts) + } + + rollback := []func(){} + rollbackAll := func() { + for i := len(rollback) - 1; i >= 0; i-- { + rollback[i]() + } + } + 
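+	// Rollback closures are pushed in install order and executed in reverse
+	// (LIFO) on failure, so later steps are undone before their dependencies.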
+ istioctlPath, err := p.ensureIstioctl(ctx) + if err != nil { + return err + } + if p.verbose { + fmt.Printf("[Profile] istioctl ready at %s\n", istioctlPath) + } + + if err := p.kubectlApply(ctx, gatewayCRDURL); err != nil { + return fmt.Errorf("gateway CRDs: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, gatewayCRDURL) }) + if p.verbose { + fmt.Println("[Profile] applied gateway CRDs") + } + if err := p.kubectlApply(ctx, inferenceCRDURL); err != nil { + rollbackAll() + return fmt.Errorf("inference CRDs: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, inferenceCRDURL) }) + if p.verbose { + fmt.Println("[Profile] applied inference CRDs") + } + + if err := p.installIstio(ctx, istioctlPath); err != nil { + rollbackAll() + return fmt.Errorf("install istio: %w", err) + } + rollback = append(rollback, func() { _ = p.uninstallIstio(ctx) }) + if p.verbose { + fmt.Println("[Profile] istio installed") + } + + if err := p.deploySemanticRouter(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("deploy semantic router: %w", err) + } + rollback = append(rollback, func() { + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + _ = deployer.Uninstall(ctx, "semantic-router", semanticNamespace) + }) + if p.verbose { + fmt.Println("[Profile] semantic-router deployed") + } + + if err := p.deployInferenceSim(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("deploy inference sim: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml") }) + if p.verbose { + fmt.Println("[Profile] inference simulators deployed") + } + + if err := p.deployLLMD(ctx); err != nil { + rollbackAll() + return fmt.Errorf("deploy llm-d resources: %w", err) + } + rollback = append(rollback, func() { + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml") + }) + if p.verbose { + fmt.Println("[Profile] llm-d schedulers and pools deployed") + } + + if err := p.deployGatewayRoutes(ctx); err != nil { + rollbackAll() + return fmt.Errorf("deploy gateway routes: %w", err) + } + rollback = append(rollback, func() { + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/envoyfilter.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/destinationrule.yaml") + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/gateway.yaml") + }) + if p.verbose { + fmt.Println("[Profile] gateway routes deployed") + } + + if err := p.waitHTTPRouteAccepted(ctx, "vsr-llama8b-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteResolvedRefs(ctx, "vsr-llama8b-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteAccepted(ctx, "vsr-phi4-mini-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteResolvedRefs(ctx, "vsr-phi4-mini-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + + if err := p.verifyEnvironment(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("verify environment: %w", err) + } + + 
if p.verbose {
+		fmt.Println("[Profile] llm-d setup complete")
+	}
+	return nil
+}
+
+func (p *Profile) Teardown(ctx context.Context, opts *framework.TeardownOptions) error {
+	p.verbose = opts.Verbose
+	fmt.Println("[Profile] llm-d teardown start")
+	_ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml")
+	_ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml")
+	_ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/envoyfilter.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/destinationrule.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/gateway.yaml")
+
+	deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose)
+	_ = deployer.Uninstall(ctx, "semantic-router", semanticNamespace)
+
+	_ = p.uninstallIstio(ctx)
+	_ = p.kubectlDelete(ctx, gatewayCRDURL)
+	_ = p.kubectlDelete(ctx, inferenceCRDURL)
+	fmt.Println("[Profile] llm-d teardown complete")
+
+	return nil
+}
+
+func (p *Profile) GetTestCases() []string {
+	tests := []string{
+		"llmd-health-check",
+		"llmd-distributed-inference",
+		"llmd-auto-routing",
+		"llmd-failover-recovery",
+		"llmd-performance-baseline",
+	}
+	if strings.EqualFold(os.Getenv("LLMD_PERF_SKIP"), "true") || os.Getenv("LLMD_PERF_SKIP") == "1" {
+		var filtered []string
+		for _, t := range tests {
+			if t == "llmd-performance-baseline" {
+				continue
+			}
+			filtered = append(filtered, t)
+		}
+		if p.verbose {
+			fmt.Println("[Profile] LLMD_PERF_SKIP set; skipping llmd-performance-baseline test")
+		}
+		return filtered
+	}
+	return tests
+}
+
+func (p *Profile) GetServiceConfig() framework.ServiceConfig {
+	return framework.ServiceConfig{
+		Name:        "inference-gateway-istio",
+		Namespace:   kindNamespace,
+		PortMapping: "8080:80",
+	}
+}
+
+func (p *Profile) ensureIstioctl(ctx context.Context) (string, error) {
+	if path, err := exec.LookPath("istioctl"); err == nil {
+		return path, nil
+	}
+
+	osPart := runtime.GOOS
+	if osPart == "darwin" {
+		osPart = "osx"
+	}
+	arch := runtime.GOARCH
+	platform := fmt.Sprintf("%s-%s", osPart, arch)
+
+	cacheDir := filepath.Join(os.TempDir(), "istioctl-"+istioVersion+"-"+platform)
+	bin := filepath.Join(cacheDir, "istioctl")
+	if _, err := os.Stat(bin); err == nil {
+		return bin, nil
+	}
+
+	if err := os.MkdirAll(cacheDir, 0o755); err != nil {
+		return "", err
+	}
+
+	url := fmt.Sprintf("https://github.com/istio/istio/releases/download/%s/istioctl-%s-%s.tar.gz", istioVersion, istioVersion, platform)
+	tgz := filepath.Join(cacheDir, "istioctl.tgz")
+
+	if err := p.runCmd(ctx, "curl", "-fL", "-o", tgz, url); err != nil {
+		return "", err
+	}
+	if err := p.runCmd(ctx, "tar", "-xzf", tgz, "-C", cacheDir); err != nil {
+		return "", err
+	}
+	if err := os.Chmod(bin, 0o755); err != nil {
+		return "", err
+	}
+	return bin, nil
+}
+
+func (p *Profile) installIstio(ctx context.Context, istioctl string) error {
+	return p.runCmd(ctx, istioctl, "install", "-y", "--set", "profile=minimal", "--set", "values.pilot.env.ENABLE_GATEWAY_API=true", "--set", "values.pilot.env.ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true")
+}
+
+func (p *Profile) uninstallIstio(ctx context.Context) error {
+	istioctl, err :=
exec.LookPath("istioctl") + if err != nil { + return nil + } + return p.runCmd(ctx, istioctl, "x", "uninstall", "--purge", "-y") +} + +func (p *Profile) deploySemanticRouter(ctx context.Context, opts *framework.SetupOptions) error { + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + installOpts := helm.InstallOptions{ + ReleaseName: "semantic-router", + Chart: "deploy/helm/semantic-router", + Namespace: semanticNamespace, + ValuesFiles: []string{"e2e/profiles/llm-d/values.yaml"}, + Set: map[string]string{ + "image.repository": "ghcr.io/vllm-project/semantic-router/extproc", + "image.tag": opts.ImageTag, + "image.pullPolicy": "Never", + }, + Wait: true, + Timeout: "20m", + } + if err := deployer.Install(ctx, installOpts); err != nil { + return err + } + return deployer.WaitForDeployment(ctx, semanticNamespace, "semantic-router", 10*time.Minute) +} + +func (p *Profile) deployInferenceSim(ctx context.Context, opts *framework.SetupOptions) error { + return p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml") +} + +func (p *Profile) deployLLMD(ctx context.Context) error { + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml"); err != nil { + return err + } + return nil +} + +func (p *Profile) deployGatewayRoutes(ctx context.Context) error { + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/gateway.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/destinationrule.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/envoyfilter.yaml"); err != nil { + return err + } + // Ensure EnvoyFilter ext-proc matches Gateway listener context for this e2e run + _ = p.patchEnvoyFilterForGateway(ctx) + return nil +} + +func (p *Profile) verifyEnvironment(ctx context.Context, opts *framework.SetupOptions) error { + config, err := clientcmd.BuildConfigFromFlags("", opts.KubeConfig) + if err != nil { + return err + } + client, err := kubernetes.NewForConfig(config) + if err != nil { + return err + } + + // Verify required CRDs/APIs from Gateway API and Inference Extension are registered. 
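+	// The checks below use API discovery (ServerResourcesForGroupVersion), so
+	// they confirm CRD registration without needing RBAC on the resources themselves.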
+ type apiCheck struct { + groupVersion string + expectedResources []string + optional bool + } + checkAPIGroup := func(c apiCheck) error { + resources, err := client.Discovery().ServerResourcesForGroupVersion(c.groupVersion) + if err != nil { + if c.optional { + if p.verbose { + fmt.Printf("[Verify] API group %s not found (optional): %v\n", c.groupVersion, err) + } + return nil + } + return fmt.Errorf("discover %s: %w", c.groupVersion, err) + } + found := make(map[string]bool, len(resources.APIResources)) + for _, r := range resources.APIResources { + found[r.Name] = true + } + for _, r := range c.expectedResources { + if !found[r] { + if c.optional { + if p.verbose { + fmt.Printf("[Verify] Missing optional resource %s in %s\n", r, c.groupVersion) + } + return nil + } + return fmt.Errorf("missing %s in %s", r, c.groupVersion) + } + } + if p.verbose { + fmt.Printf("[Verify] API group %s present with %v\n", c.groupVersion, c.expectedResources) + } + return nil + } + + for _, c := range []apiCheck{ + {groupVersion: "gateway.networking.k8s.io/v1", expectedResources: []string{"gateways", "httproutes"}}, + {groupVersion: "inference.networking.k8s.io/v1", expectedResources: []string{"inferencepools"}}, + // EndpointPickerConfig CRD is optional in some environments; treat as best-effort. + {groupVersion: "inference.networking.x-k8s.io/v1alpha1", expectedResources: []string{"endpointpickerconfigs"}, optional: true}, + } { + if err := checkAPIGroup(c); err != nil { + return err + } + } + + // endpoints readiness check moved after deployments ready + + // Actively wait for critical deployments to become Available before checking readiness counts. + // This avoids flakiness when resources are still pulling images just after creation. + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + deploymentsToWait := []struct { + ns, name string + }{ + {semanticNamespace, "semantic-router"}, + {gatewayNamespace, "istiod"}, + {"default", "vllm-llama3-8b-instruct"}, + {"default", "phi4-mini"}, + {"default", "llm-d-inference-scheduler-llama3-8b"}, + {"default", "llm-d-inference-scheduler-phi4-mini"}, + {"default", "inference-gateway-istio"}, + } + for _, d := range deploymentsToWait { + if err := deployer.WaitForDeployment(ctx, d.ns, d.name, 10*time.Minute); err != nil { + return fmt.Errorf("wait for deployment %s/%s: %w", d.ns, d.name, err) + } + } + + if err := helpers.CheckDeployment(ctx, client, semanticNamespace, "semantic-router", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, gatewayNamespace, "istiod", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "vllm-llama3-8b-instruct", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "phi4-mini", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "llm-d-inference-scheduler-llama3-8b", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "llm-d-inference-scheduler-phi4-mini", p.verbose); err != nil { + return err + } + if err := helpers.VerifyServicePodsRunning(ctx, client, "default", "inference-gateway-istio", p.verbose); err != nil { + return err + } + if err := p.checkInferencePoolEndpointReady(ctx, client, "default", "vllm-llama3-8b-instruct", 2*time.Minute); err != nil { + return err + } + if err := p.checkInferencePoolEndpointReady(ctx, client, "default", "phi4-mini", 2*time.Minute); err != nil { + 
return err + } + return nil +} + +// Note: GAIE controller is shipped by some providers (e.g., kgateway, nginx-gateway) or via provider-specific enable flags. +// For Istio-based profile we rely on pilot env ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true instead of a standalone controller manifest. + +func (p *Profile) runCmdOutput(ctx context.Context, name string, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, name, args...) + out, err := cmd.CombinedOutput() + if err != nil { + return "", err + } + return string(out), nil +} + +func (p *Profile) waitHTTPRouteAccepted(ctx context.Context, name, ns string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, err := p.runCmdOutput(ctx, "kubectl", "get", "httproute", name, "-n", ns, "-o", "jsonpath={.status.parents[*].conditions[?(@.type==\"Accepted\")].status}") + if err == nil && strings.Contains(out, "True") { + return nil + } + time.Sleep(2 * time.Second) + } + if p.verbose { + _ = p.runCmd(ctx, "kubectl", "-n", "gateway-inference-system", "logs", "deploy/gateway-api-inference-extension-controller", "--tail=100") + _ = p.runCmd(ctx, "kubectl", "-n", "default", "logs", "deploy/inference-gateway-istio", "--tail=100") + } + return fmt.Errorf("HTTPRoute %s/%s not Accepted", ns, name) +} + +func (p *Profile) waitHTTPRouteResolvedRefs(ctx context.Context, name, ns string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, err := p.runCmdOutput(ctx, "kubectl", "get", "httproute", name, "-n", ns, "-o", "jsonpath={.status.parents[*].conditions[?(@.type==\"ResolvedRefs\")].status}") + if err == nil && strings.Contains(out, "True") { + return nil + } + time.Sleep(2 * time.Second) + } + if p.verbose { + _ = p.runCmd(ctx, "kubectl", "-n", "gateway-inference-system", "logs", "deploy/gateway-api-inference-extension-controller", "--tail=100") + } + return fmt.Errorf("HTTPRoute %s/%s not ResolvedRefs", ns, name) +} + +func (p *Profile) checkInferencePoolEndpointReady(ctx context.Context, client *kubernetes.Clientset, ns, name string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + ep, err := client.CoreV1().Endpoints(ns).Get(ctx, name, v1.GetOptions{}) + if err != nil { + return err + } + addrs := 0 + for _, s := range ep.Subsets { + addrs += len(s.Addresses) + } + if addrs > 0 { + return nil + } + time.Sleep(2 * time.Second) + } + return fmt.Errorf("endpoints %s/%s empty", ns, name) +} + +func (p *Profile) runCmd(ctx context.Context, name string, args ...string) error { + cmd := exec.CommandContext(ctx, name, args...) 
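+	// Verbose mode streams subprocess output into the test logs; otherwise
+	// Stdout/Stderr remain nil and the output is discarded.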
+ if p.verbose { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + return cmd.Run() +} + +func (p *Profile) kubectlApply(ctx context.Context, target string) error { + return p.runCmd(ctx, "kubectl", "apply", "-f", target) +} + +func (p *Profile) kubectlDelete(ctx context.Context, target string) error { + return p.runCmd(ctx, "kubectl", "delete", "-f", target, "--ignore-not-found") +} +func (p *Profile) patchEnvoyFilterForGateway(ctx context.Context) error { + // Add match.context=GATEWAY and listener.portNumber=80 to the first configPatch via JSON patch + patch := `[ + {"op":"add","path":"/spec/configPatches/0/match/context","value":"GATEWAY"}, + {"op":"add","path":"/spec/configPatches/0/match/listener/portNumber","value":80} + ]` + return p.runCmd(ctx, "kubectl", "-n", "default", "patch", "envoyfilter", "semantic-router", "--type=json", "-p", patch) +} diff --git a/e2e/profiles/llm-d/values.yaml b/e2e/profiles/llm-d/values.yaml new file mode 100644 index 000000000..21ea100fa --- /dev/null +++ b/e2e/profiles/llm-d/values.yaml @@ -0,0 +1,69 @@ +# Profile revision 2025-11-21: math -> phi4-mini; cs/default -> llama3-8b; removed global HTTPRoute catch-all. +config: + # Allow Envoy to re-run route matching after Semantic Router sets x-selected-model. + # Without this, Gateway API routes that depend on that header won't be chosen and return 404. + clear_route_cache: true + default_model: llama3-8b + # Enable domain classification to return the x-vsr-selected-category header + classifier: + category_model: + model_id: models/category_classifier_modernbert-base_model + threshold: 0.6 + use_modernbert: true + category_mapping_path: models/category_classifier_modernbert-base_model/category_mapping.json + pii_model: + model_id: "" + threshold: 1.0 + use_modernbert: false + pii_mapping_path: "" + # Optional domains used by decision rules + categories: + - name: math + description: "Mathematics, arithmetic, calculation" + - name: computer science + description: "Computer networks, programming, systems" + decisions: + - name: math_route + priority: 20 + rules: + operator: OR + conditions: + - type: domain + name: math + modelRefs: + - model: phi4-mini + use_reasoning: false + - name: cs_route + priority: 10 + rules: + operator: OR + conditions: + - type: domain + name: computer science + modelRefs: + - model: llama3-8b + use_reasoning: false + - name: default_route + priority: 1 + rules: + operator: OR + conditions: + - type: domain + name: other + modelRefs: + - model: llama3-8b + use_reasoning: false + semantic_cache: + enabled: false + prompt_guard: + enabled: false + tools: + enabled: false + bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +# Keep consistent with the default chart: initContainer, model downloads, and PVC use chart defaults +image: + pullPolicy: IfNotPresent diff --git a/e2e/testcases/llmd_auto_routing.go b/e2e/testcases/llmd_auto_routing.go new file mode 100644 index 000000000..dd38f98a3 --- /dev/null +++ b/e2e/testcases/llmd_auto_routing.go @@ -0,0 +1,61 @@ +package testcases + +import ( + "context" + "fmt" + "strings" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("llmd-auto-routing", pkgtestcases.TestCase{ + Description: "Auto model selection routes math and cs", + Tags: []string{"llmd", "routing"}, + Fn: llmdAutoRouting, + }) +} + +func llmdAutoRouting(ctx context.Context, client *kubernetes.Clientset, opts 
pkgtestcases.TestCaseOptions) error {
+	localPort, stop, err := setupServiceConnection(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer stop()
+
+	cases := []struct {
+		prompt string
+		model  string
+	}{
+		{prompt: "What is 2+2?", model: "phi4-mini"},
+		{prompt: "Explain TCP three-way handshake", model: "llama3-8b"},
+	}
+
+	for _, c := range cases {
+		res, err := doLLMDChat(ctx, localPort, "auto", c.prompt, 45*time.Second)
+		if err != nil {
+			return err
+		}
+		selected := getSelectedModel(res.headers)
+		pod := getInferencePod(res.headers)
+		if selected == "" && pod != "" {
+			if strings.HasPrefix(pod, "phi4-mini-") {
+				selected = "phi4-mini"
+			} else if strings.HasPrefix(pod, "vllm-llama3-8b-instruct-") {
+				selected = "llama3-8b"
+			}
+		}
+		if selected != c.model {
+			return fmt.Errorf("prompt '%s' expected model %s got %s", c.prompt, c.model, selected)
+		}
+		if pod == "" {
+			return fmt.Errorf("missing x-inference-pod for prompt '%s'", c.prompt)
+		}
+	}
+	if opts.SetDetails != nil {
+		opts.SetDetails(map[string]interface{}{"cases": len(cases)})
+	}
+	return nil
+}
diff --git a/e2e/testcases/llmd_distributed_inference.go b/e2e/testcases/llmd_distributed_inference.go
new file mode 100644
index 000000000..4828936bf
--- /dev/null
+++ b/e2e/testcases/llmd_distributed_inference.go
@@ -0,0 +1,97 @@
+package testcases
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+func init() {
+	pkgtestcases.Register("llmd-distributed-inference", pkgtestcases.TestCase{
+		Description: "Verify multi-replica backends serve requests",
+		Tags:        []string{"llmd", "distributed"},
+		Fn:          llmdDistributed,
+	})
+}
+
+func llmdDistributed(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) error {
+	backendDeploys := []string{"vllm-llama3-8b-instruct", "phi4-mini"}
+	for _, name := range backendDeploys {
+		dep, err := client.AppsV1().Deployments("default").Get(ctx, name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		if dep.Status.ReadyReplicas < 2 {
+			return fmt.Errorf("%s ready replicas %d < 2", name, dep.Status.ReadyReplicas)
+		}
+	}
+
+	localPort, stop, err := setupServiceConnection(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer stop()
+
+	const total = 30
+	var (
+		success int
+		mu      sync.Mutex
+		podHits = map[string]int{}
+	)
+	var wg sync.WaitGroup
+
+	for i := 0; i < total; i++ {
+		i := i
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			res, err := doLLMDChat(ctx, localPort, "llama3-8b", fmt.Sprintf("req-%d", i), 60*time.Second)
+			if err != nil {
+				return
+			}
+			pod := getInferencePod(res.headers)
+			if pod == "" {
+				return
+			}
+			mu.Lock()
+			success++
+			podHits[pod]++
+			mu.Unlock()
+		}()
+	}
+
+	wg.Wait()
+
+	successRate := float64(success) / float64(total)
+	if successRate < 0.98 {
+		return fmt.Errorf("success rate %.2f below 0.98", successRate)
+	}
+	if len(podHits) < 2 {
+		return fmt.Errorf("expected hits on >=2 pods, got %d", len(podHits))
+	}
+	// Avoid shadowing the max/min builtins (Go 1.21+).
+	var maxHits, minHits int
+	for _, c := range podHits {
+		if c > maxHits {
+			maxHits = c
+		}
+		if minHits == 0 || c < minHits {
+			minHits = c
+		}
+	}
+	if minHits == 0 || float64(maxHits)/float64(minHits) > 2.0 {
+		return fmt.Errorf("pod hit imbalance max/min=%d/%d", maxHits, minHits)
+	}
+	if opts.SetDetails != nil {
+		opts.SetDetails(map[string]interface{}{
+			"success_rate": successRate,
+			"total":        total,
+			"pod_hits":     podHits,
+		})
+	}
+	return nil
+}
diff --git
a/e2e/testcases/llmd_failover_recovery.go b/e2e/testcases/llmd_failover_recovery.go new file mode 100644 index 000000000..aa4fefde8 --- /dev/null +++ b/e2e/testcases/llmd_failover_recovery.go @@ -0,0 +1,99 @@ +package testcases + +import ( + "context" + "fmt" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("llmd-failover-recovery", pkgtestcases.TestCase{ + Description: "Traffic survives backend pod loss", + Tags: []string{"llmd", "failover"}, + Fn: llmdFailover, + }) +} + +func llmdFailover(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) error { + pods, err := client.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "app=phi4-mini"}) + if err != nil { + return err + } + if len(pods.Items) < 2 { + return fmt.Errorf("need >=2 phi4-mini pods for failover, got %d", len(pods.Items)) + } + target := pods.Items[0].Name + if err := client.CoreV1().Pods("default").Delete(ctx, target, metav1.DeleteOptions{}); err != nil { + return err + } + deleteTime := time.Now() + + time.Sleep(5 * time.Second) + + localPort, stop, err := setupServiceConnection(ctx, client, opts) + if err != nil { + return err + } + defer stop() + + deadline := time.Now().Add(60 * time.Second) + total := 0 + success := 0 + podHits := map[string]int{} + var recoveredAt time.Time + + for time.Now().Before(deadline) { + total++ + res, err := doLLMDChat(ctx, localPort, "phi4-mini", fmt.Sprintf("failover-%d", total), 45*time.Second) + if err == nil { + success++ + pod := getInferencePod(res.headers) + if pod == target { + return fmt.Errorf("traffic routed to deleted pod %s", target) + } + if pod != "" { + podHits[pod]++ + } + if recoveredAt.IsZero() { + recoveredAt = time.Now() + } + } + time.Sleep(1 * time.Second) + } + rate := float64(success) / float64(total) + if rate < 0.95 { + return fmt.Errorf("success rate %.2f below 0.95", rate) + } + if len(podHits) == 0 { + ep, err := client.CoreV1().Endpoints("default").Get(ctx, "phi4-mini", metav1.GetOptions{}) + if err != nil { + return err + } + for _, s := range ep.Subsets { + for _, a := range s.Addresses { + if a.TargetRef != nil && a.TargetRef.Name == target { + return fmt.Errorf("deleted pod still present in endpoints %s", target) + } + } + } + } + recoverySeconds := time.Since(deleteTime).Seconds() + if !recoveredAt.IsZero() { + recoverySeconds = recoveredAt.Sub(deleteTime).Seconds() + } + if opts.SetDetails != nil { + opts.SetDetails(map[string]interface{}{ + "deleted_pod": target, + "success": success, + "total": total, + "success_rate": rate, + "pod_hits": podHits, + "recovery_seconds": recoverySeconds, + }) + } + return nil +} diff --git a/e2e/testcases/llmd_health_check.go b/e2e/testcases/llmd_health_check.go new file mode 100644 index 000000000..7d3dd7ddf --- /dev/null +++ b/e2e/testcases/llmd_health_check.go @@ -0,0 +1,90 @@ +package testcases + +import ( + "context" + "fmt" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("llmd-health-check", pkgtestcases.TestCase{ + Description: "LLM-D components readiness and basic chat call", + Tags: []string{"llmd", "health"}, + Fn: llmdHealth, + }) +} + +func llmdHealth(ctx context.Context, client *kubernetes.Clientset, opts 
pkgtestcases.TestCaseOptions) error { + ns := "default" + crdGVs := []string{ + "gateway.networking.k8s.io/v1", + "inference.networking.k8s.io/v1", + } + for _, gv := range crdGVs { + if err := ensureGroupVersion(ctx, client, gv); err != nil { + return err + } + } + + required := []struct { + ns string + dep string + }{ + {"vllm-semantic-router-system", "semantic-router"}, + {"istio-system", "istiod"}, + {ns, "llm-d-inference-scheduler-llama3-8b"}, + {ns, "llm-d-inference-scheduler-phi4-mini"}, + {ns, "vllm-llama3-8b-instruct"}, + {ns, "phi4-mini"}, + } + for _, r := range required { + if err := waitDeploymentReady(ctx, client, r.ns, r.dep, 1); err != nil { + return err + } + } + + localPort, stop, err := setupServiceConnection(ctx, client, opts) + if err != nil { + return err + } + defer stop() + + res, err := doLLMDChat(ctx, localPort, "llama3-8b", "ping", 30*time.Second) + if err != nil { + return err + } + + if opts.SetDetails != nil { + opts.SetDetails(map[string]interface{}{ + "deployments": len(required), + "crds": crdGVs, + "latency_ms": res.duration.Milliseconds(), + "x-selected-model": getSelectedModel(res.headers), + "x-inference-pod": getInferencePod(res.headers), + }) + } + return nil +} + +func waitDeploymentReady(ctx context.Context, client *kubernetes.Clientset, ns, name string, want int32) error { + return wait.PollUntilContextTimeout(ctx, 2*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { + dep, err := client.AppsV1().Deployments(ns).Get(ctx, name, v1.GetOptions{}) + if err != nil { + return false, err + } + return dep.Status.ReadyReplicas >= want, nil + }) +} + +func ensureGroupVersion(ctx context.Context, client *kubernetes.Clientset, gv string) error { + _, err := client.Discovery().ServerResourcesForGroupVersion(gv) + if err != nil { + return fmt.Errorf("CRD groupVersion %s not present: %w", gv, err) + } + return nil +} diff --git a/e2e/testcases/llmd_helpers.go b/e2e/testcases/llmd_helpers.go new file mode 100644 index 000000000..e0d5aa5cd --- /dev/null +++ b/e2e/testcases/llmd_helpers.go @@ -0,0 +1,71 @@ +package testcases + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "sort" + "strings" + "time" +) + +type chatResult struct { + headers http.Header + duration time.Duration +} + +// doLLMDChat sends a chat completion request to the forwarded service and returns headers + latency. 
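+// A non-200 status is surfaced as an error with a truncated body snippet, so
+// callers can fail fast without parsing the response payload.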
+func doLLMDChat(ctx context.Context, port, model, content string, timeout time.Duration) (chatResult, error) { + body := map[string]interface{}{ + "model": model, + "messages": []map[string]string{ + {"role": "user", "content": content}, + }, + } + data, _ := json.Marshal(body) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://localhost:%s/v1/chat/completions", port), bytes.NewBuffer(data)) + if err != nil { + return chatResult{}, err + } + req.Header.Set("Content-Type", "application/json") + + client := &http.Client{Timeout: timeout} + start := time.Now() + resp, err := client.Do(req) + duration := time.Since(start) + if err != nil { + return chatResult{}, err + } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + if resp.StatusCode != http.StatusOK { + return chatResult{}, fmt.Errorf("chat failed: %d %s", resp.StatusCode, truncateString(string(b), 120)) + } + return chatResult{headers: resp.Header, duration: duration}, nil +} + +func getInferencePod(headers http.Header) string { + return strings.TrimSpace(headers.Get("x-inference-pod")) +} + +func getSelectedModel(headers http.Header) string { + v := strings.TrimSpace(headers.Get("x-vsr-selected-model")) + if v != "" { + return v + } + return strings.TrimSpace(headers.Get("x-selected-model")) +} + +func percentileDuration(ds []time.Duration, p float64) time.Duration { + if len(ds) == 0 { + return 0 + } + sorted := make([]time.Duration, len(ds)) + copy(sorted, ds) + sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] }) + idx := int(float64(len(sorted)-1) * p) + return sorted[idx] +} diff --git a/e2e/testcases/llmd_performance_baseline.go b/e2e/testcases/llmd_performance_baseline.go new file mode 100644 index 000000000..8edf36899 --- /dev/null +++ b/e2e/testcases/llmd_performance_baseline.go @@ -0,0 +1,109 @@ +package testcases + +import ( + "context" + "fmt" + "sync" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("llmd-performance-baseline", pkgtestcases.TestCase{ + Description: "Measure success rate under moderate concurrency", + Tags: []string{"llmd", "perf"}, + Fn: llmdPerf, + }) +} + +type perfResult struct { + concurrency int + success int + total int + durations []time.Duration +} + +func llmdPerf(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) error { + localPort, stop, err := setupServiceConnection(ctx, client, opts) + if err != nil { + return err + } + defer stop() + + stages := []int{15, 30, 60} + results := []perfResult{} + + for _, conc := range stages { + res := runPerfStage(ctx, localPort, conc, 20*time.Second) + if float64(res.success)/float64(res.total) < 0.95 { + return fmt.Errorf("stage %d success %d/%d", conc, res.success, res.total) + } + results = append(results, res) + time.Sleep(2 * time.Second) + } + + if opts.SetDetails != nil { + summary := map[string]interface{}{} + for _, r := range results { + p50, p95 := percentileDuration(r.durations, 0.5), percentileDuration(r.durations, 0.95) + key := fmt.Sprintf("c%d", r.concurrency) + summary[key] = map[string]interface{}{ + "success": r.success, + "total": r.total, + "success_rate": float64(r.success) / float64(r.total), + "p50_ms": p50.Milliseconds(), + "p95_ms": p95.Milliseconds(), + } + } + opts.SetDetails(summary) + } + return nil +} + +func runPerfStage(ctx context.Context, port string, conc int, duration time.Duration) perfResult { + res := 
perfResult{concurrency: conc} + stageCtx, cancel := context.WithTimeout(ctx, duration) + defer cancel() + + var wg sync.WaitGroup + var mu sync.Mutex + + for i := 0; i < conc; i++ { + i := i + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case <-stageCtx.Done(): + return + default: + } + resItem, err := doLLMDChat(stageCtx, port, "auto", fmt.Sprintf("perf-%d-%d", conc, i), 60*time.Second) + if err != nil && stageCtx.Err() != nil { + continue + } + // On transient failure, retry once within stage window + if err != nil { + resItem2, err2 := doLLMDChat(stageCtx, port, "auto", fmt.Sprintf("perf-%d-%d", conc, i), 60*time.Second) + if err2 == nil { + err = nil + resItem = resItem2 + } + } + mu.Lock() + res.total++ + if err == nil { + res.success++ + res.durations = append(res.durations, resItem.duration) + } + mu.Unlock() + } + }() + } + + wg.Wait() + return res +} diff --git a/tools/make/e2e.mk b/tools/make/e2e.mk index cd8981b89..e417053ce 100644 --- a/tools/make/e2e.mk +++ b/tools/make/e2e.mk @@ -96,6 +96,7 @@ e2e-help: ## Show help for E2E testing @echo "Available Profiles:" @echo " ai-gateway - Test Semantic Router with Envoy AI Gateway" @echo " aibrix - Test Semantic Router with vLLM AIBrix" + @echo " llm-d - Test Semantic Router with LLM-D" @echo " istio - Test Semantic Router with Istio (coming soon)" @echo "" @echo "Environment Variables:" @@ -127,4 +128,3 @@ e2e-help: ## Show help for E2E testing @echo " 2. make e2e-test-only # Run all tests" @echo " 3. make e2e-test-only E2E_TESTS=\"test1\" # Run specific test" @echo " 4. make e2e-cleanup # Clean up when done" -
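
Note: for a quick manual check of the routing path these manifests wire up, outside the Go test harness, the following standalone sketch exercises the port-forwarded gateway the same way doLLMDChat does. It assumes the profile's 8080:80 port-forward to inference-gateway-istio is active and uses the header names from the manifests and helpers above; it is illustrative only, not part of the patch.

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// "auto" lets Semantic Router pick the model; per values.yaml, a math
	// prompt should be routed to phi4-mini.
	body := []byte(`{"model":"auto","messages":[{"role":"user","content":"What is 2+2?"}]}`)
	req, err := http.NewRequest(http.MethodPost, "http://localhost:8080/v1/chat/completions", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// x-selected-model drives the header-based HTTPRoute match; the llm-d
	// inference sim reports the serving pod in x-inference-pod.
	fmt.Println("status:        ", resp.Status)
	fmt.Println("selected model:", resp.Header.Get("x-selected-model"))
	fmt.Println("inference pod: ", resp.Header.Get("x-inference-pod"))
}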