From f3998d59aae3e689210bff02f433b1299398ccff Mon Sep 17 00:00:00 2001 From: samzong Date: Fri, 21 Nov 2025 00:40:00 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat(e2e/llm-d):=20add=20LLM-D=20pr?= =?UTF-8?q?ofile=20and=20test=20cases?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: samzong --- .github/workflows/integration-test-k8s.yml | 5 +- e2e/README.md | 2 +- e2e/cmd/e2e/main.go | 4 + .../llm-d/manifests/httproute-services.yaml | 51 ++ .../llm-d/manifests/inference-sim.yaml | 101 ++++ e2e/profiles/llm-d/manifests/rbac.yaml | 27 + e2e/profiles/llm-d/profile.go | 562 ++++++++++++++++++ e2e/profiles/llm-d/values.yaml | 69 +++ e2e/testcases/llmd_auto_routing.go | 61 ++ e2e/testcases/llmd_distributed_inference.go | 97 +++ e2e/testcases/llmd_failover_recovery.go | 99 +++ e2e/testcases/llmd_health_check.go | 90 +++ e2e/testcases/llmd_helpers.go | 71 +++ e2e/testcases/llmd_performance_baseline.go | 109 ++++ tools/make/e2e.mk | 2 +- 15 files changed, 1345 insertions(+), 5 deletions(-) create mode 100644 e2e/profiles/llm-d/manifests/httproute-services.yaml create mode 100644 e2e/profiles/llm-d/manifests/inference-sim.yaml create mode 100644 e2e/profiles/llm-d/manifests/rbac.yaml create mode 100644 e2e/profiles/llm-d/profile.go create mode 100644 e2e/profiles/llm-d/values.yaml create mode 100644 e2e/testcases/llmd_auto_routing.go create mode 100644 e2e/testcases/llmd_distributed_inference.go create mode 100644 e2e/testcases/llmd_failover_recovery.go create mode 100644 e2e/testcases/llmd_health_check.go create mode 100644 e2e/testcases/llmd_helpers.go create mode 100644 e2e/testcases/llmd_performance_baseline.go diff --git a/.github/workflows/integration-test-k8s.yml b/.github/workflows/integration-test-k8s.yml index aa1d14109..947f8c397 100644 --- a/.github/workflows/integration-test-k8s.yml +++ b/.github/workflows/integration-test-k8s.yml @@ -12,11 +12,11 @@ on: jobs: integration-test: runs-on: ubuntu-latest - timeout-minutes: 60 + timeout-minutes: 75 strategy: fail-fast: false # Continue testing other profiles even if one fails matrix: - profile: [ai-gateway, aibrix] + profile: [ai-gateway, aibrix, llm-d] steps: - name: Check out the repo @@ -159,4 +159,3 @@ jobs: if: always() run: | make e2e-cleanup || true - diff --git a/e2e/README.md b/e2e/README.md index 6e977e5d6..ec5cbf6fd 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -16,7 +16,7 @@ The framework follows a **separation of concerns** design: - **aibrix**: Tests Semantic Router with vLLM AIBrix integration - **istio**: Tests Semantic Router with Istio Gateway (future) - **production-stack**: Tests vLLM Production Stack configurations (future) -- **llm-d**: Tests with LLM-D (future) +- **llm-d**: Tests Semantic Router with LLM-D distributed inference - **dynamo**: Tests with Nvidia Dynamo (future) ## Directory Structure diff --git a/e2e/cmd/e2e/main.go b/e2e/cmd/e2e/main.go index 54ff691f0..5dca46c7a 100644 --- a/e2e/cmd/e2e/main.go +++ b/e2e/cmd/e2e/main.go @@ -12,10 +12,12 @@ import ( aigateway "github.com/vllm-project/semantic-router/e2e/profiles/ai-gateway" aibrix "github.com/vllm-project/semantic-router/e2e/profiles/aibrix" dynamicconfig "github.com/vllm-project/semantic-router/e2e/profiles/dynamic-config" + llmd "github.com/vllm-project/semantic-router/e2e/profiles/llm-d" // Import profiles to register test cases _ "github.com/vllm-project/semantic-router/e2e/profiles/ai-gateway" _ "github.com/vllm-project/semantic-router/e2e/profiles/aibrix" + _ 
"github.com/vllm-project/semantic-router/e2e/profiles/llm-d" ) const version = "v1.0.0" @@ -103,6 +105,8 @@ func getProfile(name string) (framework.Profile, error) { return dynamicconfig.NewProfile(), nil case "aibrix": return aibrix.NewProfile(), nil + case "llm-d": + return llmd.NewProfile(), nil // Add more profiles here as they are implemented // case "istio": // return istio.NewProfile(), nil diff --git a/e2e/profiles/llm-d/manifests/httproute-services.yaml b/e2e/profiles/llm-d/manifests/httproute-services.yaml new file mode 100644 index 000000000..8eed2015b --- /dev/null +++ b/e2e/profiles/llm-d/manifests/httproute-services.yaml @@ -0,0 +1,51 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: vsr-llama8b-svc + namespace: default +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-llama3-8b-instruct + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: x-selected-model + value: llama3-8b + timeouts: + request: 300s +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: vsr-phi4-mini-svc + namespace: default +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.k8s.io + kind: InferencePool + name: vllm-phi4-mini + matches: + - path: + type: PathPrefix + value: / + headers: + - type: Exact + name: x-selected-model + value: phi4-mini + timeouts: + request: 300s diff --git a/e2e/profiles/llm-d/manifests/inference-sim.yaml b/e2e/profiles/llm-d/manifests/inference-sim.yaml new file mode 100644 index 000000000..91c8e221a --- /dev/null +++ b/e2e/profiles/llm-d/manifests/inference-sim.yaml @@ -0,0 +1,101 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + containers: + - name: sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.6.1 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + args: + - --model + - llama3-8b + - --port + - "8000" + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama3-8b-instruct + namespace: default + labels: + app: vllm-llama3-8b-instruct +spec: + type: ClusterIP + selector: + app: vllm-llama3-8b-instruct + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi4-mini + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: phi4-mini + template: + metadata: + labels: + app: phi4-mini + spec: + containers: + - name: sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.6.1 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + args: + - --model + - phi4-mini + - --port + - "8000" + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: phi4-mini + namespace: default + labels: + app: phi4-mini +spec: + type: ClusterIP + selector: + app: phi4-mini + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP diff --git a/e2e/profiles/llm-d/manifests/rbac.yaml 
b/e2e/profiles/llm-d/manifests/rbac.yaml new file mode 100644 index 000000000..60e4d6774 --- /dev/null +++ b/e2e/profiles/llm-d/manifests/rbac.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: llmd-epp-access +rules: + - apiGroups: ["inference.networking.k8s.io", "inference.networking.x-k8s.io"] + resources: ["inferencepools", "inferenceobjectives"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: llmd-epp-access-binding +subjects: + - kind: ServiceAccount + name: vllm-llama3-8b-instruct-epp + namespace: default + - kind: ServiceAccount + name: vllm-phi4-mini-epp + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: llmd-epp-access diff --git a/e2e/profiles/llm-d/profile.go b/e2e/profiles/llm-d/profile.go new file mode 100644 index 000000000..277719244 --- /dev/null +++ b/e2e/profiles/llm-d/profile.go @@ -0,0 +1,562 @@ +package llmd + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "time" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + + "github.com/vllm-project/semantic-router/e2e/pkg/framework" + "github.com/vllm-project/semantic-router/e2e/pkg/helm" + "github.com/vllm-project/semantic-router/e2e/pkg/helpers" + + _ "github.com/vllm-project/semantic-router/e2e/testcases" +) + +const ( + kindNamespace = "default" + semanticNamespace = "vllm-semantic-router-system" + gatewayNamespace = "istio-system" + inferenceGatewayName = "inference-gateway" + istioVersion = "1.28.0" + gatewayCRDURL = "https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.0/standard-install.yaml" + inferenceCRDURL = "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.1.0/manifests.yaml" +) + +type Profile struct { + verbose bool + useExisting bool + skipSetup bool + versions struct { + istio string + gateway string + inference string + inferenceController string + } +} + +func NewProfile() *Profile { + p := &Profile{} + p.versions.istio = istioVersion + p.versions.gateway = gatewayCRDURL + p.versions.inference = inferenceCRDURL + ctrlURL := os.Getenv("GAIE_CONTROLLER_URL") + if ctrlURL == "" { + ctrlURL = inferenceCRDURL + } + p.versions.inferenceController = ctrlURL + return p +} + +func (p *Profile) Name() string { + return "llm-d" +} + +func (p *Profile) Description() string { + return "Tests Semantic Router with LLM-D distributed inference" +} + +func (p *Profile) Setup(ctx context.Context, opts *framework.SetupOptions) error { + p.verbose = opts.Verbose + p.useExisting = strings.EqualFold(os.Getenv("LLMD_USE_EXISTING"), "true") || os.Getenv("LLMD_USE_EXISTING") == "1" + p.skipSetup = strings.EqualFold(os.Getenv("LLMD_SKIP_SETUP"), "true") || os.Getenv("LLMD_SKIP_SETUP") == "1" + + fmt.Printf("[Profile] llm-d setup start (istio=%s, gatewayCRD=%s, inferenceCRD=%s, controller=%s, useExisting=%v, skipSetup=%v)\n", + p.versions.istio, p.versions.gateway, p.versions.inference, p.versions.inferenceController, p.useExisting, p.skipSetup) + + if p.skipSetup { + fmt.Println("[Profile] LLMD_SKIP_SETUP set; skipping deploy steps, running verification only") + return p.verifyEnvironment(ctx, opts) + } + + rollback := []func(){} + rollbackAll := func() { + for i := len(rollback) - 1; i >= 0; i-- { + rollback[i]() + } + } + 
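+	// Rollback closures are pushed in install order and executed in reverse
+	// (LIFO) on failure, so later steps are undone before their dependencies.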
+ istioctlPath, err := p.ensureIstioctl(ctx) + if err != nil { + return err + } + if p.verbose { + fmt.Printf("[Profile] istioctl ready at %s\n", istioctlPath) + } + + if err := p.kubectlApply(ctx, gatewayCRDURL); err != nil { + return fmt.Errorf("gateway CRDs: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, gatewayCRDURL) }) + if p.verbose { + fmt.Println("[Profile] applied gateway CRDs") + } + if err := p.kubectlApply(ctx, inferenceCRDURL); err != nil { + rollbackAll() + return fmt.Errorf("inference CRDs: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, inferenceCRDURL) }) + if p.verbose { + fmt.Println("[Profile] applied inference CRDs") + } + + if err := p.installIstio(ctx, istioctlPath); err != nil { + rollbackAll() + return fmt.Errorf("install istio: %w", err) + } + rollback = append(rollback, func() { _ = p.uninstallIstio(ctx) }) + if p.verbose { + fmt.Println("[Profile] istio installed") + } + + if err := p.deploySemanticRouter(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("deploy semantic router: %w", err) + } + rollback = append(rollback, func() { + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + _ = deployer.Uninstall(ctx, "semantic-router", semanticNamespace) + }) + if p.verbose { + fmt.Println("[Profile] semantic-router deployed") + } + + if err := p.deployInferenceSim(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("deploy inference sim: %w", err) + } + rollback = append(rollback, func() { _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml") }) + if p.verbose { + fmt.Println("[Profile] inference simulators deployed") + } + + if err := p.deployLLMD(ctx); err != nil { + rollbackAll() + return fmt.Errorf("deploy llm-d resources: %w", err) + } + rollback = append(rollback, func() { + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml") + }) + if p.verbose { + fmt.Println("[Profile] llm-d schedulers and pools deployed") + } + + if err := p.deployGatewayRoutes(ctx); err != nil { + rollbackAll() + return fmt.Errorf("deploy gateway routes: %w", err) + } + rollback = append(rollback, func() { + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/envoyfilter.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/destinationrule.yaml") + _ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml") + _ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/gateway.yaml") + }) + if p.verbose { + fmt.Println("[Profile] gateway routes deployed") + } + + if err := p.waitHTTPRouteAccepted(ctx, "vsr-llama8b-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteResolvedRefs(ctx, "vsr-llama8b-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteAccepted(ctx, "vsr-phi4-mini-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + if err := p.waitHTTPRouteResolvedRefs(ctx, "vsr-phi4-mini-svc", "default", 2*time.Minute); err != nil { + rollbackAll() + return err + } + + if err := p.verifyEnvironment(ctx, opts); err != nil { + rollbackAll() + return fmt.Errorf("verify environment: %w", err) + } + + 
if p.verbose {
+		fmt.Println("[Profile] llm-d setup complete")
+	}
+	return nil
+}
+
+func (p *Profile) Teardown(ctx context.Context, opts *framework.TeardownOptions) error {
+	p.verbose = opts.Verbose
+	fmt.Println("[Profile] llm-d teardown start")
+	_ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml")
+	_ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml")
+	_ = p.kubectlDelete(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/envoyfilter.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/destinationrule.yaml")
+	_ = p.kubectlDelete(ctx, "deploy/kubernetes/istio/gateway.yaml")
+
+	deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose)
+	_ = deployer.Uninstall(ctx, "semantic-router", semanticNamespace)
+
+	_ = p.uninstallIstio(ctx)
+	_ = p.kubectlDelete(ctx, gatewayCRDURL)
+	_ = p.kubectlDelete(ctx, inferenceCRDURL)
+	fmt.Println("[Profile] llm-d teardown complete")
+
+	return nil
+}
+
+func (p *Profile) GetTestCases() []string {
+	tests := []string{
+		"llmd-health-check",
+		"llmd-distributed-inference",
+		"llmd-auto-routing",
+		"llmd-failover-recovery",
+		"llmd-performance-baseline",
+	}
+	if strings.EqualFold(os.Getenv("LLMD_PERF_SKIP"), "true") || os.Getenv("LLMD_PERF_SKIP") == "1" {
+		var filtered []string
+		for _, t := range tests {
+			if t == "llmd-performance-baseline" {
+				continue
+			}
+			filtered = append(filtered, t)
+		}
+		if p.verbose {
+			fmt.Println("[Profile] LLMD_PERF_SKIP set; skipping llmd-performance-baseline test")
+		}
+		return filtered
+	}
+	return tests
+}
+
+func (p *Profile) GetServiceConfig() framework.ServiceConfig {
+	return framework.ServiceConfig{
+		Name:        "inference-gateway-istio",
+		Namespace:   kindNamespace,
+		PortMapping: "8080:80",
+	}
+}
+
+func (p *Profile) ensureIstioctl(ctx context.Context) (string, error) {
+	if path, err := exec.LookPath("istioctl"); err == nil {
+		return path, nil
+	}
+
+	osPart := runtime.GOOS
+	if osPart == "darwin" {
+		osPart = "osx"
+	}
+	arch := runtime.GOARCH
+	platform := fmt.Sprintf("%s-%s", osPart, arch)
+
+	cacheDir := filepath.Join(os.TempDir(), "istioctl-"+istioVersion+"-"+platform)
+	bin := filepath.Join(cacheDir, "istioctl")
+	if _, err := os.Stat(bin); err == nil {
+		return bin, nil
+	}
+
+	if err := os.MkdirAll(cacheDir, 0o755); err != nil {
+		return "", err
+	}
+
+	url := fmt.Sprintf("https://github.com/istio/istio/releases/download/%s/istioctl-%s-%s.tar.gz", istioVersion, istioVersion, platform)
+	tgz := filepath.Join(cacheDir, "istioctl.tgz")
+
+	if err := p.runCmd(ctx, "curl", "-fL", "-o", tgz, url); err != nil {
+		return "", err
+	}
+	if err := p.runCmd(ctx, "tar", "-xzf", tgz, "-C", cacheDir); err != nil {
+		return "", err
+	}
+	if err := os.Chmod(bin, 0o755); err != nil {
+		return "", err
+	}
+	return bin, nil
+}
+
+func (p *Profile) installIstio(ctx context.Context, istioctl string) error {
+	return p.runCmd(ctx, istioctl, "install", "-y", "--set", "profile=minimal", "--set", "values.pilot.env.ENABLE_GATEWAY_API=true", "--set", "values.pilot.env.ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true")
+}
+
+func (p *Profile) uninstallIstio(ctx context.Context) error {
+	istioctl, err :=
exec.LookPath("istioctl") + if err != nil { + return nil + } + return p.runCmd(ctx, istioctl, "x", "uninstall", "--purge", "-y") +} + +func (p *Profile) deploySemanticRouter(ctx context.Context, opts *framework.SetupOptions) error { + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + installOpts := helm.InstallOptions{ + ReleaseName: "semantic-router", + Chart: "deploy/helm/semantic-router", + Namespace: semanticNamespace, + ValuesFiles: []string{"e2e/profiles/llm-d/values.yaml"}, + Set: map[string]string{ + "image.repository": "ghcr.io/vllm-project/semantic-router/extproc", + "image.tag": opts.ImageTag, + "image.pullPolicy": "Never", + }, + Wait: true, + Timeout: "20m", + } + if err := deployer.Install(ctx, installOpts); err != nil { + return err + } + return deployer.WaitForDeployment(ctx, semanticNamespace, "semantic-router", 10*time.Minute) +} + +func (p *Profile) deployInferenceSim(ctx context.Context, opts *framework.SetupOptions) error { + return p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/inference-sim.yaml") +} + +func (p *Profile) deployLLMD(ctx context.Context) error { + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/inferencepool-llama.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/inferencepool-phi4.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/rbac.yaml"); err != nil { + return err + } + return nil +} + +func (p *Profile) deployGatewayRoutes(ctx context.Context) error { + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/gateway.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "e2e/profiles/llm-d/manifests/httproute-services.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/destinationrule.yaml"); err != nil { + return err + } + if err := p.kubectlApply(ctx, "deploy/kubernetes/istio/envoyfilter.yaml"); err != nil { + return err + } + // Ensure EnvoyFilter ext-proc matches Gateway listener context for this e2e run + _ = p.patchEnvoyFilterForGateway(ctx) + return nil +} + +func (p *Profile) verifyEnvironment(ctx context.Context, opts *framework.SetupOptions) error { + config, err := clientcmd.BuildConfigFromFlags("", opts.KubeConfig) + if err != nil { + return err + } + client, err := kubernetes.NewForConfig(config) + if err != nil { + return err + } + + // Verify required CRDs/APIs from Gateway API and Inference Extension are registered. 
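+	// The checks below use API discovery (ServerResourcesForGroupVersion), so
+	// they confirm CRD registration without needing RBAC on the resources themselves.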
+ type apiCheck struct { + groupVersion string + expectedResources []string + optional bool + } + checkAPIGroup := func(c apiCheck) error { + resources, err := client.Discovery().ServerResourcesForGroupVersion(c.groupVersion) + if err != nil { + if c.optional { + if p.verbose { + fmt.Printf("[Verify] API group %s not found (optional): %v\n", c.groupVersion, err) + } + return nil + } + return fmt.Errorf("discover %s: %w", c.groupVersion, err) + } + found := make(map[string]bool, len(resources.APIResources)) + for _, r := range resources.APIResources { + found[r.Name] = true + } + for _, r := range c.expectedResources { + if !found[r] { + if c.optional { + if p.verbose { + fmt.Printf("[Verify] Missing optional resource %s in %s\n", r, c.groupVersion) + } + return nil + } + return fmt.Errorf("missing %s in %s", r, c.groupVersion) + } + } + if p.verbose { + fmt.Printf("[Verify] API group %s present with %v\n", c.groupVersion, c.expectedResources) + } + return nil + } + + for _, c := range []apiCheck{ + {groupVersion: "gateway.networking.k8s.io/v1", expectedResources: []string{"gateways", "httproutes"}}, + {groupVersion: "inference.networking.k8s.io/v1", expectedResources: []string{"inferencepools"}}, + // EndpointPickerConfig CRD is optional in some environments; treat as best-effort. + {groupVersion: "inference.networking.x-k8s.io/v1alpha1", expectedResources: []string{"endpointpickerconfigs"}, optional: true}, + } { + if err := checkAPIGroup(c); err != nil { + return err + } + } + + // endpoints readiness check moved after deployments ready + + // Actively wait for critical deployments to become Available before checking readiness counts. + // This avoids flakiness when resources are still pulling images just after creation. + deployer := helm.NewDeployer(opts.KubeConfig, opts.Verbose) + deploymentsToWait := []struct { + ns, name string + }{ + {semanticNamespace, "semantic-router"}, + {gatewayNamespace, "istiod"}, + {"default", "vllm-llama3-8b-instruct"}, + {"default", "phi4-mini"}, + {"default", "llm-d-inference-scheduler-llama3-8b"}, + {"default", "llm-d-inference-scheduler-phi4-mini"}, + {"default", "inference-gateway-istio"}, + } + for _, d := range deploymentsToWait { + if err := deployer.WaitForDeployment(ctx, d.ns, d.name, 10*time.Minute); err != nil { + return fmt.Errorf("wait for deployment %s/%s: %w", d.ns, d.name, err) + } + } + + if err := helpers.CheckDeployment(ctx, client, semanticNamespace, "semantic-router", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, gatewayNamespace, "istiod", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "vllm-llama3-8b-instruct", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "phi4-mini", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "llm-d-inference-scheduler-llama3-8b", p.verbose); err != nil { + return err + } + if err := helpers.CheckDeployment(ctx, client, "default", "llm-d-inference-scheduler-phi4-mini", p.verbose); err != nil { + return err + } + if err := helpers.VerifyServicePodsRunning(ctx, client, "default", "inference-gateway-istio", p.verbose); err != nil { + return err + } + if err := p.checkInferencePoolEndpointReady(ctx, client, "default", "vllm-llama3-8b-instruct", 2*time.Minute); err != nil { + return err + } + if err := p.checkInferencePoolEndpointReady(ctx, client, "default", "phi4-mini", 2*time.Minute); err != nil { + 
return err + } + return nil +} + +// Note: GAIE controller is shipped by some providers (e.g., kgateway, nginx-gateway) or via provider-specific enable flags. +// For Istio-based profile we rely on pilot env ENABLE_GATEWAY_API_INFERENCE_EXTENSION=true instead of a standalone controller manifest. + +func (p *Profile) runCmdOutput(ctx context.Context, name string, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, name, args...) + out, err := cmd.CombinedOutput() + if err != nil { + return "", err + } + return string(out), nil +} + +func (p *Profile) waitHTTPRouteAccepted(ctx context.Context, name, ns string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, err := p.runCmdOutput(ctx, "kubectl", "get", "httproute", name, "-n", ns, "-o", "jsonpath={.status.parents[*].conditions[?(@.type==\"Accepted\")].status}") + if err == nil && strings.Contains(out, "True") { + return nil + } + time.Sleep(2 * time.Second) + } + if p.verbose { + _ = p.runCmd(ctx, "kubectl", "-n", "gateway-inference-system", "logs", "deploy/gateway-api-inference-extension-controller", "--tail=100") + _ = p.runCmd(ctx, "kubectl", "-n", "default", "logs", "deploy/inference-gateway-istio", "--tail=100") + } + return fmt.Errorf("HTTPRoute %s/%s not Accepted", ns, name) +} + +func (p *Profile) waitHTTPRouteResolvedRefs(ctx context.Context, name, ns string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, err := p.runCmdOutput(ctx, "kubectl", "get", "httproute", name, "-n", ns, "-o", "jsonpath={.status.parents[*].conditions[?(@.type==\"ResolvedRefs\")].status}") + if err == nil && strings.Contains(out, "True") { + return nil + } + time.Sleep(2 * time.Second) + } + if p.verbose { + _ = p.runCmd(ctx, "kubectl", "-n", "gateway-inference-system", "logs", "deploy/gateway-api-inference-extension-controller", "--tail=100") + } + return fmt.Errorf("HTTPRoute %s/%s not ResolvedRefs", ns, name) +} + +func (p *Profile) checkInferencePoolEndpointReady(ctx context.Context, client *kubernetes.Clientset, ns, name string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + ep, err := client.CoreV1().Endpoints(ns).Get(ctx, name, v1.GetOptions{}) + if err != nil { + return err + } + addrs := 0 + for _, s := range ep.Subsets { + addrs += len(s.Addresses) + } + if addrs > 0 { + return nil + } + time.Sleep(2 * time.Second) + } + return fmt.Errorf("endpoints %s/%s empty", ns, name) +} + +func (p *Profile) runCmd(ctx context.Context, name string, args ...string) error { + cmd := exec.CommandContext(ctx, name, args...) 
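+	// Verbose mode streams subprocess output into the test logs; otherwise
+	// Stdout/Stderr remain nil and the output is discarded.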
+ if p.verbose { + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + } + return cmd.Run() +} + +func (p *Profile) kubectlApply(ctx context.Context, target string) error { + return p.runCmd(ctx, "kubectl", "apply", "-f", target) +} + +func (p *Profile) kubectlDelete(ctx context.Context, target string) error { + return p.runCmd(ctx, "kubectl", "delete", "-f", target, "--ignore-not-found") +} +func (p *Profile) patchEnvoyFilterForGateway(ctx context.Context) error { + // Add match.context=GATEWAY and listener.portNumber=80 to the first configPatch via JSON patch + patch := `[ + {"op":"add","path":"/spec/configPatches/0/match/context","value":"GATEWAY"}, + {"op":"add","path":"/spec/configPatches/0/match/listener/portNumber","value":80} + ]` + return p.runCmd(ctx, "kubectl", "-n", "default", "patch", "envoyfilter", "semantic-router", "--type=json", "-p", patch) +} diff --git a/e2e/profiles/llm-d/values.yaml b/e2e/profiles/llm-d/values.yaml new file mode 100644 index 000000000..21ea100fa --- /dev/null +++ b/e2e/profiles/llm-d/values.yaml @@ -0,0 +1,69 @@ +# Profile revision 2025-11-21: math -> phi4-mini; cs/default -> llama3-8b; removed global HTTPRoute catch-all. +config: + # Allow Envoy to re-run route matching after Semantic Router sets x-selected-model. + # Without this, Gateway API routes that depend on that header won't be chosen and return 404. + clear_route_cache: true + default_model: llama3-8b + # Enable domain classification to return the x-vsr-selected-category header + classifier: + category_model: + model_id: models/category_classifier_modernbert-base_model + threshold: 0.6 + use_modernbert: true + category_mapping_path: models/category_classifier_modernbert-base_model/category_mapping.json + pii_model: + model_id: "" + threshold: 1.0 + use_modernbert: false + pii_mapping_path: "" + # Optional domains used by decision rules + categories: + - name: math + description: "Mathematics, arithmetic, calculation" + - name: computer science + description: "Computer networks, programming, systems" + decisions: + - name: math_route + priority: 20 + rules: + operator: OR + conditions: + - type: domain + name: math + modelRefs: + - model: phi4-mini + use_reasoning: false + - name: cs_route + priority: 10 + rules: + operator: OR + conditions: + - type: domain + name: computer science + modelRefs: + - model: llama3-8b + use_reasoning: false + - name: default_route + priority: 1 + rules: + operator: OR + conditions: + - type: domain + name: other + modelRefs: + - model: llama3-8b + use_reasoning: false + semantic_cache: + enabled: false + prompt_guard: + enabled: false + tools: + enabled: false + bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +# Keep consistent with the default chart: initContainer, model downloads, and PVC use chart defaults +image: + pullPolicy: IfNotPresent diff --git a/e2e/testcases/llmd_auto_routing.go b/e2e/testcases/llmd_auto_routing.go new file mode 100644 index 000000000..dd38f98a3 --- /dev/null +++ b/e2e/testcases/llmd_auto_routing.go @@ -0,0 +1,61 @@ +package testcases + +import ( + "context" + "fmt" + "strings" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("llmd-auto-routing", pkgtestcases.TestCase{ + Description: "Auto model selection routes math and cs", + Tags: []string{"llmd", "routing"}, + Fn: llmdAutoRouting, + }) +} + +func llmdAutoRouting(ctx context.Context, client *kubernetes.Clientset, opts 
pkgtestcases.TestCaseOptions) error {
+	localPort, stop, err := setupServiceConnection(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer stop()
+
+	cases := []struct {
+		prompt string
+		model  string
+	}{
+		{prompt: "What is 2+2?", model: "phi4-mini"},
+		{prompt: "Explain TCP three-way handshake", model: "llama3-8b"},
+	}
+
+	for _, c := range cases {
+		res, err := doLLMDChat(ctx, localPort, "auto", c.prompt, 45*time.Second)
+		if err != nil {
+			return err
+		}
+		selected := getSelectedModel(res.headers)
+		pod := getInferencePod(res.headers)
+		if selected == "" && pod != "" {
+			if strings.HasPrefix(pod, "phi4-mini-") {
+				selected = "phi4-mini"
+			} else if strings.HasPrefix(pod, "vllm-llama3-8b-instruct-") {
+				selected = "llama3-8b"
+			}
+		}
+		if selected != c.model {
+			return fmt.Errorf("prompt '%s' expected model %s got %s", c.prompt, c.model, selected)
+		}
+		if pod == "" {
+			return fmt.Errorf("missing x-inference-pod for prompt '%s'", c.prompt)
+		}
+	}
+	if opts.SetDetails != nil {
+		opts.SetDetails(map[string]interface{}{"cases": len(cases)})
+	}
+	return nil
+}
diff --git a/e2e/testcases/llmd_distributed_inference.go b/e2e/testcases/llmd_distributed_inference.go
new file mode 100644
index 000000000..4828936bf
--- /dev/null
+++ b/e2e/testcases/llmd_distributed_inference.go
@@ -0,0 +1,97 @@
+package testcases
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+func init() {
+	pkgtestcases.Register("llmd-distributed-inference", pkgtestcases.TestCase{
+		Description: "Verify multi-replica backends serve requests",
+		Tags:        []string{"llmd", "distributed"},
+		Fn:          llmdDistributed,
+	})
+}
+
+func llmdDistributed(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) error {
+	backendDeploys := []string{"vllm-llama3-8b-instruct", "phi4-mini"}
+	for _, name := range backendDeploys {
+		dep, err := client.AppsV1().Deployments("default").Get(ctx, name, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+		if dep.Status.ReadyReplicas < 2 {
+			return fmt.Errorf("%s ready replicas %d < 2", name, dep.Status.ReadyReplicas)
+		}
+	}
+
+	localPort, stop, err := setupServiceConnection(ctx, client, opts)
+	if err != nil {
+		return err
+	}
+	defer stop()
+
+	const total = 30
+	var (
+		success int
+		mu      sync.Mutex
+		podHits = map[string]int{}
+	)
+	var wg sync.WaitGroup
+
+	for i := 0; i < total; i++ {
+		i := i
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			res, err := doLLMDChat(ctx, localPort, "llama3-8b", fmt.Sprintf("req-%d", i), 60*time.Second)
+			if err != nil {
+				return
+			}
+			pod := getInferencePod(res.headers)
+			if pod == "" {
+				return
+			}
+			mu.Lock()
+			success++
+			podHits[pod]++
+			mu.Unlock()
+		}()
+	}
+
+	wg.Wait()
+
+	successRate := float64(success) / float64(total)
+	if successRate < 0.98 {
+		return fmt.Errorf("success rate %.2f below 0.98", successRate)
+	}
+	if len(podHits) < 2 {
+		return fmt.Errorf("expected hits on >=2 pods, got %d", len(podHits))
+	}
+	// Avoid shadowing the max/min builtins (Go 1.21+).
+	var maxHits, minHits int
+	for _, c := range podHits {
+		if c > maxHits {
+			maxHits = c
+		}
+		if minHits == 0 || c < minHits {
+			minHits = c
+		}
+	}
+	if minHits == 0 || float64(maxHits)/float64(minHits) > 2.0 {
+		return fmt.Errorf("pod hit imbalance max/min=%d/%d", maxHits, minHits)
+	}
+	if opts.SetDetails != nil {
+		opts.SetDetails(map[string]interface{}{
+			"success_rate": successRate,
+			"total":        total,
+			"pod_hits":     podHits,
+		})
+	}
+	return nil
+}
diff --git
a/e2e/testcases/llmd_failover_recovery.go b/e2e/testcases/llmd_failover_recovery.go new file mode 100644 index 000000000..aa4fefde8 --- /dev/null +++ b/e2e/testcases/llmd_failover_recovery.go @@ -0,0 +1,99 @@ +package testcases + +import ( + "context" + "fmt" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("llmd-failover-recovery", pkgtestcases.TestCase{ + Description: "Traffic survives backend pod loss", + Tags: []string{"llmd", "failover"}, + Fn: llmdFailover, + }) +} + +func llmdFailover(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) error { + pods, err := client.CoreV1().Pods("default").List(ctx, metav1.ListOptions{LabelSelector: "app=phi4-mini"}) + if err != nil { + return err + } + if len(pods.Items) < 2 { + return fmt.Errorf("need >=2 phi4-mini pods for failover, got %d", len(pods.Items)) + } + target := pods.Items[0].Name + if err := client.CoreV1().Pods("default").Delete(ctx, target, metav1.DeleteOptions{}); err != nil { + return err + } + deleteTime := time.Now() + + time.Sleep(5 * time.Second) + + localPort, stop, err := setupServiceConnection(ctx, client, opts) + if err != nil { + return err + } + defer stop() + + deadline := time.Now().Add(60 * time.Second) + total := 0 + success := 0 + podHits := map[string]int{} + var recoveredAt time.Time + + for time.Now().Before(deadline) { + total++ + res, err := doLLMDChat(ctx, localPort, "phi4-mini", fmt.Sprintf("failover-%d", total), 45*time.Second) + if err == nil { + success++ + pod := getInferencePod(res.headers) + if pod == target { + return fmt.Errorf("traffic routed to deleted pod %s", target) + } + if pod != "" { + podHits[pod]++ + } + if recoveredAt.IsZero() { + recoveredAt = time.Now() + } + } + time.Sleep(1 * time.Second) + } + rate := float64(success) / float64(total) + if rate < 0.95 { + return fmt.Errorf("success rate %.2f below 0.95", rate) + } + if len(podHits) == 0 { + ep, err := client.CoreV1().Endpoints("default").Get(ctx, "phi4-mini", metav1.GetOptions{}) + if err != nil { + return err + } + for _, s := range ep.Subsets { + for _, a := range s.Addresses { + if a.TargetRef != nil && a.TargetRef.Name == target { + return fmt.Errorf("deleted pod still present in endpoints %s", target) + } + } + } + } + recoverySeconds := time.Since(deleteTime).Seconds() + if !recoveredAt.IsZero() { + recoverySeconds = recoveredAt.Sub(deleteTime).Seconds() + } + if opts.SetDetails != nil { + opts.SetDetails(map[string]interface{}{ + "deleted_pod": target, + "success": success, + "total": total, + "success_rate": rate, + "pod_hits": podHits, + "recovery_seconds": recoverySeconds, + }) + } + return nil +} diff --git a/e2e/testcases/llmd_health_check.go b/e2e/testcases/llmd_health_check.go new file mode 100644 index 000000000..7d3dd7ddf --- /dev/null +++ b/e2e/testcases/llmd_health_check.go @@ -0,0 +1,90 @@ +package testcases + +import ( + "context" + "fmt" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("llmd-health-check", pkgtestcases.TestCase{ + Description: "LLM-D components readiness and basic chat call", + Tags: []string{"llmd", "health"}, + Fn: llmdHealth, + }) +} + +func llmdHealth(ctx context.Context, client *kubernetes.Clientset, opts 
pkgtestcases.TestCaseOptions) error { + ns := "default" + crdGVs := []string{ + "gateway.networking.k8s.io/v1", + "inference.networking.k8s.io/v1", + } + for _, gv := range crdGVs { + if err := ensureGroupVersion(ctx, client, gv); err != nil { + return err + } + } + + required := []struct { + ns string + dep string + }{ + {"vllm-semantic-router-system", "semantic-router"}, + {"istio-system", "istiod"}, + {ns, "llm-d-inference-scheduler-llama3-8b"}, + {ns, "llm-d-inference-scheduler-phi4-mini"}, + {ns, "vllm-llama3-8b-instruct"}, + {ns, "phi4-mini"}, + } + for _, r := range required { + if err := waitDeploymentReady(ctx, client, r.ns, r.dep, 1); err != nil { + return err + } + } + + localPort, stop, err := setupServiceConnection(ctx, client, opts) + if err != nil { + return err + } + defer stop() + + res, err := doLLMDChat(ctx, localPort, "llama3-8b", "ping", 30*time.Second) + if err != nil { + return err + } + + if opts.SetDetails != nil { + opts.SetDetails(map[string]interface{}{ + "deployments": len(required), + "crds": crdGVs, + "latency_ms": res.duration.Milliseconds(), + "x-selected-model": getSelectedModel(res.headers), + "x-inference-pod": getInferencePod(res.headers), + }) + } + return nil +} + +func waitDeploymentReady(ctx context.Context, client *kubernetes.Clientset, ns, name string, want int32) error { + return wait.PollUntilContextTimeout(ctx, 2*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { + dep, err := client.AppsV1().Deployments(ns).Get(ctx, name, v1.GetOptions{}) + if err != nil { + return false, err + } + return dep.Status.ReadyReplicas >= want, nil + }) +} + +func ensureGroupVersion(ctx context.Context, client *kubernetes.Clientset, gv string) error { + _, err := client.Discovery().ServerResourcesForGroupVersion(gv) + if err != nil { + return fmt.Errorf("CRD groupVersion %s not present: %w", gv, err) + } + return nil +} diff --git a/e2e/testcases/llmd_helpers.go b/e2e/testcases/llmd_helpers.go new file mode 100644 index 000000000..e0d5aa5cd --- /dev/null +++ b/e2e/testcases/llmd_helpers.go @@ -0,0 +1,71 @@ +package testcases + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "sort" + "strings" + "time" +) + +type chatResult struct { + headers http.Header + duration time.Duration +} + +// doLLMDChat sends a chat completion request to the forwarded service and returns headers + latency. 
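+// A non-200 status is surfaced as an error with a truncated body snippet, so
+// callers can fail fast without parsing the response payload.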
+func doLLMDChat(ctx context.Context, port, model, content string, timeout time.Duration) (chatResult, error) { + body := map[string]interface{}{ + "model": model, + "messages": []map[string]string{ + {"role": "user", "content": content}, + }, + } + data, _ := json.Marshal(body) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://localhost:%s/v1/chat/completions", port), bytes.NewBuffer(data)) + if err != nil { + return chatResult{}, err + } + req.Header.Set("Content-Type", "application/json") + + client := &http.Client{Timeout: timeout} + start := time.Now() + resp, err := client.Do(req) + duration := time.Since(start) + if err != nil { + return chatResult{}, err + } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + if resp.StatusCode != http.StatusOK { + return chatResult{}, fmt.Errorf("chat failed: %d %s", resp.StatusCode, truncateString(string(b), 120)) + } + return chatResult{headers: resp.Header, duration: duration}, nil +} + +func getInferencePod(headers http.Header) string { + return strings.TrimSpace(headers.Get("x-inference-pod")) +} + +func getSelectedModel(headers http.Header) string { + v := strings.TrimSpace(headers.Get("x-vsr-selected-model")) + if v != "" { + return v + } + return strings.TrimSpace(headers.Get("x-selected-model")) +} + +func percentileDuration(ds []time.Duration, p float64) time.Duration { + if len(ds) == 0 { + return 0 + } + sorted := make([]time.Duration, len(ds)) + copy(sorted, ds) + sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] }) + idx := int(float64(len(sorted)-1) * p) + return sorted[idx] +} diff --git a/e2e/testcases/llmd_performance_baseline.go b/e2e/testcases/llmd_performance_baseline.go new file mode 100644 index 000000000..8edf36899 --- /dev/null +++ b/e2e/testcases/llmd_performance_baseline.go @@ -0,0 +1,109 @@ +package testcases + +import ( + "context" + "fmt" + "sync" + "time" + + pkgtestcases "github.com/vllm-project/semantic-router/e2e/pkg/testcases" + "k8s.io/client-go/kubernetes" +) + +func init() { + pkgtestcases.Register("llmd-performance-baseline", pkgtestcases.TestCase{ + Description: "Measure success rate under moderate concurrency", + Tags: []string{"llmd", "perf"}, + Fn: llmdPerf, + }) +} + +type perfResult struct { + concurrency int + success int + total int + durations []time.Duration +} + +func llmdPerf(ctx context.Context, client *kubernetes.Clientset, opts pkgtestcases.TestCaseOptions) error { + localPort, stop, err := setupServiceConnection(ctx, client, opts) + if err != nil { + return err + } + defer stop() + + stages := []int{15, 30, 60} + results := []perfResult{} + + for _, conc := range stages { + res := runPerfStage(ctx, localPort, conc, 20*time.Second) + if float64(res.success)/float64(res.total) < 0.95 { + return fmt.Errorf("stage %d success %d/%d", conc, res.success, res.total) + } + results = append(results, res) + time.Sleep(2 * time.Second) + } + + if opts.SetDetails != nil { + summary := map[string]interface{}{} + for _, r := range results { + p50, p95 := percentileDuration(r.durations, 0.5), percentileDuration(r.durations, 0.95) + key := fmt.Sprintf("c%d", r.concurrency) + summary[key] = map[string]interface{}{ + "success": r.success, + "total": r.total, + "success_rate": float64(r.success) / float64(r.total), + "p50_ms": p50.Milliseconds(), + "p95_ms": p95.Milliseconds(), + } + } + opts.SetDetails(summary) + } + return nil +} + +func runPerfStage(ctx context.Context, port string, conc int, duration time.Duration) perfResult { + res := 
perfResult{concurrency: conc} + stageCtx, cancel := context.WithTimeout(ctx, duration) + defer cancel() + + var wg sync.WaitGroup + var mu sync.Mutex + + for i := 0; i < conc; i++ { + i := i + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case <-stageCtx.Done(): + return + default: + } + resItem, err := doLLMDChat(stageCtx, port, "auto", fmt.Sprintf("perf-%d-%d", conc, i), 60*time.Second) + if err != nil && stageCtx.Err() != nil { + continue + } + // On transient failure, retry once within stage window + if err != nil { + resItem2, err2 := doLLMDChat(stageCtx, port, "auto", fmt.Sprintf("perf-%d-%d", conc, i), 60*time.Second) + if err2 == nil { + err = nil + resItem = resItem2 + } + } + mu.Lock() + res.total++ + if err == nil { + res.success++ + res.durations = append(res.durations, resItem.duration) + } + mu.Unlock() + } + }() + } + + wg.Wait() + return res +} diff --git a/tools/make/e2e.mk b/tools/make/e2e.mk index cd8981b89..e417053ce 100644 --- a/tools/make/e2e.mk +++ b/tools/make/e2e.mk @@ -96,6 +96,7 @@ e2e-help: ## Show help for E2E testing @echo "Available Profiles:" @echo " ai-gateway - Test Semantic Router with Envoy AI Gateway" @echo " aibrix - Test Semantic Router with vLLM AIBrix" + @echo " llm-d - Test Semantic Router with LLM-D" @echo " istio - Test Semantic Router with Istio (coming soon)" @echo "" @echo "Environment Variables:" @@ -127,4 +128,3 @@ e2e-help: ## Show help for E2E testing @echo " 2. make e2e-test-only # Run all tests" @echo " 3. make e2e-test-only E2E_TESTS=\"test1\" # Run specific test" @echo " 4. make e2e-cleanup # Clean up when done" -
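
Note: for a quick manual check of the routing path these manifests wire up, outside the Go test harness, the following standalone sketch exercises the port-forwarded gateway the same way doLLMDChat does. It assumes the profile's 8080:80 port-forward to inference-gateway-istio is active and uses the header names from the manifests and helpers above; it is illustrative only, not part of the patch.

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// "auto" lets Semantic Router pick the model; per values.yaml, a math
	// prompt should be routed to phi4-mini.
	body := []byte(`{"model":"auto","messages":[{"role":"user","content":"What is 2+2?"}]}`)
	req, err := http.NewRequest(http.MethodPost, "http://localhost:8080/v1/chat/completions", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// x-selected-model drives the header-based HTTPRoute match; the llm-d
	// inference sim reports the serving pod in x-inference-pod.
	fmt.Println("status:        ", resp.Status)
	fmt.Println("selected model:", resp.Header.Get("x-selected-model"))
	fmt.Println("inference pod: ", resp.Header.Get("x-inference-pod"))
}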