Skip to content

Commit 04675ff

Browse files
feat(e2e): Add Dynamo E2E test profile with GPU support
feat(e2e): Add Dynamo E2E test profile with GPU support

- Add Dynamo profile for GPU-enabled disaggregated vLLM deployment
- Add GPU setup integration in Kind cluster (nvidia runtime, library copy, device plugin)
- Add DynamoGraphDeployment with Frontend + Prefill + Decode workers
- Add Dynamo test cases (health check, GPU utilization, performance)
- Fix relative paths in runner.go and profile.go
- Re-enable teardown after tests
- Remove Dynamo from CI matrix (requires GPU, run manually)
- Update README with GPU requirements (3 GPUs minimum)
- Remove unused files (namespace.yaml, kustomization.yaml, nvkind-gpu-setup-rhel.md)

Requires: VM with 3+ NVIDIA GPUs
Run: make e2e-test-dynamo

Signed-off-by: abdallahsamabd <[email protected]>
1 parent 5b412a8 commit 04675ff

File tree

25 files changed

+3291
-53
lines changed

25 files changed

+3291
-53
lines changed

.github/workflows/integration-test-k8s.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ jobs:
2828
strategy:
2929
fail-fast: false # Continue testing other profiles even if one fails
3030
matrix:
31+
# Note: dynamo profile requires GPU, run manually with: make e2e-test-dynamo
3132
profile: [ai-gateway, aibrix, routing-strategies, llm-d, istio, production-stack]
3233

3334
steps:

config/intelligent-routing/in-tree/generic_categories.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@ categories:
2323
mmlu_categories: ["computer science", "engineering"]
2424
- name: finance
2525
mmlu_categories: ["economics"]
26-
- name: politics
27-
# If omitted, identity mapping applies when this name matches MMLU
26+
- name: politics # If omitted, identity mapping applies when this name matches MMLU
2827

2928
# Decisions define routing logic by combining rules and model selection
3029
decisions:

deploy/kubernetes/dynamo/dynamo-resources/README.md

Lines changed: 660 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
---
# Disaggregated vLLM Deployment for Dynamo
# GPU-enabled configuration for Kind cluster with NVIDIA support
#
# Architecture:
#   Frontend:          HTTP API server (GPU 0)
#   VLLMPrefillWorker: Specialized prefill-only worker (GPU 1)
#   VLLMDecodeWorker:  Specialized decode-only worker (GPU 2)
#
# GPU Allocation (4 GPUs total):
#   GPU 0: Frontend
#   GPU 1: Prefill Worker
#   GPU 2: Decode Worker
#   GPU 3: (spare)
# NOTE(review): the commit states "3+ GPUs" as the minimum; the diagram above
# assumes a 4-GPU host with GPU 3 unused — confirm which is authoritative.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm
  namespace: dynamo-system
spec:
  backendFramework: vllm
  envs:
    - name: DYN_LOG
      value: "info"
  services:
    # Frontend - HTTP API server
    # NOTE(review): the frontend is an HTTP server yet requests a full GPU;
    # verify whether this reservation is intentional (it consumes one of the
    # three required GPUs).
    Frontend:
      dynamoNamespace: dynamo-vllm
      componentType: frontend
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          # sleep gives etcd/NATS time to come up; LD_LIBRARY_PATH picks up
          # the host driver libraries copied into the Kind node.
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=0 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.frontend --http-port 8000"
          # privileged is required for direct /dev device access inside Kind.
          securityContext:
            privileged: true
          livenessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 60
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 20
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        # Host driver libraries and /dev are exposed so CUDA works inside the
        # Kind node without the full NVIDIA container toolkit.
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev

    # VLLMPrefillWorker - Specialized prefill-only worker (GPU 1)
    VLLMPrefillWorker:
      dynamoNamespace: dynamo-vllm
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=1 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.vllm --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 1 --enforce-eager --is-prefill-worker --connector null"
          securityContext:
            privileged: true
          # Probes target port 9090 (DYN_SYSTEM_PORT); worker startup includes
          # model download/load, hence the long liveness initial delay.
          livenessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 180
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 120
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev

    # VLLMDecodeWorker - Specialized decode-only worker (GPU 2)
    VLLMDecodeWorker:
      dynamoNamespace: dynamo-vllm
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          # Same launch as the prefill worker minus --is-prefill-worker, and
          # pinned to GPU 2 via CUDA_VISIBLE_DEVICES.
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=2 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.vllm --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 1 --enforce-eager --connector null"
          securityContext:
            privileged: true
          livenessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 180
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 120
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
---
# Envoy Gateway values for Dynamo E2E Testing
# Enables ExtensionAPIs (EnvoyPatchPolicy) for Semantic Router integration

config:
  envoyGateway:
    extensionApis:
      enableEnvoyPatchPolicy: true
8+

0 commit comments

Comments
 (0)