Skip to content

Commit 56977b0

Browse files
feat(e2e): Add Dynamo E2E test profile with GPU support
- Add Dynamo profile for GPU-enabled disaggregated vLLM deployment
- Add GPU setup integration in Kind cluster (nvidia runtime, library copy, device plugin)
- Add DynamoGraphDeployment with Frontend + Prefill + Decode workers
- Add Dynamo test cases (health check, GPU utilization, performance)
- Fix relative paths in runner.go and profile.go
- Re-enable teardown after tests
- Remove Dynamo from CI matrix (requires GPU, run manually)
- Update README with GPU requirements (3 GPUs minimum)
- Remove unused files (namespace.yaml, kustomization.yaml, nvkind-gpu-setup-rhel.md)

Requires: VM with 3+ NVIDIA GPUs
Run: make e2e-test-dynamo

Signed-off-by: abdallahsamabd <[email protected]>
1 parent 4e2ee29 commit 56977b0

File tree

19 files changed

+3232
-10
lines changed

19 files changed

+3232
-10
lines changed

.github/workflows/integration-test-k8s.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,11 @@ on:
1212
jobs:
1313
integration-test:
1414
runs-on: ubuntu-latest
15-
timeout-minutes: 60
15+
timeout-minutes: 75
16+
strategy:
17+
fail-fast: false # Continue testing other profiles even if one fails
18+
matrix:
19+
profile: [ai-gateway]
1620

1721
steps:
1822
- name: Check out the repo
@@ -61,11 +65,11 @@ jobs:
6165
run: |
6266
make build-e2e
6367
64-
- name: Run Integration E2E tests
68+
- name: Run Integration E2E tests (${{ matrix.profile }})
6569
id: e2e-test
6670
run: |
6771
set +e # Don't exit on error, we want to capture the result
68-
make e2e-test E2E_PROFILE=ai-gateway E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
72+
make e2e-test E2E_PROFILE=${{ matrix.profile }} E2E_VERBOSE=true E2E_KEEP_CLUSTER=false
6973
TEST_EXIT_CODE=$?
7074
echo "test_exit_code=${TEST_EXIT_CODE}" >> $GITHUB_OUTPUT
7175
exit ${TEST_EXIT_CODE}
@@ -74,7 +78,7 @@ jobs:
7478
if: always()
7579
uses: actions/upload-artifact@v4
7680
with:
77-
name: test-reports
81+
name: test-reports-${{ matrix.profile }}
7882
path: |
7983
test-report.json
8084
test-report.md

deploy/kubernetes/dynamo/dynamo-resources/README.md

Lines changed: 628 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
---
# Disaggregated vLLM deployment for Dynamo on a GPU-enabled Kind cluster
# with NVIDIA support.
#
# Topology (one GPU requested per service, 3 GPUs required in total):
#   - Frontend:          HTTP API server          (CUDA_VISIBLE_DEVICES=0)
#   - VLLMPrefillWorker: prefill-only vLLM worker (CUDA_VISIBLE_DEVICES=1)
#   - VLLMDecodeWorker:  decode-only vLLM worker  (CUDA_VISIBLE_DEVICES=2)
# Any additional GPU on the host is left unused.
#
# Each container mounts the host's NVIDIA driver libraries from
# /nvidia-driver-libs and /dev, and runs privileged so the Kind node can
# expose the GPUs — acceptable for this E2E test setup only.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm
  namespace: dynamo-system
spec:
  backendFramework: vllm
  envs:
    - name: DYN_LOG
      value: "info"
  services:
    # HTTP API server; probes target its public port 8000.
    Frontend:
      dynamoNamespace: dynamo-vllm
      componentType: frontend
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          # sleep gives etcd/NATS time to come up before the frontend starts.
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=0 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.frontend --http-port 8000"
          securityContext:
            privileged: true
          livenessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 60
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 20
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev

    # Prefill-only vLLM worker; probes target the Dynamo system port 9090.
    # Longer probe delays than the frontend: model download/load takes time.
    VLLMPrefillWorker:
      dynamoNamespace: dynamo-vllm
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=1 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.vllm --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 1 --enforce-eager --is-prefill-worker --connector null"
          securityContext:
            privileged: true
          livenessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 180
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 120
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev

    # Decode-only vLLM worker; identical to the prefill worker except for
    # the GPU index and the absence of --is-prefill-worker.
    VLLMDecodeWorker:
      dynamoNamespace: dynamo-vllm
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=2 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.vllm --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 1 --enforce-eager --connector null"
          securityContext:
            privileged: true
          livenessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 180
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 120
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
---
# Envoy Gateway Helm values for the Dynamo E2E test profile.
# Turns on the ExtensionAPIs feature so EnvoyPatchPolicy resources are
# accepted — required for the Semantic Router integration.
config:
  envoyGateway:
    extensionApis:
      enableEnvoyPatchPolicy: true

0 commit comments

Comments
 (0)