Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/integration-test-k8s.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ jobs:
strategy:
fail-fast: false # Continue testing other profiles even if one fails
matrix:
# Note: dynamo profile requires GPU, run manually with: make e2e-test-dynamo
profile: [ai-gateway, aibrix, routing-strategies, llm-d, istio, production-stack]

steps:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ categories:
mmlu_categories: ["computer science", "engineering"]
- name: finance
mmlu_categories: ["economics"]
- name: politics
# If omitted, identity mapping applies when this name matches MMLU
- name: politics # If omitted, identity mapping applies when this name matches MMLU

# Decisions define routing logic by combining rules and model selection
decisions:
Expand Down
660 changes: 660 additions & 0 deletions deploy/kubernetes/dynamo/dynamo-resources/README.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
---
# Disaggregated vLLM Deployment for Dynamo
# GPU-enabled configuration for a Kind cluster with NVIDIA support.
#
# Architecture (prefill/decode disaggregation):
#   Frontend:          HTTP API server (pinned to GPU 0 via CUDA_VISIBLE_DEVICES)
#   VLLMPrefillWorker: prefill-only worker (GPU 1)
#   VLLMDecodeWorker:  decode-only worker  (GPU 2)
#
# GPU allocation (4 GPUs total):
#   GPU 0: Frontend
#   GPU 1: Prefill worker
#   GPU 2: Decode worker
#   GPU 3: (spare)
#
# NOTE(review): the Frontend requests a full GPU even though it is only an
# HTTP server — confirm this is intentional (it pins CUDA_VISIBLE_DEVICES=0).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm
  namespace: dynamo-system
spec:
  backendFramework: vllm
  # Environment variables applied to every service in the graph.
  envs:
    - name: DYN_LOG
      value: "info"
  services:
    # Frontend - HTTP API server (OpenAI-compatible endpoint on port 8000).
    Frontend:
      dynamoNamespace: dynamo-vllm
      componentType: frontend
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          # "sleep 15" gives etcd/NATS time to come up before the frontend
          # starts; LD_LIBRARY_PATH picks up host driver libs mounted below.
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=0 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.frontend --http-port 8000"
          # NOTE(review): privileged mode is presumably required for GPU
          # passthrough on Kind (/dev hostPath) — confirm it can't be narrowed.
          securityContext:
            privileged: true
          # Probes target the HTTP port (8000), unlike the workers which
          # probe the DYN_SYSTEM_PORT (9090).
          livenessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 60
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 8000
            initialDelaySeconds: 20
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            # NOTE(review): NATS_URL and NATS_SERVER point at the same NATS
            # service under two names — confirm both are still consumed.
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        # Host driver libraries and device nodes exposed to the container
        # (Kind node paths).
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev

    # VLLMPrefillWorker - specialized prefill-only worker (GPU 1).
    VLLMPrefillWorker:
      dynamoNamespace: dynamo-vllm
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          # --is-prefill-worker restricts this worker to the prefill phase;
          # --connector null disables the KV-transfer connector.
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=1 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.vllm --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 1 --enforce-eager --is-prefill-worker --connector null"
          securityContext:
            privileged: true
          # Probes use the system port (9090); longer initial delays allow
          # for model download and vLLM engine startup.
          livenessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 180
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 120
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev

    # VLLMDecodeWorker - specialized decode-only worker (GPU 2).
    # Identical to the prefill worker except for CUDA_VISIBLE_DEVICES and
    # the absence of --is-prefill-worker.
    VLLMDecodeWorker:
      dynamoNamespace: dynamo-vllm
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "4Gi"
          gpu: "1"
        limits:
          cpu: "2"
          memory: "8Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          command:
            - /bin/sh
            - -c
          args:
            - "sleep 15 && export CUDA_VISIBLE_DEVICES=2 && export LD_LIBRARY_PATH=/nvidia-driver-libs:/usr/local/cuda/lib64:$LD_LIBRARY_PATH && python3 -m dynamo.vllm --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 1 --enforce-eager --connector null"
          securityContext:
            privileged: true
          livenessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 180
            periodSeconds: 30
            failureThreshold: 5
          readinessProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 120
            periodSeconds: 10
            failureThreshold: 10
          startupProbe:
            tcpSocket:
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
            failureThreshold: 30
          env:
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.dynamo-system.svc.cluster.local:2379"
            - name: NATS_URL
              value: "nats://dynamo-platform-nats.dynamo-system.svc.cluster.local:4222"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.dynamo-system:4222"
            - name: DYN_SYSTEM_ENABLED
              value: "true"
            - name: DYN_SYSTEM_PORT
              value: "9090"
            - name: LD_LIBRARY_PATH
              value: "/nvidia-driver-libs:/usr/local/cuda/lib64"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
          volumeMounts:
            - name: nvidia-driver-libs
              mountPath: /nvidia-driver-libs
              readOnly: true
            - name: dev
              mountPath: /dev
        volumes:
          - name: nvidia-driver-libs
            hostPath:
              path: /nvidia-driver-libs
          - name: dev
            hostPath:
              path: /dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# Envoy Gateway Helm values for Dynamo E2E testing.
# Enables the ExtensionAPIs feature gate (EnvoyPatchPolicy) so the
# Semantic Router can patch the generated Envoy configuration.
config:
  envoyGateway:
    extensionApis:
      enableEnvoyPatchPolicy: true

Loading