Describe the bug
I am following the documentation below for Disaggregated Prefill. The router deployment fails and its pod ends up in CrashLoopBackOff. The detailed error is as follows:
➜ kubectl logs pd-deployment-router-665b996f8c-mfd6w
usage: vllm-router [-h] [--host HOST] [--port PORT]
[--service-discovery {static,k8s}]
[--static-backends STATIC_BACKENDS]
[--static-models STATIC_MODELS]
[--static-aliases STATIC_ALIASES]
[--static-model-types STATIC_MODEL_TYPES]
[--static-model-labels STATIC_MODEL_LABELS]
[--k8s-port K8S_PORT] [--k8s-namespace K8S_NAMESPACE]
[--k8s-label-selector K8S_LABEL_SELECTOR]
[--routing-logic {roundrobin,session,kvaware,prefixaware,disaggregated_prefill}]
[--lmcache-controller-port LMCACHE_CONTROLLER_PORT]
[--session-key SESSION_KEY] [--callbacks CALLBACKS]
[--request-rewriter {noop}] [--enable-batch-api]
[--file-storage-class {local_file}]
[--file-storage-path FILE_STORAGE_PATH]
[--batch-processor {local}]
[--engine-stats-interval ENGINE_STATS_INTERVAL]
[--request-stats-window REQUEST_STATS_WINDOW] [--log-stats]
[--log-stats-interval LOG_STATS_INTERVAL]
[--dynamic-config-json DYNAMIC_CONFIG_JSON] [--version]
[--feature-gates FEATURE_GATES]
[--log-level {critical,error,warning,info,debug,trace}]
[--sentry-dsn SENTRY_DSN]
[--prefill-model-labels PREFILL_MODEL_LABELS]
[--decode-model-labels DECODE_MODEL_LABELS]
vllm-router: error: unrecognized arguments: --k8s-service-discovery-type pod-ip

Docs
- Website: https://docs.vllm.ai/projects/production-stack/en/latest/use_cases/disaggregated-prefill.html
- In the repo: https://github.com/vllm-project/production-stack/blob/main/docs/source/use_cases/disaggregated-prefill.rst
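For what it's worth, the usage output above does not list --k8s-service-discovery-type at all. A quick way to double-check which flags the exact router image supports (a sketch, assuming the image entrypoint is vllm-router, as the Deployment args below suggest):

# Dump the router's argument parser help for the image tag the chart uses.
docker run --rm lmcache/lmstack-router:pd-05-26 --help

# Or run the same check inside the cluster against the already-pulled image:
kubectl run router-help --rm -it --restart=Never \
  --image=lmcache/lmstack-router:pd-05-26 -- --help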
To Reproduce
Deployed the Helm chart from commit 528f6d8.
Followed the instructions here: https://docs.vllm.ai/projects/production-stack/en/latest/use_cases/disaggregated-prefill.html
cat << EOF | tee /tmp/values-16-disagg-prefill.yaml
# Unified configuration for disaggregated prefill setup
servingEngineSpec:
  enableEngine: true
  runtimeClassName: ""
  containerPort: 8000
  modelSpec:
    # Prefill node configuration
    - name: "llama-prefill"
      repository: "lmcache/vllm-openai"
      tag: "2025-05-17-v1"
      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
      replicaCount: 1
      requestCPU: 8
      requestMemory: "30Gi"
      requestGPU: 1
      vllmConfig:
        enableChunkedPrefill: false
        enablePrefixCaching: false
        maxModelLen: 32000
        v1: 1
      lmcacheConfig:
        enabled: true
        kvRole: "kv_producer"
        enableNixl: true
        nixlRole: "sender"
        nixlPeerHost: "pd-llama-decode-engine-service"
        nixlPeerPort: "55555"
        nixlBufferSize: "1073741824"  # 1GB
        nixlBufferDevice: "cuda"
        nixlEnableGc: true
        enablePD: true
      hf_token: "${HF_TOKEN}"
      labels:
        model: "llama-prefill"
    # Decode node configuration
    - name: "llama-decode"
      repository: "lmcache/vllm-openai"
      tag: "2025-05-17-v1"
      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
      replicaCount: 1
      requestCPU: 8
      requestMemory: "30Gi"
      requestGPU: 1
      vllmConfig:
        enableChunkedPrefill: false
        enablePrefixCaching: false
        maxModelLen: 32000
        v1: 1
      lmcacheConfig:
        enabled: true
        kvRole: "kv_consumer"
        enableNixl: true
        nixlRole: "receiver"
        nixlPeerHost: "0.0.0.0"
        nixlPeerPort: "55555"
        nixlBufferSize: "1073741824"  # 1GB
        nixlBufferDevice: "cuda"
        nixlEnableGc: true
        enablePD: true
      hf_token: "${HF_TOKEN}"
      labels:
        model: "llama-decode"

routerSpec:
  enableRouter: true
  repository: "lmcache/lmstack-router"
  tag: "pd-05-26"
  replicaCount: 1
  containerPort: 8000
  servicePort: 80
  routingLogic: "disaggregated_prefill"
  engineScrapeInterval: 15
  requestStatsWindow: 60
  enablePD: true
  resources:
    requests:
      cpu: "4"
      memory: "16G"
    limits:
      cpu: "4"
      memory: "32G"
  labels:
    environment: "router"
    release: "router"
  extraArgs:
    - "--prefill-model-labels"
    - "llama-prefill"
    - "--decode-model-labels"
    - "llama-decode"
EOF

git checkout 528f6d8c8daff03e448449e8bc0efd3db3d1a899
helm upgrade -i pd helm/ -f /tmp/values-16-disagg-prefill.yaml
kubectl get pods

Expected behavior
I expected the pod to be up and running.
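For reference, the probes in the Deployment below expect GET /health on port 8000 to succeed, so once the pod is healthy this is a quick manual check (sketch, run from a machine with kubectl access):

# Forward the router's container port and hit the same /health endpoint the probes use.
kubectl port-forward deployment/pd-deployment-router 8000:8000 &
curl -s http://localhost:8000/health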
Additional context
Other Info
Here is the deployment config for the router pod:
apiVersion: apps/v1
kind: Deployment
metadata:
  annotations:
    deployment.kubernetes.io/revision: "1"
    meta.helm.sh/release-name: pd
    meta.helm.sh/release-namespace: default
  creationTimestamp: "2025-10-31T19:17:51Z"
  generation: 1
  labels:
    app.kubernetes.io/managed-by: Helm
    environment: router
    release: router
  name: pd-deployment-router
  namespace: default
  resourceVersion: "24113316"
  uid: dda14d90-40e4-4221-8eef-76a9129f68be
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      environment: router
      release: router
  strategy:
    rollingUpdate:
      maxSurge: 100%
      maxUnavailable: 0
    type: RollingUpdate
  template:
    metadata:
      creationTimestamp: null
      labels:
        environment: router
        release: router
    spec:
      containers:
      - args:
        - --host
        - 0.0.0.0
        - --port
        - "8000"
        - --service-discovery
        - k8s
        - --k8s-namespace
        - default
        - --k8s-service-discovery-type
        - pod-ip
        - --k8s-label-selector
        - environment=test,release=test
        - --routing-logic
        - disaggregated_prefill
        - --engine-stats-interval
        - "15"
        - --request-stats-window
        - "60"
        - --prefill-model-labels
        - llama-prefill
        - --decode-model-labels
        - llama-decode
        env:
        - name: LMCACHE_LOG_LEVEL
          value: DEBUG
        image: lmcache/lmstack-router:pd-05-26
        imagePullPolicy: Always
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 30
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        name: router-container
        ports:
        - containerPort: 8000
          name: router-cport
          protocol: TCP
        - containerPort: 9000
          name: lmcache-port
          protocol: TCP
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 30
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          limits:
            cpu: "4"
            memory: 32G
          requests:
            cpu: "4"
            memory: 16G
        startupProbe:
          failureThreshold: 3
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: pd-router-service-account
      serviceAccountName: pd-router-service-account
      terminationGracePeriodSeconds: 30
status:
  conditions:
  - lastTransitionTime: "2025-10-31T19:17:51Z"
    lastUpdateTime: "2025-10-31T19:17:51Z"
    message: Deployment does not have minimum availability.
    reason: MinimumReplicasUnavailable
    status: "False"
    type: Available
  - lastTransitionTime: "2025-10-31T19:17:51Z"
    lastUpdateTime: "2025-10-31T19:17:51Z"
    message: ReplicaSet "pd-deployment-router-665b996f8c" is progressing.
    reason: ReplicaSetUpdated
    status: "True"
    type: Progressing
  observedGeneration: 1
  replicas: 1
  unavailableReplicas: 1
  updatedReplicas: 1
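The unrecognized flag is not in my extraArgs, so it appears to be injected by the chart's router deployment template at this commit. A sketch of how one might confirm where it comes from and temporarily unblock the pod (the JSON-patch indices assume the arg order shown above, with --k8s-service-discovery-type at position 8 and pod-ip at position 9; any patch is overwritten the next time helm re-renders the Deployment):

# Confirm the flag is emitted by the chart template rather than the values file.
helm template pd helm/ -f /tmp/values-16-disagg-prefill.yaml \
  | grep -n -A1 -- '--k8s-service-discovery-type'

# Temporary check only: drop the two offending args from the live Deployment
# so the router can start (remove the value first, then the flag).
kubectl patch deployment pd-deployment-router --type=json -p='[
  {"op": "remove", "path": "/spec/template/spec/containers/0/args/9"},
  {"op": "remove", "path": "/spec/template/spec/containers/0/args/8"}
]'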