From 3c4914202a34e32d7f9a285b3414aa83ee9e6906 Mon Sep 17 00:00:00 2001
From: googs1025
Date: Sun, 9 Nov 2025 17:09:09 +0800
Subject: [PATCH] docs: add P/D disaggregation example in manifests/disaggregation

Signed-off-by: googs1025
---
 README.md                                 |  3 +
 manifests/disaggregation/README.md        | 63 ++++++++++++++++
 manifests/disaggregation/vllm-sim-pd.yaml | 91 +++++++++++++++++++++++
 3 files changed, 157 insertions(+)
 create mode 100644 manifests/disaggregation/README.md
 create mode 100644 manifests/disaggregation/vllm-sim-pd.yaml

diff --git a/README.md b/README.md
index b162c7b..8e910aa 100644
--- a/README.md
+++ b/README.md
@@ -362,3 +362,6 @@ curl -X POST http://localhost:8000/v1/chat/completions \
   ]
 }'
 ```
+
+### Prefill/Decode (P/D) Disaggregation Example
+An example configuration for a disaggregated Prefill/Decode deployment can be found in [manifests/disaggregation](manifests/disaggregation).
diff --git a/manifests/disaggregation/README.md b/manifests/disaggregation/README.md
new file mode 100644
index 0000000..595de70
--- /dev/null
+++ b/manifests/disaggregation/README.md
@@ -0,0 +1,63 @@
+## Prefill/Decode Disaggregation Deployment Guide
+
+This guide demonstrates how to deploy the llm-d inference simulator (`llm-d-inference-sim`) in a Kubernetes cluster using a separated Prefill and Decode (P/D) architecture.
+The [`routing-sidecar`](https://github.com/llm-d/llm-d-routing-sidecar) routes each client request to dedicated Prefill and Decode simulator services, enabling validation of disaggregated inference workflows.
+
+### Quick Start
+
+1. Deploy the Application
+   Apply the provided manifest (e.g., `vllm-sim-pd.yaml`) to your Kubernetes cluster:
+
+```bash
+kubectl apply -f vllm-sim-pd.yaml
+```
+
+> This manifest defines two Deployments (vllm-sim-p for Prefill, vllm-sim-d for Decode) and two ClusterIP Services (`vllm-sim-p-service`, `vllm-sim-d-service`) that expose them inside the cluster.
+
+2. Verify Pods Are Ready
+   Check that all pods are running:
+
+```bash
+kubectl get pods -l 'llm-d.ai/role in (prefill,decode)'
+```
+
+Expected output:
+
+```bash
+NAME                          READY   STATUS    RESTARTS   AGE
+vllm-sim-d-685b57d694-d6qxg   2/2     Running   0          12m
+vllm-sim-p-7b768565d9-79j97   1/1     Running   0          12m
+```
+
+### Send a Disaggregated Request Using kubectl port-forward
+To access the Decode service from your local machine, use `kubectl port-forward` to forward its port to localhost.
+
+#### Forward the Decode Service Port
+Open a terminal and run:
+
+```bash
+kubectl port-forward svc/vllm-sim-d-service 8000:8000
+```
+
+This command forwards your local port 8000 to port 8000 of `vllm-sim-d-service` (the routing sidecar).
+
+#### Test the Disaggregated Flow
+
+Now, send a request to the forwarded Decode service port with the necessary headers:
+
+```bash
+curl -v http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "x-prefiller-host-port: vllm-sim-p-service:8000" \
+  -d '{
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "messages": [{"role": "user", "content": "Hello from P/D architecture!"}],
+    "max_tokens": 32
+  }'
+```
+
+> Critical Header:
+>```
+>x-prefiller-host-port: vllm-sim-p-service:8000
+>```
+>This header tells the routing sidecar where to send the prefill portion of the request. The Prefill Deployment is exposed in-cluster through `vllm-sim-p-service` on port 8000, so that is the value used here.
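+
+As a quick sanity check that the request actually traversed the Prefill stage, you can inspect the logs of the `routing-sidecar` container in the Decode Deployment (the exact log lines depend on the sidecar version and verbosity):
+
+```bash
+# Tail the routing-sidecar container of the decode Deployment defined in vllm-sim-pd.yaml
+kubectl logs deployment/vllm-sim-d -c routing-sidecar
+```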
\ No newline at end of file
diff --git a/manifests/disaggregation/vllm-sim-pd.yaml b/manifests/disaggregation/vllm-sim-pd.yaml
new file mode 100644
index 0000000..069d995
--- /dev/null
+++ b/manifests/disaggregation/vllm-sim-pd.yaml
@@ -0,0 +1,91 @@
+---
+# Prefill Deployment
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-sim-p
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      llm-d.ai/role: prefill
+  template:
+    metadata:
+      labels:
+        llm-d.ai/role: prefill
+    spec:
+      containers:
+      - name: vllm-prefill
+        image: ghcr.io/llm-d/llm-d-inference-sim:latest
+        imagePullPolicy: IfNotPresent
+        args:
+        - "--v=4"
+        - "--port=8000"
+        - "--model=meta-llama/Llama-3.1-8B-Instruct"
+        - "--data-parallel-size=1"
+        ports:
+        - containerPort: 8000
+---
+# Decode Deployment (with routing-sidecar + vLLM simulator)
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-sim-d
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      llm-d.ai/role: decode
+  template:
+    metadata:
+      labels:
+        llm-d.ai/role: decode
+    spec:
+      containers:
+      - name: routing-sidecar
+        image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.1-rc.1
+        imagePullPolicy: IfNotPresent
+        args:
+        - "--v=4"
+        - "--port=8000"
+        - "--vllm-port=8200"
+        - "--connector=nixlv2"
+        - "--secure-proxy=false"
+        ports:
+        - containerPort: 8000
+      - name: vllm-decode
+        image: ghcr.io/llm-d/llm-d-inference-sim:latest
+        imagePullPolicy: IfNotPresent
+        args:
+        - "--v=4"
+        - "--port=8200"
+        - "--model=meta-llama/Llama-3.1-8B-Instruct"
+        - "--data-parallel-size=1"
+        ports:
+        - containerPort: 8200
+---
+# Prefill Service (target of the x-prefiller-host-port header)
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-sim-p-service
+spec:
+  selector:
+    llm-d.ai/role: prefill
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
+---
+# Decode Service (traffic enters through the routing sidecar on port 8000)
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-sim-d-service
+spec:
+  selector:
+    llm-d.ai/role: decode
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
\ No newline at end of file