89 changes: 48 additions & 41 deletions .github/values-10-disagg-prefill.yaml
@@ -1,88 +1,90 @@
# Unified configuration for disaggregated prefill setup
servingEngineSpec:
strategy:
type: Recreate
enableEngine: true
runtimeClassName: ""
runtimeClassName: "nvidia"
containerPort: 8000
modelSpec:
# Prefill node configuration
- name: "opt125m-prefill"
- name: "llama-prefill"
repository: "lmcache/vllm-openai"
tag: "2025-05-27-v1"
modelURL: "facebook/opt-125m"
tag: "nightly-2025-09-04"
modelURL: "meta-llama/Llama-3.1-8B-Instruct"
replicaCount: 1
requestCPU: 8
requestMemory: "30Gi"
# requestGPU: 1
pvcStorage: "50Gi"
vllmConfig:
enablePrefixCaching: true
maxModelLen: 1024
v1: 1
gpuMemoryUtilization: 0.6
enablePrefixCaching: false
# maxModelLen: 2048
extraArgs:
- "--enforce-eager"
- "--disable-log-requests"
lmcacheConfig:
cudaVisibleDevices: "0"
enabled: true
kvRole: "kv_producer"
localCpu: true
maxLocalCpuSize: 5
maxLocalDiskSize: 0
enableNixl: true
enableXpyd: true
nixlRole: "sender"
nixlPeerHost: "vllm-opt125m-decode-engine-service"
nixlPeerPort: "55555"
nixlBufferSize: "1073741824" # 1GB
nixlProxyHost: "vllm-router-service"
nixlProxyPort: 7500
nixlBufferSize: "1073741824"
nixlBufferDevice: "cuda"
nixlEnableGc: true
enablePD: true
cpuOffloadingBufferSize: 0
rpcPort: "producer1"
labels:
model: "opt125m-prefill"
chatTemplate: "chat.jinja2"
chatTemplateConfigMap: |-
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
model: "llama-prefill"
hf_token: <hf-token>
# Decode node configuration
- name: "opt125m-decode"
- name: "llama-decode"
repository: "lmcache/vllm-openai"
tag: "2025-05-27-v1"
modelURL: "facebook/opt-125m"
tag: "nightly-2025-09-04"
modelURL: "meta-llama/Llama-3.1-8B-Instruct"
replicaCount: 1
requestCPU: 8
requestMemory: "30Gi"
# requestGPU: 1
pvcStorage: "50Gi"
vllmConfig:
enablePrefixCaching: true
maxModelLen: 1024
v1: 1
enablePrefixCaching: false
# maxModelLen: 2048
extraArgs:
- "--enforce-eager"
- "--disable-log-requests"
lmcacheConfig:
cudaVisibleDevices: "1"
enabled: true
kvRole: "kv_consumer" # Set decode node as consumer
localCpu: false
maxLocalCpuSize: 0
enableNixl: true
enableXpyd: true
nixlRole: "receiver"
nixlPeerHost: "0.0.0.0"
nixlPeerPort: "55555"
nixlBufferSize: "1073741824" # 1GB
nixlPeerInitPort: 7300
nixlPeerAllocPort: 7400
nixlBufferSize: "2147483648"
nixlBufferDevice: "cuda"
nixlEnableGc: true
# nixlBackends: ["UCX"]
enablePD: true
rpcPort: "consumer1"
skipLastNTokens: 1
hf_token: <hf-token>
labels:
model: "opt125m-decode"
chatTemplate: "chat.jinja2"
chatTemplateConfigMap: |-
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
model: "llama-decode"
containerSecurityContext:
capabilities:
add:
- SYS_PTRACE

routerSpec:
enableRouter: true
repository: "git-act-router"
imagePullPolicy: "IfNotPresent"
strategy:
type: Recreate
repository: "xiaokunchen/vllm-router"
tag: "09-05-v1"
imagePullPolicy: "Always"
replicaCount: 1
containerPort: 8000
servicePort: 80
@@ -102,6 +104,11 @@ routerSpec:
release: "router"
extraArgs:
- "--prefill-model-labels"
- "opt125m-prefill"
- "llama-prefill"
- "--decode-model-labels"
- "opt125m-decode"
- "llama-decode"
nixlPeerHost: "vllm-llama-decode-engine-service"
nixlPeerInitPort: 7300
nixlPeerAllocPort: 7400
nixlProxyHost: "0.0.0.0"
nixlProxyPort: 7500
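
As a quick usage sketch (release name and chart path are assumptions, not part of this diff), the updated values file would be applied with:

    helm upgrade --install vllm ./helm -f .github/values-10-disagg-prefill.yaml

Note that <hf-token> must be replaced with a real Hugging Face token, since meta-llama/Llama-3.1-8B-Instruct is a gated model.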
2 changes: 1 addition & 1 deletion .github/workflows/functionality-helm-chart.yml
@@ -58,7 +58,7 @@ jobs:
run: |
cd ${{ github.workspace }}
kubectl config use-context minikube
sudo docker build --build-arg INSTALL_OPTIONAL_DEP=default -t localhost:5000/git-act-router -f docker/Dockerfile .
sudo docker build --build-arg INSTALL_OPTIONAL_DEP=default -t localhost:5000/git-act-router -f docker/Dockerfile.pd .
sudo docker push localhost:5000/git-act-router
sudo sysctl fs.protected_regular=0
minikube image load localhost:5000/git-act-router
2 changes: 1 addition & 1 deletion .github/workflows/router-e2e-test.yml
@@ -135,7 +135,7 @@ jobs:
echo "🔨 Building router docker image"
cd ${{ github.workspace }}
eval "$(minikube docker-env)"
docker build --build-arg INSTALL_OPTIONAL_DEP=default -t git-act-router -f docker/Dockerfile.kvaware .
docker build --build-arg INSTALL_OPTIONAL_DEP=default -t git-act-router -f docker/Dockerfile.pd .
- name: Run all k8s discovery routing tests
run: |
31 changes: 31 additions & 0 deletions docker/Dockerfile.pd
@@ -0,0 +1,31 @@
FROM python:3.12-slim

WORKDIR /app

# hadolint ignore=DL3008
RUN --mount=type=cache,target=/var/lib/apt --mount=type=cache,target=/var/cache/apt \
apt-get update && \
apt-get install -y --no-install-recommends git curl && \
rm -rf /var/lib/apt/lists/* && \
curl -LsSf https://astral.sh/uv/install.sh | sh && \
/root/.local/bin/uv venv /opt/venv

# Copy the pyproject.toml and the git metadata first (leverage Docker layer caching)
COPY pyproject.toml .
COPY .git/ .git/

# Copy the rest of the application code
COPY src/ src/

ARG INSTALL_OPTIONAL_DEP=semantic_cache,lmcache
ENV INSTALL_OPTIONAL_DEP=${INSTALL_OPTIONAL_DEP}

# hadolint ignore=SC1091
RUN . /opt/venv/bin/activate && \
/root/.local/bin/uv pip install --upgrade --no-cache-dir pip setuptools_scm && \
/root/.local/bin/uv pip install --no-cache-dir .[$INSTALL_OPTIONAL_DEP] && \
/root/.local/bin/uv pip install zmq msgspec

# Set the entrypoint
ENTRYPOINT ["/opt/venv/bin/vllm-router"]
CMD []
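
A build-and-run sketch for the new image (the image tag here is an assumption; the build arg mirrors the workflow changes above, and the second command assumes the vllm-router entrypoint exposes a standard --help flag):

    docker build --build-arg INSTALL_OPTIONAL_DEP=semantic_cache,lmcache -t vllm-router:pd -f docker/Dockerfile.pd .
    docker run --rm vllm-router:pd --help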
2 changes: 1 addition & 1 deletion docs/source/developer_guide/docker.rst
@@ -10,4 +10,4 @@ Run this command from the root folder path of the project:

.. code-block:: bash
docker build -t <image_name>:<tag> -f docker/Dockerfile .
docker build -t <image_name>:<tag> -f docker/Dockerfile.pd .
30 changes: 30 additions & 0 deletions helm/templates/deployment-router.yaml
@@ -113,6 +113,26 @@ spec:
- "--lmcache-controller-port"
- "{{ .Values.routerSpec.lmcacheControllerPort }}"
{{- end }}
{{- if .Values.routerSpec.nixlPeerHost }}
- "--nixl-peer-host"
- "{{ .Values.routerSpec.nixlPeerHost }}"
{{- end }}
{{- if .Values.routerSpec.nixlPeerInitPort }}
- "--nixl-peer-init-port"
- "{{ .Values.routerSpec.nixlPeerInitPort }}"
{{- end }}
{{- if .Values.routerSpec.nixlPeerAllocPort }}
- "--nixl-peer-alloc-port"
- "{{ .Values.routerSpec.nixlPeerAllocPort }}"
{{- end }}
{{- if .Values.routerSpec.nixlProxyHost }}
- "--nixl-proxy-host"
- "{{ .Values.routerSpec.nixlProxyHost }}"
{{- end }}
{{- if .Values.routerSpec.nixlProxyPort }}
- "--nixl-proxy-port"
- "{{ .Values.routerSpec.nixlProxyPort }}"
{{- end }}
{{- if .Values.routerSpec.resources }}
resources:
{{- if .Values.routerSpec.resources.requests }}
@@ -135,6 +155,16 @@
containerPort: {{ .Values.routerSpec.containerPort }}
- name: "lmcache-port"
containerPort: 9000
- name: pd-port-1
containerPort: 7100
- name: pd-port-2
containerPort: 7200
- name: pd-port-3
containerPort: 7300
- name: pd-port-4
containerPort: 7400
- name: pd-port-5
containerPort: 7500
Comment on lines +158 to +167

medium

The pd-port-* container ports are hardcoded. This could be made more flexible and maintainable by using a range loop in the Helm template. This would allow configuring the number of ports through values.yaml instead of modifying the template for future changes.

For example, you could introduce routerSpec.pdPortCount and routerSpec.pdStartPort in values.yaml and loop like this:

{{- $pdStartPort := .Values.routerSpec.pdStartPort | default 7100 -}}
{{- $pdPortCount := .Values.routerSpec.pdPortCount | default 5 -}}
{{- range $i := until (int $pdPortCount) }}
          - name: pd-port-{{ add1 $i }}
            containerPort: {{ add $pdStartPort $i }}
{{- end }}
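
The matching values.yaml entries would then be (a sketch of the reviewer's proposed keys, which do not yet exist in the chart):

    routerSpec:
      pdStartPort: 7100
      pdPortCount: 5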

livenessProbe:
initialDelaySeconds: 30
periodSeconds: 5
69 changes: 52 additions & 17 deletions helm/templates/deployment-vllm-multi.yaml
@@ -183,7 +183,11 @@ spec:
{{- if $modelSpec.lmcacheConfig.enabled }}
{{- if hasKey $modelSpec.lmcacheConfig "enablePD" }}
- "--kv-transfer-config"
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"{{ $kv_role }}","kv_connector_extra_config":{"discard_partial_chunks": false, "lmcache_rpc_port": {{ $modelSpec.lmcacheConfig.nixlRole | quote }}}}'
{{- if eq $kv_role "kv_producer" }}
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"{{ $kv_role }}","kv_connector_extra_config":{"discard_partial_chunks": false, "lmcache_rpc_port": "{{ $modelSpec.lmcacheConfig.rpcPort | default "producer1" }}"}}'
{{- else }}
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"{{ $kv_role }}","kv_connector_extra_config":{"discard_partial_chunks": false, "lmcache_rpc_port": "{{ $modelSpec.lmcacheConfig.rpcPort | default "consumer1" }}", "skip_last_n_tokens": {{ $modelSpec.lmcacheConfig.skipLastNTokens | default 1 }}}}'
{{- end }}
{{- else if and (hasKey $modelSpec.vllmConfig "v0") (eq (toString $modelSpec.vllmConfig.v0) "1") }}
- "--kv-transfer-config"
- '{"kv_connector":"LMCacheConnector","kv_role":"{{ $kv_role }}"}'
@@ -259,16 +263,18 @@
value: "True"
- name: VLLM_RPC_TIMEOUT
value: "1000000"
- name: PYTHONHASHSEED
value: "0"
- name: VLLM_ENABLE_V1_MULTIPROCESSING
value: "1"
- name: VLLM_WORKER_MULTIPROC_METHOD
value: "spawn"
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "cudaVisibleDevices" }}
- name: CUDA_VISIBLE_DEVICES
value: {{ $modelSpec.lmcacheConfig.cudaVisibleDevices | quote }}
{{- end }}
{{- if and (hasKey $modelSpec.lmcacheConfig "enablePD") ($modelSpec.lmcacheConfig.enablePD) }}
- name: LMCACHE_LOCAL_CPU
value: "False"
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
value: "0"
- name: LMCACHE_REMOTE_SERDE
value: "NULL"
- name: UCX_TLS
Expand All @@ -281,14 +287,29 @@ spec:
- name: LMCACHE_NIXL_ROLE
value: {{ $modelSpec.lmcacheConfig.nixlRole | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "enableXpyd" }}
- name: LMCACHE_ENABLE_XPYD
value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.enableXpyd | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlProxyHost" }}
- name: LMCACHE_NIXL_PROXY_HOST
value: {{ $modelSpec.lmcacheConfig.nixlProxyHost | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlProxyPort" }}
- name: LMCACHE_NIXL_PROXY_PORT
value: {{ $modelSpec.lmcacheConfig.nixlProxyPort | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlPeerHost" }}
- name: LMCACHE_NIXL_RECEIVER_HOST
# value: "0.0.0.0"
- name: LMCACHE_NIXL_PEER_HOST
value: {{ $modelSpec.lmcacheConfig.nixlPeerHost | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlPeerPort" }}
- name: LMCACHE_NIXL_RECEIVER_PORT
value: {{ $modelSpec.lmcacheConfig.nixlPeerPort | quote }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlPeerInitPort" }}
- name: LMCACHE_NIXL_PEER_INIT_PORT
value: {{ $modelSpec.lmcacheConfig.nixlPeerInitPort | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlPeerAllocPort" }}
- name: LMCACHE_NIXL_PEER_ALLOC_PORT
value: {{ $modelSpec.lmcacheConfig.nixlPeerAllocPort | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlBufferSize" }}
- name: LMCACHE_NIXL_BUFFER_SIZE
@@ -298,22 +319,26 @@
- name: LMCACHE_NIXL_BUFFER_DEVICE
value: {{ $modelSpec.lmcacheConfig.nixlBufferDevice | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlBackends" }}
- name: LMCACHE_NIXL_BACKENDS
value: {{ $modelSpec.lmcacheConfig.nixlBackends | toJson | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "nixlEnableGc" }}
- name: LMCACHE_NIXL_ENABLE_GC
value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.nixlEnableGc | quote }}
{{- end }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "cpuOffloadingBufferSize" }}
{{- if gt (int $modelSpec.lmcacheConfig.cpuOffloadingBufferSize) 0 }}
{{- if hasKey $modelSpec.lmcacheConfig "localCpu" }}
- name: LMCACHE_LOCAL_CPU
value: "True"
value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.localCpu | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "maxLocalCpuSize" }}
- name: LMCACHE_MAX_LOCAL_CPU_SIZE
value: "{{ $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }}"
{{- end}}
value: {{ $modelSpec.lmcacheConfig.maxLocalCpuSize | quote }}
{{- end }}
{{- if hasKey $modelSpec.lmcacheConfig "diskOffloadingBufferSize" }}
{{- if hasKey $modelSpec.lmcacheConfig "maxLocalDiskSize" }}
- name: LMCACHE_MAX_LOCAL_DISK_SIZE
value: "{{ $modelSpec.lmcacheConfig.diskOffloadingBufferSize }}"
value: {{ $modelSpec.lmcacheConfig.maxLocalDiskSize | quote }}
{{- end }}
{{- if .Values.cacheserverSpec }}
- name: LMCACHE_REMOTE_URL
@@ -360,6 +385,16 @@
containerPort: 55555
- name: ucx-port
containerPort: 9999
- name: pd-port-1
containerPort: 7100
- name: pd-port-2
containerPort: 7200
- name: pd-port-3
containerPort: 7300
- name: pd-port-4
containerPort: 7400
- name: pd-port-5
containerPort: 7500
Comment on lines +388 to +397

medium

Similar to deployment-router.yaml, the pd-port-* container ports are hardcoded. Using a range loop with values from values.yaml would make this more flexible and easier to maintain.

For example, you could introduce servingEngineSpec.pdPortCount and servingEngineSpec.pdStartPort in values.yaml and loop like this:

{{- $pdStartPort := .Values.servingEngineSpec.pdStartPort | default 7100 -}}
{{- $pdPortCount := .Values.servingEngineSpec.pdPortCount | default 5 -}}
{{- range $i := until (int $pdPortCount) }}
            - name: pd-port-{{ add1 $i }}
              containerPort: {{ add $pdStartPort $i }}
{{- end }}
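
The engine-side values.yaml sketch would mirror the router one (again, these are the proposed keys only):

    servingEngineSpec:
      pdStartPort: 7100
      pdPortCount: 5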

{{- include "chart.probes" . | indent 10 }}
resources: {{- include "chart.resources" $modelSpec | nindent 12 }}
{{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumeMounts") }}
20 changes: 20 additions & 0 deletions helm/templates/service-router.yaml
@@ -20,6 +20,26 @@ spec:
port: 9000
targetPort: lmcache-port
protocol: TCP
- name: pd-port-1
port: 7100
targetPort: pd-port-1
protocol: TCP
- name: pd-port-2
port: 7200
targetPort: pd-port-2
protocol: TCP
- name: pd-port-3
port: 7300
targetPort: pd-port-3
protocol: TCP
- name: pd-port-4
port: 7400
targetPort: pd-port-4
protocol: TCP
- name: pd-port-5
port: 7500
targetPort: pd-port-5
protocol: TCP
Comment on lines +23 to +42

medium

The pd-port-* definitions are hardcoded. This can be made more maintainable by using a range loop. This would allow configuring the number of ports through values.yaml instead of modifying the template.

For example:

{{- $pdStartPort := .Values.routerSpec.pdStartPort | default 7100 -}}
{{- $pdPortCount := .Values.routerSpec.pdPortCount | default 5 -}}
{{- range $i := until (int $pdPortCount) }}
    - name: pd-port-{{ add1 $i }}
      port: {{ add $pdStartPort $i }}
      targetPort: pd-port-{{ add1 $i }}
      protocol: TCP
{{- end }}

selector:
{{- include "chart.routerLabels" . | nindent 4 }}
{{- end }}