-
Notifications
You must be signed in to change notification settings - Fork 278
[Feat][PD] latest PD support from LMCache with NIXL #669
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1aa1feb
b507357
72141a0
7623b48
6b5a533
099161f
b92826e
1db19aa
7998394
0b65402
4ebc430
73f88a0
7dcc043
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# syntax=docker/dockerfile:1
# Builds the vllm-router image: a Python 3.12 virtual environment in /opt/venv
# with the project (plus optional extras) installed via uv.
FROM python:3.12-slim

WORKDIR /app

# Run every RUN step under bash with pipefail so that a failure on the left
# side of a pipe (e.g. the curl download below) fails the build instead of
# being masked by the exit status of the last command in the pipeline.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Install the OS tools needed at build time, install uv, and create the venv.
# /var/lib/apt and /var/cache/apt are BuildKit cache mounts: their contents are
# never written into the image layer, so no `rm -rf /var/lib/apt/lists/*` is
# needed (it would only empty the cache). `sharing=locked` serializes access so
# parallel builds do not collide on the apt/dpkg locks.
# NOTE(review): the uv installer script is fetched unpinned from astral.sh;
# consider pinning a uv version for reproducible builds.
# hadolint ignore=DL3008
RUN --mount=type=cache,target=/var/lib/apt,sharing=locked \
    --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt-get update && \
    apt-get install -y --no-install-recommends curl git && \
    curl -LsSf https://astral.sh/uv/install.sh | sh && \
    /root/.local/bin/uv venv /opt/venv

# Copy the project metadata and the git metadata
# (presumably needed so setuptools_scm can derive the package version — confirm).
# Note that .git/ changes on every commit, so these layers are rebuilt often.
COPY pyproject.toml .
COPY .git/ .git/

# Copy the rest of the application code
COPY src/ src/

# Comma-separated list of optional dependency groups (extras) to install.
# Override with: docker build --build-arg INSTALL_OPTIONAL_DEP=<extras> .
ARG INSTALL_OPTIONAL_DEP=semantic_cache,lmcache
# Also persisted into the image environment, so it is visible at runtime.
ENV INSTALL_OPTIONAL_DEP=${INSTALL_OPTIONAL_DEP}

# Install the project and its extras into the venv. The requirement spec is
# quoted so the square brackets are never treated as a shell glob pattern, and
# every install passes --no-cache-dir so no uv cache ends up in the layer.
# hadolint ignore=SC1091
RUN . /opt/venv/bin/activate && \
    /root/.local/bin/uv pip install --upgrade --no-cache-dir pip setuptools_scm && \
    /root/.local/bin/uv pip install --no-cache-dir ".[${INSTALL_OPTIONAL_DEP}]" && \
    /root/.local/bin/uv pip install --no-cache-dir zmq msgspec

# NOTE(review): the image runs as root because no USER is set; consider adding
# a dedicated non-root user once the router's runtime write paths are confirmed.

# Set the entrypoint; CMD is empty so arguments come from `docker run` / the pod spec.
ENTRYPOINT ["/opt/venv/bin/vllm-router"]
CMD []
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -183,7 +183,11 @@ spec: | |
{{- if $modelSpec.lmcacheConfig.enabled }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "enablePD" }} | ||
- "--kv-transfer-config" | ||
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"{{ $kv_role }}","kv_connector_extra_config":{"discard_partial_chunks": false, "lmcache_rpc_port": {{ $modelSpec.lmcacheConfig.nixlRole | quote }}}}' | ||
{{- if eq $kv_role "kv_producer" }} | ||
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"{{ $kv_role }}","kv_connector_extra_config":{"discard_partial_chunks": false, "lmcache_rpc_port": "{{ $modelSpec.lmcacheConfig.rpcPort | default "producer1" }}"}}' | ||
{{- else }} | ||
- '{"kv_connector":"LMCacheConnectorV1","kv_role":"{{ $kv_role }}","kv_connector_extra_config":{"discard_partial_chunks": false, "lmcache_rpc_port": "{{ $modelSpec.lmcacheConfig.rpcPort | default "consumer1" }}", "skip_last_n_tokens": {{ $modelSpec.lmcacheConfig.skipLastNTokens | default 1 }}}}' | ||
{{- end }} | ||
{{- else if and (hasKey $modelSpec.vllmConfig "v0") (eq (toString $modelSpec.vllmConfig.v0) "1") }} | ||
- "--kv-transfer-config" | ||
- '{"kv_connector":"LMCacheConnector","kv_role":"{{ $kv_role }}"}' | ||
|
@@ -259,16 +263,18 @@ spec: | |
value: "True" | ||
- name: VLLM_RPC_TIMEOUT | ||
value: "1000000" | ||
- name: PYTHONHASHSEED | ||
value: "0" | ||
- name: VLLM_ENABLE_V1_MULTIPROCESSING | ||
value: "1" | ||
- name: VLLM_WORKER_MULTIPROC_METHOD | ||
value: "spawn" | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "cudaVisibleDevices" }} | ||
- name: CUDA_VISIBLE_DEVICES | ||
value: {{ $modelSpec.lmcacheConfig.cudaVisibleDevices | quote }} | ||
{{- end }} | ||
{{- if and (hasKey $modelSpec.lmcacheConfig "enablePD") ($modelSpec.lmcacheConfig.enablePD) }} | ||
- name: LMCACHE_LOCAL_CPU | ||
value: "False" | ||
- name: LMCACHE_MAX_LOCAL_CPU_SIZE | ||
value: "0" | ||
- name: LMCACHE_REMOTE_SERDE | ||
value: "NULL" | ||
- name: UCX_TLS | ||
|
@@ -281,14 +287,29 @@ spec: | |
- name: LMCACHE_NIXL_ROLE | ||
value: {{ $modelSpec.lmcacheConfig.nixlRole | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "enableXpyd" }} | ||
- name: LMCACHE_ENABLE_XPYD | ||
value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.enableXpyd | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlProxyHost" }} | ||
- name: LMCACHE_NIXL_PROXY_HOST | ||
value: {{ $modelSpec.lmcacheConfig.nixlProxyHost | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlProxyPort" }} | ||
- name: LMCACHE_NIXL_PROXY_PORT | ||
value: {{ $modelSpec.lmcacheConfig.nixlProxyPort | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlPeerHost" }} | ||
- name: LMCACHE_NIXL_RECEIVER_HOST | ||
# value: "0.0.0.0" | ||
- name: LMCACHE_NIXL_PEER_HOST | ||
value: {{ $modelSpec.lmcacheConfig.nixlPeerHost | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlPeerPort" }} | ||
- name: LMCACHE_NIXL_RECEIVER_PORT | ||
value: {{ $modelSpec.lmcacheConfig.nixlPeerPort | quote }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlPeerInitPort" }} | ||
- name: LMCACHE_NIXL_PEER_INIT_PORT | ||
value: {{ $modelSpec.lmcacheConfig.nixlPeerInitPort | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlPeerAllocPort" }} | ||
- name: LMCACHE_NIXL_PEER_ALLOC_PORT | ||
value: {{ $modelSpec.lmcacheConfig.nixlPeerAllocPort | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlBufferSize" }} | ||
- name: LMCACHE_NIXL_BUFFER_SIZE | ||
|
@@ -298,22 +319,26 @@ spec: | |
- name: LMCACHE_NIXL_BUFFER_DEVICE | ||
value: {{ $modelSpec.lmcacheConfig.nixlBufferDevice | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlBackends" }} | ||
- name: LMCACHE_NIXL_BACKENDS | ||
value: {{ $modelSpec.lmcacheConfig.nixlBackends | toJson | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "nixlEnableGc" }} | ||
- name: LMCACHE_NIXL_ENABLE_GC | ||
value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.nixlEnableGc | quote }} | ||
{{- end }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "cpuOffloadingBufferSize" }} | ||
{{- if gt (int $modelSpec.lmcacheConfig.cpuOffloadingBufferSize) 0 }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "localCpu" }} | ||
- name: LMCACHE_LOCAL_CPU | ||
value: "True" | ||
value: {{ ternary "True" "False" $modelSpec.lmcacheConfig.localCpu | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "maxLocalCpuSize" }} | ||
- name: LMCACHE_MAX_LOCAL_CPU_SIZE | ||
value: "{{ $modelSpec.lmcacheConfig.cpuOffloadingBufferSize }}" | ||
{{- end}} | ||
value: {{ $modelSpec.lmcacheConfig.maxLocalCpuSize | quote }} | ||
{{- end }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "diskOffloadingBufferSize" }} | ||
{{- if hasKey $modelSpec.lmcacheConfig "maxLocalDiskSize" }} | ||
- name: LMCACHE_MAX_LOCAL_DISK_SIZE | ||
value: "{{ $modelSpec.lmcacheConfig.diskOffloadingBufferSize }}" | ||
value: {{ $modelSpec.lmcacheConfig.maxLocalDiskSize | quote }} | ||
{{- end }} | ||
{{- if .Values.cacheserverSpec }} | ||
- name: LMCACHE_REMOTE_URL | ||
|
@@ -360,6 +385,16 @@ spec: | |
containerPort: 55555 | ||
- name: ucx-port | ||
containerPort: 9999 | ||
- name: pd-port-1 | ||
containerPort: 7100 | ||
- name: pd-port-2 | ||
containerPort: 7200 | ||
- name: pd-port-3 | ||
containerPort: 7300 | ||
- name: pd-port-4 | ||
containerPort: 7400 | ||
- name: pd-port-5 | ||
containerPort: 7500 | ||
Comment on lines
+388
to
+397
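There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar to the `pd-port-*` ports in the router service, these `pd-port-*` container ports are hardcoded; they could be generated with a `range` loop configured through `values.yaml`. For example, you could introduce `servingEngineSpec.pdStartPort` and `servingEngineSpec.pdPortCount` and loop like this: {{- $pdStartPort := .Values.servingEngineSpec.pdStartPort | default 7100 -}}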
{{- $pdPortCount := .Values.servingEngineSpec.pdPortCount | default 5 -}}
{{- range $i := until (int $pdPortCount) }}
- name: pd-port-{{ add1 $i }}
containerPort: {{ add $pdStartPort $i }}
{{- end }} |
||
{{- include "chart.probes" . | indent 10 }} | ||
resources: {{- include "chart.resources" $modelSpec | nindent 12 }} | ||
{{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumeMounts") }} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,26 @@ spec: | |
port: 9000 | ||
targetPort: lmcache-port | ||
protocol: TCP | ||
- name: pd-port-1 | ||
port: 7100 | ||
targetPort: pd-port-1 | ||
protocol: TCP | ||
- name: pd-port-2 | ||
port: 7200 | ||
targetPort: pd-port-2 | ||
protocol: TCP | ||
- name: pd-port-3 | ||
port: 7300 | ||
targetPort: pd-port-3 | ||
protocol: TCP | ||
- name: pd-port-4 | ||
port: 7400 | ||
targetPort: pd-port-4 | ||
protocol: TCP | ||
- name: pd-port-5 | ||
port: 7500 | ||
targetPort: pd-port-5 | ||
protocol: TCP | ||
Comment on lines
+23
to
+42
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The `pd-port-*` service ports are hardcoded; they could be generated with a `range` loop configured through `values.yaml`. For example: {{- $pdStartPort := .Values.routerSpec.pdStartPort | default 7100 -}}
{{- $pdPortCount := .Values.routerSpec.pdPortCount | default 5 -}}
{{- range $i := until (int $pdPortCount) }}
- name: pd-port-{{ add1 $i }}
port: {{ add $pdStartPort $i }}
targetPort: pd-port-{{ add1 $i }}
protocol: TCP
{{- end }} |
||
selector: | ||
{{- include "chart.routerLabels" . | nindent 4 }} | ||
{{- end }} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `pd-port-*` container ports are hardcoded. This could be made more flexible and maintainable by using a `range` loop in the Helm template. This would allow configuring the number of ports through `values.yaml` instead of modifying the template for future changes. For example, you could introduce `routerSpec.pdPortCount` and `routerSpec.pdStartPort` in `values.yaml` and loop like this: