
Commit f14d9a4

Merge branch 'main' into main
2 parents: 213fd0c + b5c0f0f


44 files changed: +4352 −348 lines

.github/values-06-session-routing.yaml

Lines changed: 27 additions & 53 deletions
@@ -3,68 +3,30 @@ servingEngineSpec:
     type: Recreate
   runtimeClassName: ""
   modelSpec:
-  # Prefill node configuration
-  - name: "opt125m-prefill"
+  - name: "opt125m"
     repository: "lmcache/vllm-openai"
-    tag: "2025-05-27-v1"
+    tag: "v0.3.9post2"
     modelURL: "facebook/opt-125m"
-    replicaCount: 1
-    requestCPU: 8
+    replicaCount: 2
+    requestCPU: 6
     requestMemory: "30Gi"
-    # requestGPU: 1
+    requestGPU: 1
     pvcStorage: "50Gi"
     vllmConfig:
       enablePrefixCaching: true
       maxModelLen: 1024
-      v1: 1
-      gpuMemoryUtilization: 0.6
+      gpuMemoryUtilization: 0.8
     lmcacheConfig:
-      cudaVisibleDevices: "0"
       enabled: true
-      kvRole: "kv_producer"
-      enableNixl: true
-      nixlRole: "sender"
-      nixlPeerHost: "vllm-opt125m-decode-engine-service"
-      nixlPeerPort: "55555"
-      nixlBufferSize: "1073741824" # 1GB
-      nixlBufferDevice: "cuda"
-      nixlEnableGc: true
-      enablePD: true
-      cpuOffloadingBufferSize: 0
-    labels:
-      model: "opt125m-prefill"
-    chatTemplate: "chat.jinja2"
-    chatTemplateConfigMap: |-
-      {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
-      {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
-  # Decode node configuration
-  - name: "opt125m-decode"
-    repository: "lmcache/vllm-openai"
-    tag: "2025-05-27-v1"
-    modelURL: "facebook/opt-125m"
-    replicaCount: 1
-    requestCPU: 8
-    requestMemory: "30Gi"
-    # requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 1024
-      v1: 1
-    lmcacheConfig:
-      cudaVisibleDevices: "1"
-      enabled: true
-      kvRole: "kv_consumer" # Set decode node as consumer
-      enableNixl: true
-      nixlRole: "receiver"
-      nixlPeerHost: "0.0.0.0"
-      nixlPeerPort: "55555"
-      nixlBufferSize: "1073741824" # 1GB
-      nixlBufferDevice: "cuda"
-      nixlEnableGc: true
-      enablePD: true
-    labels:
-      model: "opt125m-decode"
+      cpuOffloadingBufferSize: "10"
+      enableController: true
+      controllerPort: 9000
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
+    env:
+    - name: LMCACHE_LOG_LEVEL
+      value: "DEBUG"
     chatTemplate: "chat.jinja2"
     chatTemplateConfigMap: |-
       {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
@@ -81,7 +43,19 @@ routerSpec:
     type: Recreate
   enableRouter: true
   routingLogic: "session"
+  resources:
+    requests:
+      cpu: "1"
+      memory: "2G"
+    limits:
+      cpu: "1"
+      memory: "2G"
+  lmcacheControllerPort: 9000
   sessionKey: "x-user-id"
   extraArgs:
     - "--log-level"
     - "info"
+  startupProbe:
+    initialDelaySeconds: 20
+    periodSeconds: 5
+    failureThreshold: 3
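
Net effect of the engine hunk, for reference: the NIXL-paired prefill/decode entries ("opt125m-prefill" as kv_producer/sender, "opt125m-decode" as kv_consumer/receiver) collapse into a single two-replica "opt125m" deployment whose KV cache is CPU-offloaded and coordinated by the LMCache controller. A sketch of the resulting modelSpec entry, assembled from the + and context lines above; indentation is reconstructed, not copied from the file:

servingEngineSpec:
  modelSpec:
  - name: "opt125m"
    repository: "lmcache/vllm-openai"
    tag: "v0.3.9post2"
    modelURL: "facebook/opt-125m"
    replicaCount: 2
    requestCPU: 6
    requestMemory: "30Gi"
    requestGPU: 1
    pvcStorage: "50Gi"
    vllmConfig:
      enablePrefixCaching: true
      maxModelLen: 1024
      gpuMemoryUtilization: 0.8
    lmcacheConfig:
      enabled: true
      cpuOffloadingBufferSize: "10"  # CPU KV-offload buffer (GB, per the chart's convention)
      enableController: true         # register the workers with the LMCache controller
      controllerPort: 9000           # matches routerSpec.lmcacheControllerPort in the router hunk
      workerPorts: "8001"
      p2pHost: "localhost"
      p2pInitPorts: "30081"
    env:
    - name: LMCACHE_LOG_LEVEL
      value: "DEBUG"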

.github/values-07-prefix-routing.yaml

Lines changed: 27 additions & 53 deletions
@@ -3,68 +3,30 @@ servingEngineSpec:
     type: Recreate
   runtimeClassName: ""
   modelSpec:
-  # Prefill node configuration
-  - name: "opt125m-prefill"
+  - name: "opt125m"
     repository: "lmcache/vllm-openai"
-    tag: "2025-05-27-v1"
+    tag: "v0.3.9post2"
     modelURL: "facebook/opt-125m"
-    replicaCount: 1
-    requestCPU: 8
+    replicaCount: 2
+    requestCPU: 6
     requestMemory: "30Gi"
-    # requestGPU: 1
+    requestGPU: 1
     pvcStorage: "50Gi"
     vllmConfig:
      enablePrefixCaching: true
       maxModelLen: 1024
-      v1: 1
-      gpuMemoryUtilization: 0.6
+      gpuMemoryUtilization: 0.8
     lmcacheConfig:
-      cudaVisibleDevices: "0"
       enabled: true
-      kvRole: "kv_producer"
-      enableNixl: true
-      nixlRole: "sender"
-      nixlPeerHost: "vllm-opt125m-decode-engine-service"
-      nixlPeerPort: "55555"
-      nixlBufferSize: "1073741824" # 1GB
-      nixlBufferDevice: "cuda"
-      nixlEnableGc: true
-      enablePD: true
-      cpuOffloadingBufferSize: 0
-    labels:
-      model: "opt125m-prefill"
-    chatTemplate: "chat.jinja2"
-    chatTemplateConfigMap: |-
-      {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
-      {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
-  # Decode node configuration
-  - name: "opt125m-decode"
-    repository: "lmcache/vllm-openai"
-    tag: "2025-05-27-v1"
-    modelURL: "facebook/opt-125m"
-    replicaCount: 1
-    requestCPU: 8
-    requestMemory: "30Gi"
-    # requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 1024
-      v1: 1
-    lmcacheConfig:
-      cudaVisibleDevices: "1"
-      enabled: true
-      kvRole: "kv_consumer" # Set decode node as consumer
-      enableNixl: true
-      nixlRole: "receiver"
-      nixlPeerHost: "0.0.0.0"
-      nixlPeerPort: "55555"
-      nixlBufferSize: "1073741824" # 1GB
-      nixlBufferDevice: "cuda"
-      nixlEnableGc: true
-      enablePD: true
-    labels:
-      model: "opt125m-decode"
+      cpuOffloadingBufferSize: "10"
+      enableController: true
+      controllerPort: 9000
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
+    env:
+    - name: LMCACHE_LOG_LEVEL
+      value: "DEBUG"
     chatTemplate: "chat.jinja2"
     chatTemplateConfigMap: |-
       {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
@@ -79,8 +41,20 @@ routerSpec:
   imagePullPolicy: "IfNotPresent"
   strategy:
     type: Recreate
+  resources:
+    requests:
+      cpu: "1"
+      memory: "2G"
+    limits:
+      cpu: "1"
+      memory: "2G"
   enableRouter: true
   routingLogic: "prefixaware"
   extraArgs:
     - "--log-level"
     - "info"
+  lmcacheControllerPort: 9000
+  startupProbe:
+    initialDelaySeconds: 20
+    periodSeconds: 5
+    failureThreshold: 3
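
The servingEngineSpec hunk here is byte-identical to values-06-session-routing.yaml; only the router hunk differs in where the new keys land. A sketch of the resulting routerSpec, assembled from the + and context lines above (indentation assumed):

routerSpec:
  imagePullPolicy: "IfNotPresent"
  strategy:
    type: Recreate
  resources:            # new: pin the router to fixed CPU/memory requests and limits
    requests:
      cpu: "1"
      memory: "2G"
    limits:
      cpu: "1"
      memory: "2G"
  enableRouter: true
  routingLogic: "prefixaware"
  extraArgs:
    - "--log-level"
    - "info"
  lmcacheControllerPort: 9000  # new: port for reaching the LMCache controller enabled engine-side
  startupProbe:                # new: allows roughly 20s + 3 x 5s = 35s for startup before restart
    initialDelaySeconds: 20
    periodSeconds: 5
    failureThreshold: 3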

.github/values-08-roundrobin-routing.yaml

Lines changed: 27 additions & 53 deletions
@@ -3,68 +3,30 @@ servingEngineSpec:
     type: Recreate
   runtimeClassName: ""
   modelSpec:
-  # Prefill node configuration
-  - name: "opt125m-prefill"
+  - name: "opt125m"
     repository: "lmcache/vllm-openai"
-    tag: "2025-05-27-v1"
+    tag: "v0.3.9post2"
     modelURL: "facebook/opt-125m"
-    replicaCount: 1
-    requestCPU: 8
+    replicaCount: 2
+    requestCPU: 6
     requestMemory: "30Gi"
-    # requestGPU: 1
+    requestGPU: 1
     pvcStorage: "50Gi"
     vllmConfig:
       enablePrefixCaching: true
       maxModelLen: 1024
-      v1: 1
-      gpuMemoryUtilization: 0.6
+      gpuMemoryUtilization: 0.8
     lmcacheConfig:
-      cudaVisibleDevices: "0"
       enabled: true
-      kvRole: "kv_producer"
-      enableNixl: true
-      nixlRole: "sender"
-      nixlPeerHost: "vllm-opt125m-decode-engine-service"
-      nixlPeerPort: "55555"
-      nixlBufferSize: "1073741824" # 1GB
-      nixlBufferDevice: "cuda"
-      nixlEnableGc: true
-      enablePD: true
-      cpuOffloadingBufferSize: 0
-    labels:
-      model: "opt125m-prefill"
-    chatTemplate: "chat.jinja2"
-    chatTemplateConfigMap: |-
-      {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
-      {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
-  # Decode node configuration
-  - name: "opt125m-decode"
-    repository: "lmcache/vllm-openai"
-    tag: "2025-05-27-v1"
-    modelURL: "facebook/opt-125m"
-    replicaCount: 1
-    requestCPU: 8
-    requestMemory: "30Gi"
-    # requestGPU: 1
-    pvcStorage: "50Gi"
-    vllmConfig:
-      enablePrefixCaching: true
-      maxModelLen: 1024
-      v1: 1
-    lmcacheConfig:
-      cudaVisibleDevices: "1"
-      enabled: true
-      kvRole: "kv_consumer" # Set decode node as consumer
-      enableNixl: true
-      nixlRole: "receiver"
-      nixlPeerHost: "0.0.0.0"
-      nixlPeerPort: "55555"
-      nixlBufferSize: "1073741824" # 1GB
-      nixlBufferDevice: "cuda"
-      nixlEnableGc: true
-      enablePD: true
-    labels:
-      model: "opt125m-decode"
+      cpuOffloadingBufferSize: "10"
+      enableController: true
+      controllerPort: 9000
+      workerPorts: "8001"
+      p2pHost: "localhost"
+      p2pInitPorts: "30081"
+    env:
+    - name: LMCACHE_LOG_LEVEL
+      value: "DEBUG"
     chatTemplate: "chat.jinja2"
     chatTemplateConfigMap: |-
       {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
@@ -84,3 +46,15 @@ routerSpec:
   extraArgs:
     - "--log-level"
     - "info"
+  resources:
+    requests:
+      cpu: "1"
+      memory: "2G"
+    limits:
+      cpu: "1"
+      memory: "2G"
+  lmcacheControllerPort: 9000
+  startupProbe:
+    initialDelaySeconds: 20
+    periodSeconds: 5
+    failureThreshold: 3
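
Same engine hunk once more; the router hunk here only appends keys after extraArgs. Across the three values files the only substantive router difference is the routing policy. A minimal sketch of the per-file delta; note the roundrobin value is not visible in this hunk and is inferred from the filename:

# values-06-session-routing.yaml
routingLogic: "session"
sessionKey: "x-user-id"      # request header used to pin a session to a replica

# values-07-prefix-routing.yaml
routingLogic: "prefixaware"

# values-08-roundrobin-routing.yaml
routingLogic: "roundrobin"   # assumed; not shown in the hunk above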
