Skip to content

Commit ddf3e75

Browse files
authored
[Cherry-pick] [0.11.0] pd proxy support ipv6 and fix proxy (#4242)
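### What this PR does / why we need it? The PD proxy now supports IPv6 addresses, and the Mooncake connector checks whether an IPv6 address is used and notifies the user, because its Ascend Direct Xfer backend does not currently support IPv6. --------- Signed-off-by: liziyu <[email protected]>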
1 parent 378e92a commit ddf3e75

File tree

4 files changed

+29
-5
lines changed

4 files changed

+29
-5
lines changed

examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@
8888
import asyncio
8989
import functools
9090
import heapq
91+
import ipaddress
9192
import os
9293
import sys
9394
import threading
@@ -116,6 +117,12 @@ def __init__(self, host, port):
116117
self.host = host
117118
self.port = port
118119
self.url = f'http://{host}:{port}/v1'
120+
try:
121+
ip = ipaddress.ip_address(self.host)
122+
if isinstance(ip, ipaddress.IPv6Address):
123+
self.url = f'http://[{host}]:{port}/v1'
124+
except Exception:
125+
pass
119126
self.client = httpx.AsyncClient(timeout=None,
120127
base_url=self.url,
121128
limits=httpx.Limits(
@@ -356,6 +363,9 @@ async def send_request_to_service(client: httpx.AsyncClient,
356363
req_data = req_data.copy()
357364
req_data["stream"] = False
358365
req_data["max_tokens"] = 1
366+
req_data["min_tokens"] = 1
367+
if "max_completion_tokens" in req_data:
368+
req_data["max_completion_tokens"] = 1
359369
if "stream_options" in req_data:
360370
del req_data["stream_options"]
361371
headers = {

examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@
8888
import asyncio
8989
import functools
9090
import heapq
91+
import ipaddress
9192
import json
9293
import os
9394
import sys
@@ -118,6 +119,12 @@ def __init__(self, host, port):
118119
self.host = host
119120
self.port = port
120121
self.url = f'http://{host}:{port}/v1'
122+
try:
123+
ip = ipaddress.ip_address(self.host)
124+
if isinstance(ip, ipaddress.IPv6Address):
125+
self.url = f'http://[{host}]:{port}/v1'
126+
except Exception:
127+
pass
121128
self.client = httpx.AsyncClient(timeout=None,
122129
base_url=self.url,
123130
limits=httpx.Limits(
@@ -366,6 +373,8 @@ async def send_request_to_service(client: httpx.AsyncClient,
366373
req_data["stream"] = False
367374
req_data["max_tokens"] = 1
368375
req_data["min_tokens"] = 1
376+
if "max_completion_tokens" in req_data:
377+
req_data["max_completion_tokens"] = 1
369378
if "stream_options" in req_data:
370379
del req_data["stream_options"]
371380
headers = {

vllm_ascend/distributed/mooncake/transfer_engine.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import ipaddress
12
import threading
23
from typing import Optional
34

@@ -8,6 +9,15 @@
89

910

1011
def get_global_te(hostname: str, device_name: Optional[str]):
12+
try:
13+
ip = ipaddress.ip_address(hostname)
14+
if isinstance(ip, ipaddress.IPv6Address):
15+
raise RuntimeError(
16+
"The backend of mooncake's Ascend Direct Xfer Library currently does not support IPv6."
17+
)
18+
except ValueError:
19+
pass
20+
1121
global _global_te
1222
if _global_te is None:
1323
with _global_te_lock:

vllm_ascend/envs.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,6 @@
162162
# Whether to enable msMonitor tool to monitor the performance of vllm-ascend.
163163
"MSMONITOR_USE_DAEMON":
164164
lambda: bool(int(os.getenv("MSMONITOR_USE_DAEMON", '0'))),
165-
# Timeout (in seconds) for delayed KVCache block release. In the prefill
166-
# node, if a request is marked for delayed KV block release and the blocks
167-
# are not freed within this timeout, they will be forcibly released.
168-
"VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT":
169-
lambda: int(os.getenv("VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT", 250)),
170165
"VLLM_ASCEND_ENABLE_MLAPO":
171166
lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLAPO", '0'))),
172167
# Whether to enable transpose weight and cast format to FRACTAL_NZ.

0 commit comments

Comments
 (0)