Skip to content

Commit 8aa2632

Browse files
committed
fix
1 parent 5e4073e commit 8aa2632

File tree

13 files changed

+12
-36
lines changed

13 files changed

+12
-36
lines changed

docs/CN/source/tutorial/deepseek_deployment.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
187187
export host=$1
188188
export pd_master_ip=$2
189189
nvidia-cuda-mps-control -d
190-
MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
190+
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
191191
--model_dir /path/DeepSeek-R1 \
192192
--run_mode "prefill" \
193193
--tp 8 \
@@ -211,7 +211,7 @@ PD (Prefill-Decode) 分离模式将预填充和解码阶段分离部署,可以
211211
export host=$1
212212
export pd_master_ip=$2
213213
nvidia-cuda-mps-control -d
214-
MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
214+
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
215215
--model_dir /path/DeepSeek-R1 \
216216
--run_mode "decode" \
217217
--tp 8 \

docs/EN/source/tutorial/deepseek_deployment.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ PD (Prefill-Decode) disaggregation mode separates prefill and decode stages for
187187
export host=$1
188188
export pd_master_ip=$2
189189
nvidia-cuda-mps-control -d
190-
MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
190+
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
191191
--model_dir /path/DeepSeek-R1 \
192192
--run_mode "prefill" \
193193
--tp 8 \
@@ -208,7 +208,7 @@ PD (Prefill-Decode) disaggregation mode separates prefill and decode stages for
208208
export host=$1
209209
export pd_master_ip=$2
210210
nvidia-cuda-mps-control -d
211-
MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
211+
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
212212
--model_dir /path/DeepSeek-R1 \
213213
--run_mode "decode" \
214214
--tp 8 \

lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/decode_node_impl/decode_impl.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from rpyc.utils.server import ThreadedServer
1212
from lightllm.common.basemodel.infer_lock import g_router_lock
1313
from .decode_task_cache import g_success_kv_move_task_cache, KVMoveTask
14-
from lightllm.utils.device_utils import kv_trans_use_p2p
1514
from lightllm.utils.envs_utils import get_unique_server_name
1615
from lightllm.utils.dist_utils import create_new_group_for_current_dp
1716

@@ -39,12 +38,6 @@ def init_custom(self):
3938
PDDecodeInferRpcServer(self), socket_path=socket_path, protocol_config={"allow_pickle": True}
4039
)
4140
threading.Thread(target=lambda: t.start(), daemon=True).start()
42-
43-
if kv_trans_use_p2p():
44-
from ..p2p_fix import reduce_tensor
45-
46-
mp.reductions.reduce_tensor.__code__ = reduce_tensor.__code__
47-
4841
return
4942

5043
def _init_reqs(self, reqs: List[Tuple]):

lightllm/server/router/model_infer/mode_backend/continues_batch/pd_mode/prefill_node_impl/prefill_impl.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from lightllm.common.basemodel.infer_lock import g_router_lock, g_infer_state_lock
1212
from rpyc.utils.server import ThreadedServer
1313
from .prefill_task_cache import g_kv_move_task_cache
14-
from lightllm.utils.device_utils import kv_trans_use_p2p
1514
from lightllm.utils.envs_utils import get_unique_server_name
1615
from lightllm.utils.dist_utils import create_new_group_for_current_dp
1716
from lightllm.server.router.model_infer.mode_backend.chunked_prefill.impl import ChunkedPrefillBackend
@@ -41,12 +40,6 @@ def init_custom(self):
4140
PDPrefillInferRpcServer(self), socket_path=socket_path, protocol_config={"allow_pickle": True}
4241
)
4342
threading.Thread(target=lambda: t.start(), daemon=True).start()
44-
45-
if kv_trans_use_p2p():
46-
from ..p2p_fix import reduce_tensor
47-
48-
mp.reductions.reduce_tensor.__code__ = reduce_tensor.__code__
49-
5043
return
5144

5245
def _pre_handle_finished_reqs(self, finished_reqs):

lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_impl.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,7 @@ def __init__(self, info_queue: mp.Queue) -> None:
1818
self.classed_req_strict_prefill = False
1919

2020
def init_custom(self):
21-
2221
assert kv_trans_use_p2p()
23-
if kv_trans_use_p2p():
24-
from ..p2p_fix import reduce_tensor
25-
26-
mp.reductions.reduce_tensor.__code__ = reduce_tensor.__code__
2722

2823
# TODO 如何支持不支持 P2P的场景
2924
return

lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_impl.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,6 @@ def __init__(self, info_queue: mp.Queue) -> None:
2121

2222
def init_custom(self):
2323
assert kv_trans_use_p2p()
24-
25-
if kv_trans_use_p2p():
26-
from ..p2p_fix import reduce_tensor
27-
28-
mp.reductions.reduce_tensor.__code__ = reduce_tensor.__code__
2924
return
3025

3126
def _filter_not_ready_reqs(self, req_ids: List[int]) -> List[InferReq]:

lightllm/utils/device_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def init_p2p(device_index):
107107

108108
@lru_cache(maxsize=None)
109109
def kv_trans_use_p2p():
110-
return os.getenv("KV_TRANS_USE_P2P", "False").upper() in ["1", "TRUE", "ON"]
110+
return not (os.getenv("DISABLE_KV_TRANS_USE_P2P", "False").upper() in ["1", "TRUE", "ON"])
111111

112112

113113
def has_nvlink():

test/start_scripts/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ sh multi_pd_master/pd_decode.sh <host> <config_server_host>
100100

101101
- `LOADWORKER`: Model loading thread count, recommended 8-18
102102
- `MOE_MODE`: Expert parallelism mode, set to EP to enable expert parallelism
103-
- `KV_TRANS_USE_P2P`: Enable P2P communication optimization
103+
- `DISABLE_KV_TRANS_USE_P2P`: Disable the P2P communication optimization used to transfer KV data
104104
- `CUDA_VISIBLE_DEVICES`: Specify GPU devices to use
105105

106106
### Important Parameters

test/start_scripts/multi_pd_master.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Ch
66
python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat --run_mode "pd_master" --host 10.120.114.74 --port 60012 --config_server_host 10.120.114.74 --config_server_port 60088
77

88
nvidia-cuda-mps-control -d
9-
CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
9+
CUDA_VISIBLE_DEVICES=0 LOADWORKER=1 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
1010
--run_mode "prefill" \
1111
--host 10.120.178.74 \
1212
--port 8019 \
@@ -20,7 +20,7 @@ CUDA_VISIBLE_DEVICES=0 KV_TRANS_USE_P2P=1 LOADWORKER=1 python -m lightllm.server
2020
--config_server_host 10.120.114.74 \
2121
--config_server_port 60088
2222

23-
CUDA_VISIBLE_DEVICES=1 KV_TRANS_USE_P2P=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
23+
CUDA_VISIBLE_DEVICES=1 LOADWORKER=10 python -m lightllm.server.api_server --model_dir /mtc/models/DeepSeek-V2-Lite-Chat \
2424
--run_mode "decode" \
2525
--host 10.120.178.74 \
2626
--port 8121 \

test/start_scripts/single_pd_master/pd_decode.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
export host=$1
66
export pd_master_ip=$2
77
nvidia-cuda-mps-control -d
8-
MOE_MODE=EP KV_TRANS_USE_P2P=1 LOADWORKER=18 python -m lightllm.server.api_server \
8+
MOE_MODE=EP LOADWORKER=18 python -m lightllm.server.api_server \
99
--model_dir /path/DeepSeek-R1 \
1010
--run_mode "decode" \
1111
--tp 8 \

0 commit comments

Comments (0)