File tree Expand file tree Collapse file tree 2 files changed +5
-4
lines changed
Expand file tree Collapse file tree 2 files changed +5
-4
lines changed Original file line number Diff line number Diff line change @@ -249,7 +249,8 @@ async def generate(
249249
250250 # 将请求转发给其他节点
251251 await self .order_req_manager .add_request (req_status .group_req_objs )
252- await self .transfer_to_next_module ()
252+ async with self .order_req_manager .lock :
253+ await self .transfer_to_next_module ()
253254
254255 results_generator = self ._wait_to_token_package (
255256 start_time ,
Original file line number Diff line number Diff line change 1313# node_world_size 指一个推理节点的使用的卡数,如两机 tp 推理,如果两机器8卡,则 node_world_size 为 8.
1414# rank_in_node 指在一个node内的rank序号,如两机8卡推理,每机上的rank序号都是0-7
1515
16+
1617def set_environ (environ_name , value ):
1718 os .environ [environ_name ] = str (value )
1819
@@ -37,8 +38,7 @@ def _init_distributed_env(kvargs):
3738 set_current_rank_in_node (get_global_rank () % node_world_size )
3839 set_node_world_size (node_world_size )
3940
40-
41- device_id = kvargs ["rank_id" ] % size_per_node
41+ device_id = kvargs ["rank_id" ] % get_node_world_size ()
4242 set_current_device_id (device_id )
4343 torch .cuda .set_device (device_id )
4444 if kvargs ["world_size" ] > 1 :
@@ -113,7 +113,7 @@ def get_current_device_id():
113113 return int (get_environ ("LIGHTLLM_CURRENT_DEVICE_ID" ))
114114
115115
116- def set_current_rank_in_node (rank :int ):
116+ def set_current_rank_in_node (rank : int ):
117117 set_environ ("LIGHTLLM_CURRENT_RANK_IN_NODE" , rank )
118118
119119
You can’t perform that action at this time.
0 commit comments