
Commit 2ef2a04

[ascend] fix dp multinode rank_table mapping (#4268)
1 parent 0cd064d commit 2ef2a04

File tree

2 files changed, +30 -11 lines changed

lmdeploy/pytorch/engine/executor/ray_executor.py

Lines changed: 26 additions & 6 deletions
@@ -54,15 +54,24 @@ def get_ascend_device_rank_mapping(master_addr):
         rank_table = json.load(f)
     try:
         assert master_addr == rank_table['server_list'][0]['server_id'], 'Master address does not match rank table'
-        rank_mapping = {}
-        worker_ips = []
+        rank_mapping: Dict[int, int] = {}
+        worker_ip_by_rank: Dict[int, str] = {}
         for server in rank_table['server_list']:
             node_ip = server['server_id']
             for idx, device in enumerate(server['device']):
-                local_rank = idx
+                # Prefer explicit device_id if present; fall back to enumeration order.
+                local_rank = int(device.get('device_id', idx))
                 global_rank = int(device['rank_id'])
                 rank_mapping[global_rank] = local_rank
-                worker_ips.append(node_ip)
+                worker_ip_by_rank[global_rank] = node_ip
+
+        if len(worker_ip_by_rank) == 0:
+            raise ValueError('Rank table contains no devices.')
+
+        ranks = sorted(worker_ip_by_rank.keys())
+        if ranks[0] != 0 or ranks[-1] != len(ranks) - 1:
+            raise ValueError(f'Rank ids are not contiguous starting from 0: {ranks[:8]}...{ranks[-8:]}')
+        worker_ips = [worker_ip_by_rank[r] for r in range(len(ranks))]
     except Exception as e:
         logger.error(f'Parse rank table file({rank_table}) failed')
         raise e
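
For reference, a minimal sketch of the rank table layout this parser expects. The field names (server_list, server_id, device, device_id, rank_id) come from the diff above; the two-node topology and the IP addresses are illustrative only.

# Hypothetical rank_table contents; only the fields read by
# get_ascend_device_rank_mapping are shown.
rank_table = {
    'server_list': [
        {'server_id': '10.0.0.1',          # must match master_addr
         'device': [{'device_id': '0', 'rank_id': '0'},
                    {'device_id': '1', 'rank_id': '1'}]},
        {'server_id': '10.0.0.2',
         'device': [{'device_id': '0', 'rank_id': '2'},
                    {'device_id': '1', 'rank_id': '3'}]},
    ],
}

# Same mapping logic as the patched code: global rank -> local device id,
# global rank -> node ip, then an ip list ordered by contiguous rank ids.
rank_mapping, worker_ip_by_rank = {}, {}
for server in rank_table['server_list']:
    for idx, device in enumerate(server['device']):
        global_rank = int(device['rank_id'])
        rank_mapping[global_rank] = int(device.get('device_id', idx))
        worker_ip_by_rank[global_rank] = server['server_id']
worker_ips = [worker_ip_by_rank[r] for r in range(len(worker_ip_by_rank))]

print(rank_mapping)  # {0: 0, 1: 1, 2: 0, 3: 1}
print(worker_ips)    # ['10.0.0.1', '10.0.0.1', '10.0.0.2', '10.0.0.2']
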
@@ -625,8 +634,19 @@ def _init_ascend_distributed_environment(self, driver_ip):
         if rank_table_file:
             # if rank table file is set, use it to get rank mapping, multiple nodes
             rank_mapping, worker_ips, envs = get_ascend_device_rank_mapping(driver_ip)
-            self.workers = self._sort_workers_by_ip(worker_ips, self.workers)
-            ray.get([worker.set_device.remote(rank_mapping[idx]) for idx, worker in enumerate(self.workers)])
+            rank_start = self.rank_offset
+            rank_end = rank_start + len(self.workers)
+            if rank_end > len(worker_ips):
+                raise ValueError(
+                    'Rank table world_size is smaller than required ranks for current dp_rank. '
+                    f'rank_table_world_size={len(worker_ips)}, required_rank_range=[{rank_start}, {rank_end})')
+
+            # In dp mode each process only owns a slice of global ranks.
+            expected_worker_ips = worker_ips[rank_start:rank_end]
+            self.workers = self._sort_workers_by_ip(expected_worker_ips, self.workers)
+
+            ray.get(
+                [worker.set_device.remote(rank_mapping[rank_start + idx]) for idx, worker in enumerate(self.workers)])
             ray.get([worker.set_env.remote(envs) for worker in self.workers])
         elif not set_rt_visable_devices_by_ray:
             # if rank table file is not set, treat as single node
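
As a worked example of the per-dp_rank slicing above, assuming (as the diff implies but does not spell out) that self.rank_offset equals dp_rank times the number of workers each dp_rank process owns; the cluster shape below is illustrative only.

# Illustrative shape: 2 nodes x 4 devices, dp=2, so each dp_rank owns 4 global ranks.
worker_ips = ['10.0.0.1'] * 4 + ['10.0.0.2'] * 4  # from the rank table, 8 ranks total
rank_mapping = {r: r % 4 for r in range(8)}       # global rank -> local device id

workers_per_dp_rank = 4
for dp_rank in range(2):
    rank_start = dp_rank * workers_per_dp_rank    # plays the role of self.rank_offset
    rank_end = rank_start + workers_per_dp_rank
    expected_worker_ips = worker_ips[rank_start:rank_end]
    devices = [rank_mapping[rank_start + idx] for idx in range(workers_per_dp_rank)]
    print(dp_rank, expected_worker_ips, devices)
# 0 ['10.0.0.1', '10.0.0.1', '10.0.0.1', '10.0.0.1'] [0, 1, 2, 3]
# 1 ['10.0.0.2', '10.0.0.2', '10.0.0.2', '10.0.0.2'] [0, 1, 2, 3]
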

lmdeploy/pytorch/ray.py

Lines changed: 4 additions & 5 deletions
@@ -126,11 +126,10 @@ def init_ray_cluster(world_size: int, ray_address: str = None, dp: int = 1, devi
     # Create a new placement group
     placement_group_specs: List[Dict[str, float]] = ([{device_str: 1.0} for _ in range(world_size)])

-    gcs_addr = ray.get_runtime_context().gcs_address
-    master_addr = gcs_addr.split(':')[0]
-    current_ip = master_addr
-    # This way, at least bundle is required to be created in a current
-    # node.
+    # Pin at least one bundle to the local node.
+    # This helps multi-node DP keep each dp_rank process's workers co-located with
+    # the node where the process is launched.
+    current_ip = ray.util.get_node_ip_address()
     placement_group_specs[0][f'node:{current_ip}'] = 0.001

     # By default, Ray packs resources as much as possible.
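
A minimal standalone sketch of the node-pinning trick used here, assuming a plain Ray cluster and substituting Ray's built-in GPU resource for the repo's device_str; the bundle count is illustrative.

import ray
from ray.util.placement_group import placement_group

ray.init()

# Every Ray node exports a `node:<ip>` resource, so requesting a sliver of it
# in the first bundle forces that bundle onto the local node.
current_ip = ray.util.get_node_ip_address()
bundles = [{'GPU': 1.0} for _ in range(4)]
bundles[0][f'node:{current_ip}'] = 0.001

pg = placement_group(bundles, strategy='PACK')
ray.get(pg.ready())
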
