@@ -63,7 +63,7 @@ def _find_fist_opt_user(self, main_program):
     def _get_comm_group(self, ranks=[]):
         ranks = sorted(ranks)
         if tuple(ranks) in self.comm_group:
-            return self.comm_group[tuple(ranks)].id
+            return self.comm_group[tuple(ranks)]
         # The communication group of this `all_reduce` op satisfies len(ranks)==2.
         # When `force_new_group=False` is set, the `send&recv` group will be returned,
         # At this point, `all_reduce` and `send&recv` share the same group, and
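
Note on the first hunk: returning the cached group object (rather than its .id) lets callers pass the group straight into later collective calls. The snippet below is a minimal, self-contained sketch of that caching pattern; FakeGroup and CommGroupCache are hypothetical stand-ins for illustration, not the classes this pass actually uses.

class FakeGroup:
    # Stand-in for the framework's process-group type.
    def __init__(self, gid, ranks):
        self.id = gid
        self.ranks = ranks

class CommGroupCache:
    def __init__(self):
        self.comm_group = {}
        self._next_id = 0

    def get_comm_group(self, ranks=()):
        key = tuple(sorted(ranks))
        if key in self.comm_group:
            return self.comm_group[key]  # hand back the group object, not its id
        group = FakeGroup(self._next_id, key)
        self._next_id += 1
        self.comm_group[key] = group
        return group

cache = CommGroupCache()
g1 = cache.get_comm_group([3, 1])
g2 = cache.get_comm_group([1, 3])
assert g1 is g2 and g1.ranks == (1, 3)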
@@ -205,6 +205,14 @@ def sync_shared_parameters(self, main_program, startup_program):
             logger.info("No parameter need to share, skip pass.")
             return []
 
+        # Must initialize the redundant communication group for the allreduce op here.
+        # Otherwise, it will hang during gradient synchronization.
+        for idx in range(len(self.src_ranks)):
+            rank_1 = self.src_ranks[idx]
+            rank_2 = self.dst_ranks[idx]
+            new_process_group(sorted([rank_1, rank_2]))
+            self._get_comm_group([rank_1, rank_2])
+
         return new_shared_params
 
     def sync_shared_parameter_gradient(
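
Note on the second hunk: creating a communication group is itself a collective step, so every rank must pre-create the redundant pair groups here, even ranks that are not members; otherwise the member ranks can block later, as the added comment warns. The sketch below illustrates the same idea with the public paddle.distributed.new_group API rather than the pass's internal new_process_group helper; the four-rank layout and the src/dst pairing are assumptions for illustration, and the script is meant to be launched with paddle.distributed.launch across four processes.

import paddle
import paddle.distributed as dist

def build_pair_groups(src_ranks, dst_ranks):
    # Every rank runs the same new_group calls in the same order.
    # Ranks outside a given pair still take part in that group's creation,
    # which is what avoids the hang described in the diff comment.
    groups = {}
    for src, dst in zip(src_ranks, dst_ranks):
        ranks = sorted([src, dst])
        groups[tuple(ranks)] = dist.new_group(ranks)
    return groups

if __name__ == "__main__":
    dist.init_parallel_env()
    pair_groups = build_pair_groups(src_ranks=[0, 1], dst_ranks=[2, 3])
    cur_rank = dist.get_rank()
    for ranks, group in pair_groups.items():
        if cur_rank in ranks:
            t = paddle.to_tensor([float(cur_rank)])
            # Only member ranks issue the all_reduce on the pair group.
            dist.all_reduce(t, group=group)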
@@ -228,6 +236,9 @@ def sync_shared_parameter_gradient(
 
         cur_rank = paddle.distributed.get_rank()
 
+        if cur_rank not in self.src_ranks and cur_rank not in self.dst_ranks:
+            return params_grads
+
         pre_name = ""
         if cur_rank in self.dst_ranks:
             pre_name = "shared_"
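
Note on the third hunk: a rank that appears in neither src_ranks nor dst_ranks owns no shared parameter, so the added guard returns params_grads unchanged before any group lookup happens. Below is a minimal sketch of that guard with hypothetical names (the stub function and the rank lists are illustrative, not the pass's actual code).

def sync_shared_parameter_gradient_stub(cur_rank, src_ranks, dst_ranks, params_grads):
    if cur_rank not in src_ranks and cur_rank not in dst_ranks:
        return params_grads  # uninvolved rank: nothing to synchronize

    pre_name = "shared_" if cur_rank in dst_ranks else ""
    # The real pass would look up `pre_name + param` gradients here and
    # all_reduce them over the pair group; omitted in this sketch.
    return params_grads

grads = [("w0", "w0@GRAD")]
assert sync_shared_parameter_gradient_stub(5, [0], [2], grads) is grads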