26 | 26 | from paddle.io import (
27 | 27 |     DataLoader,
28 | 28 | )
29 |    | -
   | 29 | +import paddle.distributed as dist
   | 30 | +from paddle.distributed import fleet
   | 31 | +import functools
30 | 32 | from deepmd.common import (
31 | 33 |     symlink_prefix_files,
32 | 34 | )
@@ -101,6 +103,11 @@ def __init__(
101 | 103 |         Args:
102 | 104 |         - config: The Dict-like configuration with training options.
103 | 105 |         """
    | 106 | +        from paddle.distributed import fleet
    | 107 | +        mesh_dims = [("dp", 32)]
    | 108 | +        fleet.auto.create_mesh(mesh_dims)
    | 109 | +        fleet.init(is_collective=True)
    | 110 | +
104 | 111 |         enable_prim(True)
105 | 112 |         if init_model is not None:
106 | 113 |             resume_model = init_model
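
The hunk above hardcodes a 32-way "dp" mesh, which presumably has to match the number of ranks started by `paddle.distributed.launch`. A minimal sketch of the same setup that derives the mesh size from the launched world size instead of the literal 32 (an assumed alternative for illustration, not what this commit does):

    import paddle.distributed as dist
    from paddle.distributed import fleet

    # assumed alternative to the hardcoded [("dp", 32)]: size the data-parallel
    # axis from the number of launched ranks so the mesh matches the job size
    world_size = dist.get_world_size()
    fleet.auto.create_mesh([("dp", world_size)])
    fleet.init(is_collective=True)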
@@ -748,22 +755,39 @@ def step(_step_id, task_key="Default") -> None:
748 | 755 |                 if self.world_size > 1
749 | 756 |                 else contextlib.nullcontext
750 | 757 |             )
751 |     | -            with sync_context():
752 |     | -                with nvprof_context(enable_profiling, "Forward pass"):
753 |     | -                    model_pred, loss, more_loss = self.wrapper(
754 |     | -                        **input_dict,
755 |     | -                        cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION),
756 |     | -                        label=label_dict,
757 |     | -                        task_key=task_key,
758 |     | -                    )
759 |     | -
760 |     | -                with nvprof_context(enable_profiling, "Backward pass"):
761 |     | -                    loss.backward()
    | 758 | +
    | 759 | +            # with sync_context():
    | 760 | +            #     with nvprof_context(enable_profiling, "Forward pass"):
    | 761 | +            #         model_pred, loss, more_loss = self.wrapper(
    | 762 | +            #             **input_dict,
    | 763 | +            #             cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION),
    | 764 | +            #             label=label_dict,
    | 765 | +            #             task_key=task_key,
    | 766 | +            #         )
    | 767 | +
    | 768 | +            #     with nvprof_context(enable_profiling, "Backward pass"):
    | 769 | +            #         loss.backward()
    | 770 | +
    | 771 | +            # if self.world_size > 1:
    | 772 | +            #     # fuse + allreduce manually before optimization if use DDP + no_sync
    | 773 | +            #     # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
    | 774 | +            #     hpu.fused_allreduce_gradients(list(self.wrapper.parameters()), None)
    | 775 | +
    | 776 | +            with nvprof_context(enable_profiling, "Forward pass"):
    | 777 | +                for __key in ('coord', 'atype', 'box'):
    | 778 | +                    input_dict[__key] = dist.shard_tensor(input_dict[__key], mesh=dist.get_mesh(), placements=[dist.Shard(0)])
    | 779 | +                for __key, _ in label_dict.items():
    | 780 | +                    if isinstance(label_dict[__key], paddle.Tensor):
    | 781 | +                        label_dict[__key] = dist.shard_tensor(label_dict[__key], mesh=dist.get_mesh(), placements=[dist.Shard(0)])
    | 782 | +                model_pred, loss, more_loss = self.wrapper(
    | 783 | +                    **input_dict,
    | 784 | +                    cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION),
    | 785 | +                    label=label_dict,
    | 786 | +                    task_key=task_key,
    | 787 | +                )
762 | 788 |
763 |     | -            if self.world_size > 1:
764 |     | -                # fuse + allreduce manually before optimization if use DDP + no_sync
765 |     | -                # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
766 |     | -                hpu.fused_allreduce_gradients(list(self.wrapper.parameters()), None)
    | 789 | +            with nvprof_context(enable_profiling, "Backward pass"):
    | 790 | +                loss.backward()
767 | 791 |
768 | 792 |             if self.gradient_max_norm > 0.0:
769 | 793 |                 with nvprof_context(enable_profiling, "Gradient clip"):
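
In the new forward pass, `dist.shard_tensor` with `dist.Shard(0)` marks the batch dimension of `coord`, `atype`, `box` and the tensor-valued labels as split across the "dp" mesh axis, so gradient synchronization is presumably handled by Paddle's auto-parallel engine and the manual `hpu.fused_allreduce_gradients` call from the commented-out DDP path is no longer needed. A small sketch of what `Shard(0)` means for one input tensor; the 2-rank mesh and shapes are assumptions for illustration, not values from this commit, and it is meant to run under `python -m paddle.distributed.launch` with two ranks:

    import paddle
    import paddle.distributed as dist

    # toy 1-D "dp" mesh over two ranks (assumption for illustration)
    mesh = dist.ProcessMesh([0, 1], dim_names=["dp"])

    # fake batch of coordinates, [batch=8, natoms*3=576]; shapes are illustrative only
    coord = paddle.randn([8, 576])

    # Shard(0) splits dim 0 across the "dp" axis: the global shape stays [8, 576],
    # but each of the two ranks only stores a [4, 576] slice of the data
    coord_dist = dist.shard_tensor(coord, mesh=mesh, placements=[dist.Shard(0)])
    print(coord_dist.shape)  # [8, 576] -- the logical (global) shape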