
Commit eeabe19

root committed: run auto-parallel
1 parent 35783bb, commit eeabe19

12 files changed, +149 −138 lines changed


deepmd/pd/loss/ener.py

Lines changed: 9 additions & 2 deletions

@@ -21,6 +21,7 @@
 from deepmd.utils.version import (
     check_version_compatibility,
 )
+import paddle.distributed as dist
 
 
 def custom_huber_loss(predictions, targets, delta=1.0):

@@ -205,7 +206,11 @@ def forward(self, input_dict, model, label, natoms, learning_rate, mae=False):
             find_energy = label.get("find_energy", 0.0)
             pref_e = pref_e * find_energy
             if not self.use_l1_all:
-                l2_ener_loss = paddle.mean(paddle.square(energy_pred - energy_label))
+
+                tmp = energy_pred - energy_label
+                logit = dist.reshard(tmp, tmp.process_mesh, [dist.Replicate()])
+
+                l2_ener_loss = paddle.mean(paddle.square(logit))
                 if not self.inference:
                     more_loss["l2_ener_loss"] = self.display_if_exist(
                         l2_ener_loss.detach(), find_energy

@@ -258,7 +263,8 @@ def forward(self, input_dict, model, label, natoms, learning_rate, mae=False):
             force_pred = model_pred["force"]
             force_label = label["force"]
             diff_f = (force_label - force_pred).reshape([-1])
-
+            diff_f = dist.reshard(diff_f, diff_f.process_mesh, [dist.Replicate()])
+
             if self.relative_f is not None:
                 force_label_3 = force_label.reshape([-1, 3])
                 norm_f = force_label_3.norm(axis=1, keepdim=True) + self.relative_f

@@ -354,6 +360,7 @@ def forward(self, input_dict, model, label, natoms, learning_rate, mae=False):
             find_virial = label.get("find_virial", 0.0)
             pref_v = pref_v * find_virial
             diff_v = label["virial"] - model_pred["virial"].reshape([-1, 9])
+            diff_v = dist.reshard(diff_v, diff_v.process_mesh, [dist.Replicate()])
             l2_virial_loss = paddle.mean(paddle.square(diff_v))
             if not self.inference:
                 more_loss["l2_virial_loss"] = self.display_if_exist(
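All three hunks apply the same pattern: once the batch dimension is sharded across the auto-parallel mesh, the element-wise difference is resharded to a replicated placement before paddle.mean, so every rank reduces over the full batch rather than only its local shard. The sketch below illustrates that pattern; the mesh, shapes, and launch command are illustrative assumptions rather than part of this commit, and it needs a multi-card paddle.distributed launch to run.

# Minimal sketch of the reshard-before-reduce pattern (illustrative, not from this
# commit). Launch with e.g.: python -m paddle.distributed.launch --devices=0,1 demo.py
import paddle
import paddle.distributed as dist

mesh = dist.ProcessMesh([0, 1], dim_names=["dp"])  # assumed 2-card mesh
pred = dist.shard_tensor(paddle.randn([8]), mesh, [dist.Shard(0)])   # batch-sharded
label = dist.shard_tensor(paddle.randn([8]), mesh, [dist.Shard(0)])

diff = pred - label  # still sharded along dim 0
diff = dist.reshard(diff, diff.process_mesh, [dist.Replicate()])  # replicate across the mesh
loss = paddle.mean(paddle.square(diff))  # the same scalar on every rank
print(loss)

Reading the mesh from diff.process_mesh, as the commit does, avoids hard-coding it, since a distributed tensor already carries its own mesh.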

deepmd/pd/train/training.py

Lines changed: 117 additions & 113 deletions

@@ -164,17 +164,14 @@ def get_opt_param(params):
 
 def get_data_loader(_training_data, _validation_data, _training_params):
     def get_dataloader_and_buffer(_data, _params):
-        _sampler = get_sampler_from_params(_data, _params)
-        if _sampler is None:
-            log.warning(
-                "Sampler not specified!"
-            )  # None sampler will lead to a premature stop iteration. Replacement should be True in attribute of the sampler to produce expected number of items in one iteration.
+        # _sampler = get_sampler_from_params(_data, _params)
+        # if _sampler is None:
+        #     log.warning(
+        #         "Sampler not specified!"
+        #     )  # None sampler will lead to a premature stop iteration. Replacement should be True in attribute of the sampler to produce expected number of items in one iteration.
         _dataloader = DataLoader(
             _data,
-            batch_sampler=paddle.io.BatchSampler(
-                sampler=_sampler,
-                drop_last=False,
-            ),
+            batch_size=1,
             num_workers=NUM_WORKERS
             if dist.is_available()
             else 0,  # setting to 0 diverges the behavior of its iterator; should be >=1

@@ -325,17 +322,18 @@ def get_lr(lr_params):
                 self.validation_data,
                 self.valid_numb_batch,
             ) = get_data_loader(training_data, validation_data, training_params)
-            training_data.print_summary(
-                "training",
-                to_numpy_array(self.training_dataloader.batch_sampler.sampler.weights),
-            )
-            if validation_data is not None:
-                validation_data.print_summary(
-                    "validation",
-                    to_numpy_array(
-                        self.validation_dataloader.batch_sampler.sampler.weights
-                    ),
-                )
+            # no sampler, do not need print!
+            # training_data.print_summary(
+            #     "training",
+            #     to_numpy_array(self.training_dataloader.batch_sampler.sampler.weights),
+            # )
+            # if validation_data is not None:
+            #     validation_data.print_summary(
+            #         "validation",
+            #         to_numpy_array(
+            #             self.validation_dataloader.batch_sampler.sampler.weights
+            #         ),
+            #     )
         else:
             (
                 self.training_dataloader,

@@ -370,27 +368,27 @@ def get_lr(lr_params):
                     validation_data[model_key],
                     training_params["data_dict"][model_key],
                 )
-
-                training_data[model_key].print_summary(
-                    f"training in {model_key}",
-                    to_numpy_array(
-                        self.training_dataloader[
-                            model_key
-                        ].batch_sampler.sampler.weights
-                    ),
-                )
-                if (
-                    validation_data is not None
-                    and validation_data[model_key] is not None
-                ):
-                    validation_data[model_key].print_summary(
-                        f"validation in {model_key}",
-                        to_numpy_array(
-                            self.validation_dataloader[
-                                model_key
-                            ].batch_sampler.sampler.weights
-                        ),
-                    )
+                # no sampler, do not need print!
+                # training_data[model_key].print_summary(
+                #     f"training in {model_key}",
+                #     to_numpy_array(
+                #         self.training_dataloader[
+                #             model_key
+                #         ].batch_sampler.sampler.weights
+                #     ),
+                # )
+                # if (
+                #     validation_data is not None
+                #     and validation_data[model_key] is not None
+                # ):
+                #     validation_data[model_key].print_summary(
+                #         f"validation in {model_key}",
+                #         to_numpy_array(
+                #             self.validation_dataloader[
+                #                 model_key
+                #             ].batch_sampler.sampler.weights
+                #         ),
+                #     )
 
         # Learning rate
         self.warmup_steps = training_params.get("warmup_steps", 0)

@@ -706,7 +704,7 @@ def run(self) -> None:
             fout1 = open(record_file, mode="w", buffering=1)
         log.info("Start to train %d steps.", self.num_steps)
         if dist.is_available() and dist.is_initialized():
-            log.info(f"Rank: {dist.get_rank()}/{dist.get_world_size()}")
+            log.info(f"xxx Rank: {dist.get_rank()}/{dist.get_world_size()}")
         if self.enable_tensorboard:
             from tensorboardX import (
                 SummaryWriter,

@@ -755,50 +753,54 @@ def step(_step_id, task_key="Default") -> None:
                     if self.world_size > 1
                     else contextlib.nullcontext
                 )
+
+                # with nvprof_context(enable_profiling, "Forward pass"):
+                log_dict = {}
+
+                input_dict = {
+                    "spin": None,
+                    "fparam": None,
+                    "aparam": None,
+                }
+                label_dict = {
+                    "find_box": 1.0,
+                    "find_coord": 1.0,
+                    "find_numb_copy": 0.0,
+                    "find_energy": 1.0,
+                    "find_force": 1.0,
+                    "find_virial": 0.0,
+                }
+                for k in ["atype", "box", "coord"]:
+                    input_dict[k] = paddle.load(f"./input_{k}.pd")
+                for k in ["energy", "force", "natoms", "numb_copy", "virial"]:
+                    label_dict[k] = paddle.load(f"./label_{k}.pd")
+
+                for __key in ('coord', 'atype', 'box'):
+                    input_dict[__key] = dist.shard_tensor(input_dict[__key], mesh=dist.get_mesh(), placements=[dist.Shard(0)])
+                for __key, _ in label_dict.items():
+                    if isinstance(label_dict[__key], paddle.Tensor):
+                        label_dict[__key] = dist.shard_tensor(label_dict[__key], mesh=dist.get_mesh(), placements=[dist.Shard(0)])
 
-                # with sync_context():
-                #     with nvprof_context(enable_profiling, "Forward pass"):
-                #         model_pred, loss, more_loss = self.wrapper(
-                #             **input_dict,
-                #             cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION),
-                #             label=label_dict,
-                #             task_key=task_key,
-                #         )
-
-                #     with nvprof_context(enable_profiling, "Backward pass"):
-                #         loss.backward()
-
-                #     if self.world_size > 1:
-                #         # fuse + allreduce manually before optimization if use DDP + no_sync
-                #         # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622
-                #         hpu.fused_allreduce_gradients(list(self.wrapper.parameters()), None)
-
-                with nvprof_context(enable_profiling, "Forward pass"):
-                    for __key in ('coord', 'atype', 'box'):
-                        input_dict[__key] = dist.shard_tensor(input_dict[__key], mesh=dist.get_mesh(), placements=[dist.Shard(0)])
-                    for __key, _ in label_dict.items():
-                        if isinstance(label_dict[__key], paddle.Tensor):
-                            label_dict[__key] = dist.shard_tensor(label_dict[__key], mesh=dist.get_mesh(), placements=[dist.Shard(0)])
-                    model_pred, loss, more_loss = self.wrapper(
-                        **input_dict,
-                        cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION),
-                        label=label_dict,
-                        task_key=task_key,
-                    )
+                model_pred, loss, more_loss = self.wrapper(
+                    **input_dict,
+                    cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION),
+                    label=label_dict,
+                    task_key=task_key,
+                )
 
-                with nvprof_context(enable_profiling, "Backward pass"):
-                    loss.backward()
+                # with nvprof_context(enable_profiling, "Backward pass"):
+                loss.backward()
 
                 if self.gradient_max_norm > 0.0:
-                    with nvprof_context(enable_profiling, "Gradient clip"):
-                        paddle.nn.utils.clip_grad_norm_(
-                            self.wrapper.parameters(),
-                            self.gradient_max_norm,
-                            error_if_nonfinite=True,
-                        )
+                    # with nvprof_context(enable_profiling, "Gradient clip"):
+                    paddle.nn.utils.clip_grad_norm_(
+                        self.wrapper.parameters(),
+                        self.gradient_max_norm,
+                        error_if_nonfinite=True,
+                    )
 
-                with nvprof_context(enable_profiling, "Adam update"):
-                    self.optimizer.step()
+                # with nvprof_context(enable_profiling, "Adam update"):
+                self.optimizer.step()
                 self.scheduler.step()
 
             else:

@@ -856,7 +858,9 @@ def log_loss_valid(_task_key="Default"):
 
             if not self.multi_task:
                 train_results = log_loss_train(loss, more_loss)
-                valid_results = log_loss_valid()
+                # valid_results = log_loss_valid()
+                # no run valid!
+                valid_results = None
                 if self.rank == 0:
                     log.info(
                         format_training_message_per_task(

@@ -938,39 +942,39 @@ def log_loss_valid(_task_key="Default"):
             ):
                 self.total_train_time += train_time
 
-            if fout:
-                if self.lcurve_should_print_header:
-                    self.print_header(fout, train_results, valid_results)
-                    self.lcurve_should_print_header = False
-                self.print_on_training(
-                    fout, display_step_id, cur_lr, train_results, valid_results
-                )
-
-            if (
-                ((_step_id + 1) % self.save_freq == 0 and _step_id != self.start_step)
-                or (_step_id + 1) == self.num_steps
-            ) and (self.rank == 0 or dist.get_rank() == 0):
-                # Handle the case if rank 0 aborted and re-assigned
-                self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pd")
-                self.save_model(self.latest_model, lr=cur_lr, step=_step_id)
-                log.info(f"Saved model to {self.latest_model}")
-                symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
-                with open("checkpoint", "w") as f:
-                    f.write(str(self.latest_model))
+            # if fout:
+            #     if self.lcurve_should_print_header:
+            #         self.print_header(fout, train_results, valid_results)
+            #         self.lcurve_should_print_header = False
+            #     self.print_on_training(
+            #         fout, display_step_id, cur_lr, train_results, valid_results
+            #     )
+
+            # if (
+            #     ((_step_id + 1) % self.save_freq == 0 and _step_id != self.start_step)
+            #     or (_step_id + 1) == self.num_steps
+            # ) and (self.rank == 0 or dist.get_rank() == 0):
+            #     # Handle the case if rank 0 aborted and re-assigned
+            #     self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pd")
+            #     self.save_model(self.latest_model, lr=cur_lr, step=_step_id)
+            #     log.info(f"Saved model to {self.latest_model}")
+            #     symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
+            #     with open("checkpoint", "w") as f:
+            #         f.write(str(self.latest_model))
 
             # tensorboard
-            if self.enable_tensorboard and (
-                display_step_id % self.tensorboard_freq == 0 or display_step_id == 1
-            ):
-                writer.add_scalar(f"{task_key}/lr", cur_lr, display_step_id)
-                writer.add_scalar(f"{task_key}/loss", loss.item(), display_step_id)
-                for item in more_loss:
-                    writer.add_scalar(
-                        f"{task_key}/{item}", more_loss[item].item(), display_step_id
-                    )
-
-            if enable_profiling:
-                core.nvprof_nvtx_pop()
+            # if self.enable_tensorboard and (
+            #     display_step_id % self.tensorboard_freq == 0 or display_step_id == 1
+            # ):
+            #     writer.add_scalar(f"{task_key}/lr", cur_lr, display_step_id)
+            #     writer.add_scalar(f"{task_key}/loss", loss.item(), display_step_id)
+            #     for item in more_loss:
+            #         writer.add_scalar(
+            #             f"{task_key}/{item}", more_loss[item].item(), display_step_id
+            #         )
+
+            # if enable_profiling:
+            #     core.nvprof_nvtx_pop()
 
         self.wrapper.train()
         self.t0 = time.time()
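The rewritten step() above replaces the sampled batch with tensors loaded from ./input_*.pd and ./label_*.pd and shards every tensor along dim 0 with dist.shard_tensor on the mesh returned by dist.get_mesh(), while validation, checkpointing, and tensorboard logging are commented out. A small sketch of that batch-sharding step follows; the shard_batch helper name and the explicit two-card mesh are assumptions made for illustration (the commit itself relies on a globally registered mesh via dist.get_mesh()).

# Hypothetical helper mirroring the batch-sharding loop in step(); the function
# name and the explicit mesh are illustrative assumptions, not part of the commit.
import paddle
import paddle.distributed as dist

def shard_batch(batch: dict, mesh: dist.ProcessMesh) -> dict:
    # Split every tensor in the batch along dim 0 across the data-parallel mesh;
    # non-tensor entries (None placeholders, flag scalars) are left untouched.
    for key, value in batch.items():
        if isinstance(value, paddle.Tensor):
            batch[key] = dist.shard_tensor(value, mesh=mesh, placements=[dist.Shard(0)])
    return batch

mesh = dist.ProcessMesh([0, 1], dim_names=["dp"])  # assumed 2-card mesh
# usage: input_dict = shard_batch(input_dict, mesh); label_dict = shard_batch(label_dict, mesh)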

deepmd/pd/utils/dataloader.py

Lines changed: 21 additions & 21 deletions

@@ -168,30 +168,30 @@ def construct_dataset(system):
             self.batch_sizes = batch_size * np.ones(len(systems), dtype=int)
         assert len(self.systems) == len(self.batch_sizes)
         for system, batch_size in zip(self.systems, self.batch_sizes):
-            if dist.is_available() and dist.is_initialized():
-                system_batch_sampler = DistributedBatchSampler(
-                    system,
-                    shuffle=(
-                        (not (dist.is_available() and dist.is_initialized()))
-                        and shuffle
-                    ),
-                    batch_size=int(batch_size),
-                )
-                self.sampler_list.append(system_batch_sampler)
-            else:
-                system_batch_sampler = BatchSampler(
-                    system,
-                    shuffle=(
-                        (not (dist.is_available() and dist.is_initialized()))
-                        and shuffle
-                    ),
-                    batch_size=int(batch_size),
-                )
-                self.sampler_list.append(system_batch_sampler)
+            # if dist.is_available() and dist.is_initialized():
+            #     system_batch_sampler = DistributedBatchSampler(
+            #         system,
+            #         shuffle=(
+            #             (not (dist.is_available() and dist.is_initialized()))
+            #             and shuffle
+            #         ),
+            #         batch_size=int(batch_size),
+            #     )
+            #     self.sampler_list.append(system_batch_sampler)
+            # else:
+            #     system_batch_sampler = BatchSampler(
+            #         system,
+            #         shuffle=(
+            #             (not (dist.is_available() and dist.is_initialized()))
+            #             and shuffle
+            #         ),
+            #         batch_size=int(batch_size),
+            #     )
+            #     self.sampler_list.append(system_batch_sampler)
             system_dataloader = DataLoader(
                 dataset=system,
                 num_workers=0,  # Should be 0 to avoid too many threads forked
-                batch_sampler=system_batch_sampler,
+                batch_size=int(batch_size),
                 collate_fn=collate_batch,
                 use_buffer_reader=False,
                 places=["cpu"],
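With the sampler construction commented out, each per-system DataLoader is driven by batch_size alone, so Paddle builds its default sequential batch sampler internally and neither shuffling nor per-rank splitting happens at the loader level any more. A self-contained toy sketch of this batch_size-only construction, with a dataset class invented purely for illustration:

# Toy illustration of a batch_size-driven paddle.io.DataLoader (the dataset is
# made up for this example and is not part of the commit).
import paddle
from paddle.io import DataLoader, Dataset

class ToyDataset(Dataset):
    def __init__(self, n: int = 8):
        self.data = paddle.arange(n, dtype="float32").reshape([n, 1])

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)

# batch_size alone -> default sequential BatchSampler; no shuffle, no DistributedBatchSampler
loader = DataLoader(ToyDataset(), batch_size=2, num_workers=0)
for batch in loader:
    tensor = batch[0] if isinstance(batch, (list, tuple)) else batch
    print(tensor.shape)  # [2, 1]; batches arrive in dataset order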

examples/water/dpa3/input_atype.pd: 24.2 KB, binary file not shown.

examples/water/dpa3/input_box.pd: 2.44 KB, binary file not shown.

examples/water/dpa3/input_coord.pd: 144 KB, binary file not shown.

examples/water/dpa3/input_torch.json

Lines changed: 2 additions & 2 deletions

@@ -75,14 +75,14 @@
             "../data/data_1",
             "../data/data_2"
         ],
-        "batch_size": 1,
+        "batch_size": 32,
         "_comment": "that's all"
     },
     "validation_data": {
         "systems": [
             "../data/data_3"
         ],
-        "batch_size": 1,
+        "batch_size": 32,
         "_comment": "that's all"
     },
     "numb_steps": 2000,
453 Bytes, binary file not shown.

examples/water/dpa3/label_force.pd: 144 KB, binary file not shown.

709 Bytes, binary file not shown.
