diff --git a/paddlenlp/data/dist_dataloader.py b/paddlenlp/data/dist_dataloader.py
index 0f125f4b76c1..3bf5c1dd72e3 100644
--- a/paddlenlp/data/dist_dataloader.py
+++ b/paddlenlp/data/dist_dataloader.py
@@ -197,7 +197,12 @@ def __next__(self):
                 data = nested_copy_place(data, place=paddle.framework._current_expected_place())
             except Exception as e:
                 logger.debug(e)
 
-        data = self._broadcast_data(data)
+        # data = self._broadcast_data(data)
+        if data is None:
+            data = {
+                "input_ids": paddle.empty([1, 4097], paddle.int64),
+                "labels": paddle.empty([1, 4097], paddle.int64),
+            }
         return data
 
diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index 663932720a51..74b7aee70bcf 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -156,7 +156,8 @@
 )
 from .training_args import TrainingArguments
 from .unified_checkpoint import UnifiedCheckpointHandler
-from .utils import reshard as reshard_util
+
+# from .utils import reshard as reshard_util
 from .utils.async_save import AsyncSaver
 from .utils.helper import (  # nested_truncate,
     broadcast_dataset_rank0_model,
@@ -1185,10 +1186,10 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
                         fused_allreduce_gradients_no_sync(list(model.parameters()), None)
 
                     # Pipeline parallel mode, handle gradient reduce here to overlap
-                    enable_dp_comm_overlap = (
-                        self.args.pipeline_parallel_degree > 1
-                        and "enable_dp_comm_overlap" in args.pipeline_parallel_config
-                    )
+                    # enable_dp_comm_overlap = (
+                    #     self.args.pipeline_parallel_degree > 1
+                    #     and "enable_dp_comm_overlap" in args.pipeline_parallel_config
+                    # )
 
                     enable_release_grads = False
                     if args.sharding_parallel_degree > 1:
@@ -1200,13 +1201,13 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
                     if isinstance(self.optimizer, HybridParallelOptimizer) and not self.do_grad_scaling:
                         parameters_list = _obtain_optimizer_parameters_list(self.optimizer._inner_opt)
 
-                        if not enable_dp_comm_overlap:
-                            if self.optimizer._sharding_enable:
-                                assert reshard_util.is_sharding_opt(self.optimizer)
-                                self.optimizer._inner_opt.reduce_gradients(list(parameters_list), self.optimizer._hcg)
+                        # if not enable_dp_comm_overlap:
+                        #     if self.optimizer._sharding_enable:
+                        #         assert reshard_util.is_sharding_opt(self.optimizer)
+                        #         self.optimizer._inner_opt.reduce_gradients(list(parameters_list), self.optimizer._hcg)
 
-                            if self.optimizer._dp_enable or getattr(self.optimizer, "_sep_enable", False):
-                                fused_allreduce_gradients_no_sync(list(parameters_list), self.optimizer._hcg)
+                        #     if self.optimizer._dp_enable or getattr(self.optimizer, "_sep_enable", False):
+                        #         fused_allreduce_gradients_no_sync(list(parameters_list), self.optimizer._hcg)
 
                     self.timers and self.timers("all-reduce").stop()
                     self.timers and self.timers("optimizer-step").start()
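
Note on the dist_dataloader.py hunk: it swaps the collective _broadcast_data call for a locally built placeholder batch, so a rank that ends up with no data fabricates uninitialized [1, 4097] int64 "input_ids"/"labels" tensors instead of waiting on a broadcast. The sketch below shows that fallback pattern in isolation; only the key names, shape, and dtype come from the diff, while the function name and the bare iterator handling are illustrative.

import paddle


def next_batch_or_placeholder(dataloader_iter):
    """Return the next batch, or a locally fabricated dummy batch.

    Debug-only shortcut: it skips the cross-rank broadcast entirely, and
    paddle.empty leaves the tensors uninitialized, so the result is only
    meaningful for timing/throughput experiments.
    """
    data = None
    try:
        data = next(dataloader_iter)
    except Exception:
        pass
    if data is None:
        data = {
            "input_ids": paddle.empty([1, 4097], paddle.int64),
            "labels": paddle.empty([1, 4097], paddle.int64),
        }
    return data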
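
Note on the trainer.py hunks: they drop the reshard_util import, comment out the enable_dp_comm_overlap check, and comment out the manual sharding/data-parallel gradient reduction that normally runs when that overlap is disabled, so gradient synchronization in this hybrid-parallel path is skipped. For reference, the condition being commented out is only True when pipeline parallelism is enabled and the flag string appears in pipeline_parallel_config; a standalone sketch with illustrative values (the concrete degree and config string below are assumptions, not taken from the diff):

# Illustrative stand-ins for self.args.pipeline_parallel_degree and
# args.pipeline_parallel_config from the diff above.
pipeline_parallel_degree = 2
pipeline_parallel_config = "enable_dp_comm_overlap enable_sharding_comm_overlap"

enable_dp_comm_overlap = (
    pipeline_parallel_degree > 1
    and "enable_dp_comm_overlap" in pipeline_parallel_config
)
print(enable_dp_comm_overlap)  # prints: True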