diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index 73715b9c8af1..2e2775d696b3 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -868,6 +868,10 @@ def _inner_training_loop(
                     paddle.sum(paddle.stack(global_step_list) - global_step_list[0]) == 0
                 ), f"Error, get different globel step, please check! step list: {[x.item() for x in global_step_list]}"
 
+            # compatibility for older version
+            if self.state.last_saved_step == 0:
+                self.state.last_saved_step = self.state.global_step
+
             epochs_trained = self.state.global_step // num_update_steps_per_epoch
             if not args.ignore_data_skip:
                 steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
@@ -1373,6 +1377,16 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
                 )
             )
 
+            if self.control.should_save:
+                trained_steps = self.state.global_step - self.state.last_saved_step
+                tokens_trained_this_ckpt_interval_in_billion = round(
+                    trained_steps * seq_length * total_train_batch_size / (10**9), 4
+                )
+                logs.update(
+                    {"tokens_trained_this_ckpt_interval_in_billion": tokens_trained_this_ckpt_interval_in_billion}
+                )
+                self.state.last_saved_step = self.state.global_step
+
             self._total_loss_scalar += tr_loss_scalar
             self._globalstep_last_logged = self.state.global_step
             self._globalstep_last_start_time = time.time()
diff --git a/paddlenlp/trainer/trainer_callback.py b/paddlenlp/trainer/trainer_callback.py
index b263c7930daf..e887282e0387 100644
--- a/paddlenlp/trainer/trainer_callback.py
+++ b/paddlenlp/trainer/trainer_callback.py
@@ -95,6 +95,7 @@ class TrainerState:
     is_world_process_zero: bool = True
     trial_name: str = None
     trial_params: Dict[str, Union[str, float, int, bool]] = None
+    last_saved_step: int = 0
 
     def __post_init__(self):
         if self.log_history is None:
diff --git a/paddlenlp/trainer/trainer_utils.py b/paddlenlp/trainer/trainer_utils.py
index 37ccd5b1f6c5..cff86474f80b 100644
--- a/paddlenlp/trainer/trainer_utils.py
+++ b/paddlenlp/trainer/trainer_utils.py
@@ -365,11 +365,12 @@ def speed_metrics(split, start_time, num_samples=None, num_steps=None, seq_lengt
     if seq_length is not None:
         tokens_per_second_per_device = samples_per_second * seq_length / paddle.distributed.get_world_size()
         result[f"{split}_tokens_per_second_per_device"] = round(tokens_per_second_per_device, 4)
+        tokens_trained_this_step_in_billion = num_samples * seq_length / (10**9)
+        result[f"{split}_tokens_trained_this_step_in_billion"] = round(tokens_trained_this_step_in_billion, 4)
     if model_flops is not None:
         result[f"{split}_hardware_tflops_per_device"] = round(
             tokens_per_second_per_device * model_flops / seq_length / 2**40, 2
         )
-
     if num_steps is not None:
         steps_per_second = num_steps / runtime
         result[f"{split}_steps_per_second"] = round(steps_per_second, 4)
diff --git a/tests/trainer/test_trainer_callback.py b/tests/trainer/test_trainer_callback.py
index 487af97951b3..7d9804ca9e8e 100644
--- a/tests/trainer/test_trainer_callback.py
+++ b/tests/trainer/test_trainer_callback.py
@@ -229,7 +229,7 @@ def test_event_flow(self):
         # A bit of everything
         trainer = self.get_trainer(
             callbacks=[MyTestTrainerCallback],
-            logging_steps=3,
+            logging_steps=2,
             save_steps=10,
             eval_steps=5,
             evaluation_strategy="steps",
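
For context, a minimal sketch (not part of the patch) of the arithmetic behind the two new metrics. The concrete numbers (seq_length, total_train_batch_size, the step counts, and logging_steps) are hypothetical, chosen only to make the formulas easy to check:

```python
# Hypothetical training configuration (assumed values, not from the patch).
seq_length = 4096                # tokens per sample
total_train_batch_size = 512     # global batch size (per-device batch * grad accum * data parallel)

# speed_metrics(): tokens processed in one logging interval, reported in billions.
logging_steps = 20
num_samples = total_train_batch_size * logging_steps
tokens_trained_this_step_in_billion = round(num_samples * seq_length / (10**9), 4)

# _maybe_log_save_evaluate(): tokens trained since the last checkpoint, using the
# new TrainerState.last_saved_step field that is updated on every save.
global_step, last_saved_step = 1000, 900
trained_steps = global_step - last_saved_step
tokens_trained_this_ckpt_interval_in_billion = round(
    trained_steps * seq_length * total_train_batch_size / (10**9), 4
)

print(tokens_trained_this_step_in_billion)           # 0.0419  (512 * 20 * 4096 / 1e9)
print(tokens_trained_this_ckpt_interval_in_billion)  # 0.2097  (100 * 512 * 4096 / 1e9)
```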