
Commit ba04e38

[LLM] add token metrics after regular & save step

Parent: 2150ae3

File tree: 4 files changed, +18 -1 lines

paddlenlp/trainer/trainer.py
paddlenlp/trainer/trainer_callback.py
paddlenlp/trainer/trainer_utils.py
tests/trainer/test_trainer_callback.py

paddlenlp/trainer/trainer.py

Lines changed: 14 additions & 0 deletions
@@ -868,6 +868,10 @@ def _inner_training_loop(
                 paddle.sum(paddle.stack(global_step_list) - global_step_list[0]) == 0
             ), f"Error, get different globel step, please check! step list: {[x.item() for x in global_step_list]}"
 
+            # compatibility for older version
+            if self.state.last_saved_step == 0:
+                self.state.last_saved_step = self.state.global_step
+
             epochs_trained = self.state.global_step // num_update_steps_per_epoch
             if not args.ignore_data_skip:
                 steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
@@ -1366,6 +1370,16 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
                 )
             )
 
+            if self.control.should_save:
+                trained_steps = self.state.global_step - self.state.last_saved_step
+                tokens_trained_this_ckpt_interval_in_billion = round(
+                    trained_steps * seq_length * total_train_batch_size / (10**9), 4
+                )
+                logs.update(
+                    {"tokens_trained_this_ckpt_interval_in_billion": tokens_trained_this_ckpt_interval_in_billion}
+                )
+                self.state.last_saved_step = self.state.global_step
+
             self._total_loss_scalar += tr_loss_scalar
             self._globalstep_last_logged = self.state.global_step
             self._globalstep_last_start_time = time.time()
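
For intuition, the new block multiplies steps-since-last-save by tokens per optimizer step. A worked example with purely hypothetical numbers (none of these values come from the commit):

    # Illustrative only: hypothetical training configuration.
    trained_steps = 500            # global_step - last_saved_step, e.g. save_steps
    seq_length = 4096              # tokens per sequence
    total_train_batch_size = 512   # sequences per optimizer step, all devices combined
    tokens_b = round(trained_steps * seq_length * total_train_batch_size / (10**9), 4)
    print(tokens_b)                # 1.0486

Since last_saved_step is reset to global_step inside the should_save branch, the metric reports tokens consumed since the previous checkpoint rather than a running total.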

paddlenlp/trainer/trainer_callback.py

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ class TrainerState:
     is_world_process_zero: bool = True
     trial_name: str = None
     trial_params: Dict[str, Union[str, float, int, bool]] = None
+    last_saved_step: int = 0
 
     def __post_init__(self):
         if self.log_history is None:
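
Because last_saved_step is a dataclass field with a default of 0, a trainer_state.json written by an older PaddleNLP version loads cleanly; the compatibility shim in _inner_training_loop above then seeds the field from the resume step. A minimal sketch of that interaction (the resume step of 868 is hypothetical):

    # Old checkpoint predates the field, so it falls back to the default 0.
    state = TrainerState(global_step=868)
    if state.last_saved_step == 0:                 # the shim from trainer.py above
        state.last_saved_step = state.global_step  # count from the resume point, not step 0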

paddlenlp/trainer/trainer_utils.py

Lines changed: 2 additions & 0 deletions
@@ -365,6 +365,8 @@ def speed_metrics(split, start_time, num_samples=None, num_steps=None, seq_length
         if seq_length is not None:
             tokens_per_second_per_device = samples_per_second * seq_length / paddle.distributed.get_world_size()
             result[f"{split}_tokens_per_second_per_device"] = round(tokens_per_second_per_device, 4)
+            tokens_trained_this_step_in_billion = num_samples * seq_length / (10**9)
+            result[f"{split}_tokens_trained_this_step_in_billion"] = round(tokens_trained_this_step_in_billion, 4)
     if num_steps is not None:
         steps_per_second = num_steps / runtime
         result[f"{split}_steps_per_second"] = round(steps_per_second, 4)

tests/trainer/test_trainer_callback.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def test_event_flow(self):
         # A bit of everything
         trainer = self.get_trainer(
             callbacks=[MyTestTrainerCallback],
-            logging_steps=3,
+            logging_steps=2,
             save_steps=10,
             eval_steps=5,
             evaluation_strategy="steps",
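
With save_steps=10, lowering logging_steps from 3 to 2 makes every save step also a logging step (10 is a multiple of 2 but not of 3), presumably so the event-flow test exercises the new save-time token metric added in _maybe_log_save_evaluate.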
