
Commit ba04e38

[LLM] add token metrics after regular & save step

Parent: 2150ae3

File tree: 4 files changed, +18 -1 lines

paddlenlp/trainer/trainer.py
paddlenlp/trainer/trainer_callback.py
paddlenlp/trainer/trainer_utils.py
tests/trainer/test_trainer_callback.py

paddlenlp/trainer/trainer.py

Lines changed: 14 additions & 0 deletions
@@ -868,6 +868,10 @@ def _inner_training_loop(
                 paddle.sum(paddle.stack(global_step_list) - global_step_list[0]) == 0
             ), f"Error, get different globel step, please check! step list: {[x.item() for x in global_step_list]}"
 
+            # compatibility for older version
+            if self.state.last_saved_step == 0:
+                self.state.last_saved_step = self.state.global_step
+
             epochs_trained = self.state.global_step // num_update_steps_per_epoch
             if not args.ignore_data_skip:
                 steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
@@ -1366,6 +1370,16 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
                 )
             )
 
+            if self.control.should_save:
+                trained_steps = self.state.global_step - self.state.last_saved_step
+                tokens_trained_this_ckpt_interval_in_billion = round(
+                    trained_steps * seq_length * total_train_batch_size / (10**9), 4
+                )
+                logs.update(
+                    {"tokens_trained_this_ckpt_interval_in_billion": tokens_trained_this_ckpt_interval_in_billion}
+                )
+                self.state.last_saved_step = self.state.global_step
+
             self._total_loss_scalar += tr_loss_scalar
             self._globalstep_last_logged = self.state.global_step
             self._globalstep_last_start_time = time.time()
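
For intuition, the new block multiplies steps-since-last-save by tokens per optimizer step. A worked example with purely hypothetical numbers (none of these values come from the commit):

    # Illustrative only: hypothetical training configuration.
    trained_steps = 500            # global_step - last_saved_step, e.g. save_steps
    seq_length = 4096              # tokens per sequence
    total_train_batch_size = 512   # sequences per optimizer step, all devices combined
    tokens_b = round(trained_steps * seq_length * total_train_batch_size / (10**9), 4)
    print(tokens_b)                # 1.0486

Since last_saved_step is reset to global_step inside the should_save branch, the metric reports tokens consumed since the previous checkpoint rather than a running total.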

paddlenlp/trainer/trainer_callback.py

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ class TrainerState:
     is_world_process_zero: bool = True
     trial_name: str = None
     trial_params: Dict[str, Union[str, float, int, bool]] = None
+    last_saved_step: int = 0
 
     def __post_init__(self):
         if self.log_history is None:
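
Because last_saved_step is a dataclass field with a default of 0, a trainer_state.json written by an older PaddleNLP version loads cleanly; the compatibility shim in _inner_training_loop above then seeds the field from the resume step. A minimal sketch of that interaction (the resume step of 868 is hypothetical):

    # Old checkpoint predates the field, so it falls back to the default 0.
    state = TrainerState(global_step=868)
    if state.last_saved_step == 0:                 # the shim from trainer.py above
        state.last_saved_step = state.global_step  # count from the resume point, not step 0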

paddlenlp/trainer/trainer_utils.py

Lines changed: 2 additions & 0 deletions
@@ -365,6 +365,8 @@ def speed_metrics(split, start_time, num_samples=None, num_steps=None, seq_length
         if seq_length is not None:
             tokens_per_second_per_device = samples_per_second * seq_length / paddle.distributed.get_world_size()
             result[f"{split}_tokens_per_second_per_device"] = round(tokens_per_second_per_device, 4)
+            tokens_trained_this_step_in_billion = num_samples * seq_length / (10**9)
+            result[f"{split}_tokens_trained_this_step_in_billion"] = round(tokens_trained_this_step_in_billion, 4)
     if num_steps is not None:
         steps_per_second = num_steps / runtime
         result[f"{split}_steps_per_second"] = round(steps_per_second, 4)

tests/trainer/test_trainer_callback.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def test_event_flow(self):
         # A bit of everything
         trainer = self.get_trainer(
             callbacks=[MyTestTrainerCallback],
-            logging_steps=3,
+            logging_steps=2,
             save_steps=10,
             eval_steps=5,
             evaluation_strategy="steps",
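
With save_steps=10, lowering logging_steps from 3 to 2 makes every save step also a logging step (10 is a multiple of 2 but not of 3), presumably so the event-flow test exercises the new save-time token metric added in _maybe_log_save_evaluate.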
