14 changes: 14 additions & 0 deletions paddlenlp/trainer/trainer.py
@@ -868,6 +868,10 @@ def _inner_training_loop(
                paddle.sum(paddle.stack(global_step_list) - global_step_list[0]) == 0
            ), f"Error, got different global steps, please check! step list: {[x.item() for x in global_step_list]}"

+            # compatibility with older versions
+            if self.state.last_saved_step == 0:
+                self.state.last_saved_step = self.state.global_step
+
        epochs_trained = self.state.global_step // num_update_steps_per_epoch
        if not args.ignore_data_skip:
            steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
@@ -1373,6 +1377,16 @@ def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval,
                )
            )

+            if self.control.should_save:
+                trained_steps = self.state.global_step - self.state.last_saved_step
+                tokens_trained_this_ckpt_interval_in_billion = round(
+                    trained_steps * seq_length * total_train_batch_size / (10**9), 4
+                )
+                logs.update(
+                    {"tokens_trained_this_ckpt_interval_in_billion": tokens_trained_this_ckpt_interval_in_billion}
+                )
+                self.state.last_saved_step = self.state.global_step
+
            self._total_loss_scalar += tr_loss_scalar
            self._globalstep_last_logged = self.state.global_step
            self._globalstep_last_start_time = time.time()
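For reference, a minimal standalone sketch of the checkpoint-interval token accounting added above; the step counts, sequence length, and batch size are illustrative values, not taken from any real run.

def tokens_this_ckpt_interval_in_billion(global_step, last_saved_step, seq_length, total_train_batch_size):
    """Tokens consumed since the last checkpoint, in billions (rounded to 4 places)."""
    trained_steps = global_step - last_saved_step
    return round(trained_steps * seq_length * total_train_batch_size / (10**9), 4)

# 500 optimizer steps at seq_length=4096 with a global batch of 1024 sequences:
print(tokens_this_ckpt_interval_in_billion(1500, 1000, 4096, 1024))  # 2.0972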
1 change: 1 addition & 0 deletions paddlenlp/trainer/trainer_callback.py
@@ -95,6 +95,7 @@ class TrainerState:
    is_world_process_zero: bool = True
    trial_name: str = None
    trial_params: Dict[str, Union[str, float, int, bool]] = None
+    last_saved_step: int = 0

    def __post_init__(self):
        if self.log_history is None:
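A toy reconstruction of the backward-compatibility path (not the real TrainerState; names other than global_step and last_saved_step are hypothetical): a checkpoint written by an older version carries no last_saved_step, so the dataclass default of 0 applies and the guard in _inner_training_loop starts the interval count from the restored global_step.

from dataclasses import dataclass

@dataclass
class _State:  # hypothetical stand-in for TrainerState
    global_step: int = 0
    last_saved_step: int = 0  # default used when an old checkpoint lacks the field

def resume(state, restored_global_step):
    state.global_step = restored_global_step
    # mirrors the "compatibility with older versions" branch above
    if state.last_saved_step == 0:
        state.last_saved_step = state.global_step
    return state

print(resume(_State(), 1000))  # _State(global_step=1000, last_saved_step=1000)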
3 changes: 2 additions & 1 deletion paddlenlp/trainer/trainer_utils.py
@@ -365,11 +365,12 @@ def speed_metrics(split, start_time, num_samples=None, num_steps=None, seq_lengt
    if seq_length is not None:
        tokens_per_second_per_device = samples_per_second * seq_length / paddle.distributed.get_world_size()
        result[f"{split}_tokens_per_second_per_device"] = round(tokens_per_second_per_device, 4)
+        tokens_trained_this_step_in_billion = num_samples * seq_length / (10**9)
+        result[f"{split}_tokens_trained_this_step_in_billion"] = round(tokens_trained_this_step_in_billion, 4)
        if model_flops is not None:
            result[f"{split}_hardware_tflops_per_device"] = round(
                tokens_per_second_per_device * model_flops / seq_length / 2**40, 2
            )

    if num_steps is not None:
        steps_per_second = num_steps / runtime
        result[f"{split}_steps_per_second"] = round(steps_per_second, 4)
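As a quick sanity check on the new speed_metrics field (illustrative numbers only, assuming num_samples counts sequences processed in the measured interval):

num_samples = 256   # sequences processed in the measured interval (assumed)
seq_length = 2048   # tokens per sequence
tokens_trained_this_step_in_billion = num_samples * seq_length / (10**9)
print(round(tokens_trained_this_step_in_billion, 4))  # 0.0005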
2 changes: 1 addition & 1 deletion tests/trainer/test_trainer_callback.py
@@ -229,7 +229,7 @@ def test_event_flow(self):
        # A bit of everything
        trainer = self.get_trainer(
            callbacks=[MyTestTrainerCallback],
-            logging_steps=3,
+            logging_steps=2,
            save_steps=10,
            eval_steps=5,
            evaluation_strategy="steps",