[trainer] Fix bug when resume_from_checkpoint is set (#3201)

LemonNoel · web-flow · commit 2c82f0327cd7 · 2022-09-05T19:01:39.000+08:00
diff --git a/paddlenlp/prompt/template.py b/paddlenlp/prompt/template.py
@@ -128,7 +128,7 @@ def get_default_shortenable_ids(self):
         idx = []
         for p in self.template:
             if 'shortenable' in p:
-                idx.append(1 if d['shortenable'] else 0)
+                idx.append(1 if p['shortenable'] else 0)
             else:
                 idx.append(1 if 'text' in p else 0)
         return idx
diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer_base.py
@@ -335,6 +335,7 @@ def init_num_steps(args, num_samples_per_epoch):
             num_samples_per_epoch % args.train_batch_size > 0)
         num_update_steps_per_epoch //= args.gradient_accumulation_steps
         num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)
+        args.num_update_steps_per_epoch = num_update_steps_per_epoch
 
         if args.max_steps > 0:
             args.num_training_steps = args.max_steps
@@ -447,10 +448,10 @@ def train(
                 os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)):
             self.state = TrainerState.load_from_json(
                 os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
-            epochs_trained = self.state.global_step // num_update_steps_per_epoch
+            epochs_trained = self.state.global_step // args.num_update_steps_per_epoch
             if not args.ignore_data_skip:
                 steps_trained_in_current_epoch = self.state.global_step % (
-                    num_update_steps_per_epoch)
+                    args.num_update_steps_per_epoch)
                 steps_trained_in_current_epoch *= args.gradient_accumulation_steps
             else:
                 steps_trained_in_current_epoch = 0