
Commit e650d8b

update perf (#264)

1 parent 8a275ff commit e650d8b

File tree

5 files changed: +24 -17 lines changed

scripts/benchmark/test_memory_time/run_single.py

Lines changed: 3 additions & 6 deletions

@@ -37,24 +37,21 @@ def test_memory_time(train_args: TrainArguments) -> Dict[str, Dict[str, Any]]:
     args_kwargs = get_non_default_args(train_args)
     print(f'args_kwargs: {args_kwargs}')
     for i in range(train_args.run_time):
-        start_t = time.time()
         sft_args = SftArguments(
             dataset_test_ratio=0,
             dataset=DatasetName.cls_fudan_news_zh,
-            train_dataset_sample=1000,
+            train_dataset_sample=-1,
             save_strategy='no',
             check_dataset_strategy='warning',
             truncation_strategy='truncation_left',
             seed=get_seed(random_state),
             preprocess_num_proc=4,
             **args_kwargs)
         output = sft_main(sft_args)
-        t = (time.time() - start_t) / 60  # min
-        max_memory = torch.cuda.max_memory_reserved() / 1024**2
         torch.cuda.empty_cache()
         output = {
-            'time': f'{t}min',
-            'memory': f'{max_memory}MiB',
+            'samples/s': f"{output['train_info']['samples/s']:.2f}",
+            'memory': output['memory'],
             'train_args': check_json_format(args_kwargs),
             'model_info': output['model_info'],
         }
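With this change the benchmark script no longer times the run or queries CUDA memory itself; both figures come from the dict that sft_main returns. A minimal sketch of the shape that return value takes after this commit (field names match the swift/llm/sft.py hunk below; the numbers are made up for illustration):

    # Illustrative shape of sft_main's return value after this commit.
    output = {
        'memory': {'cuda:0': '11.22GiB'},  # peak reserved memory per device
        'train_info': {
            'time': 250.0,                 # seconds accumulated in training_step
            'num_samples': 4000,           # len(train_dataset)
            'samples/s': 4000 / 250.0,     # 16.0
        },
    }
    print(f"{output['train_info']['samples/s']:.2f}")  # prints '16.00'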

swift/llm/sft.py

Lines changed: 6 additions & 0 deletions

@@ -331,6 +331,12 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         trainer._add_patterns_to_gitignores(['images/'])
         trainer.push_to_hub()
     return {
+        'memory': trainer.perf['memory'],
+        'train_info': {
+            'time': trainer.perf['train_time'],
+            'num_samples': len(train_dataset),
+            'samples/s': len(train_dataset) / trainer.perf['train_time']
+        },
         'last_model_checkpoint': last_model_checkpoint,
         'best_model_checkpoint': trainer.state.best_model_checkpoint,
         'best_metric': trainer.state.best_metric,
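The throughput reported here is average end-to-end training speed: the number of training samples divided by the cumulative wall-clock time spent in training_step (accumulated by the trainer, see swift/trainers/trainers.py below). A worked example with illustrative numbers:

    # samples/s = num_samples / train_time
    train_time = 400.0   # seconds summed over all training_step calls
    num_samples = 5000   # len(train_dataset)
    print(f'{num_samples / train_time:.2f} samples/s')  # '12.50 samples/s'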

swift/trainers/trainers.py

Lines changed: 6 additions & 7 deletions

@@ -41,21 +41,20 @@ def __init__(self, *args, **kwargs):
             self.model.get_trainable_parameters() if hasattr(
                 self.model, 'get_trainable_parameters') else None,
         }
-        self._iter_perf = 0

     def training_step(self, *args, **kwargs) -> torch.Tensor:
         train_time = time.time()
         training_output = super().training_step(*args, **kwargs)
         train_time = time.time() - train_time
         self.perf['train_time'] = self.perf['train_time'] + train_time
-        self._iter_perf += 1
-        if self._iter_perf > 20 and not self.perf[
-                'memory'] and torch.cuda.device_count() > 0:
-            for i in range(torch.cuda.device_count()):
-                self.perf['memory'][
-                    f'device:{i}'] = f'{torch.cuda.memory_reserved(i)/1024/1024/1024:.2f}GB'
         return training_output

+    def train(self, *args, **kwargs) -> torch.Tensor:
+        super().train(*args, **kwargs)
+        for i in range(torch.cuda.device_count()):
+            self.perf['memory'][
+                f'cuda:{i}'] = f'{torch.cuda.max_memory_reserved(i)/1024/1024/1024:.2f}GiB'
+
     def prediction_step(
         self,
         model: nn.Module,
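The memory snapshot moves from "reserved after 20 steps" inside training_step to "peak reserved per device once train() returns", which drops the per-step bookkeeping and records the true high-water mark. A self-contained sketch of the same measurement (the helper name is mine, not part of the trainer's API; assumes a CUDA build of PyTorch):

    import torch

    def peak_memory_per_device() -> dict:
        # torch.cuda.max_memory_reserved(i) returns the peak number of bytes
        # the caching allocator has reserved on device i since the last reset.
        return {
            f'cuda:{i}': f'{torch.cuda.max_memory_reserved(i) / 1024**3:.2f}GiB'
            for i in range(torch.cuda.device_count())
        }

    # After a training run:
    # trainer.train()
    # print(peak_memory_per_device())  # e.g. {'cuda:0': '11.22GiB'}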

swift/tuners/base.py

Lines changed: 1 addition & 1 deletion

@@ -485,7 +485,7 @@ def get_trainable_parameters(self):
             f'|| trainable%: {100 * trainable_params / all_param:.4f}' \
             '|| cuda memory: ' \
             f'{sum([torch.cuda.memory_allocated(i) for i in range(torch.cuda.device_count())])/1024/1024/1024:.2f}' \
-            'GB.'
+            'GiB.'


 class Swift:
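The unit rename is correct: dividing bytes by 1024**3 yields gibibytes (GiB), not SI gigabytes (GB). A quick check of the difference:

    n_bytes = 8 * 1024**3                 # 8 GiB worth of bytes
    print(f'{n_bytes / 1024**3:.2f}GiB')  # '8.00GiB' (binary unit, what the code computes)
    print(f'{n_bytes / 1000**3:.2f}GB')   # '8.59GB'  (SI unit)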

tests/llm/test_run.py

Lines changed: 8 additions & 3 deletions

@@ -91,13 +91,18 @@ def test_loss_matching(self):
         best_model_checkpoint = output['best_model_checkpoint']
         print(f'best_model_checkpoint: {best_model_checkpoint}')
         torch.cuda.empty_cache()
+        load_dataset_config = str(bool_var or NO_EVAL_HUMAN)
+        if load_dataset_config:
+            show_dataset_sample = 2
+        else:
+            show_dataset_sample = -1
         infer_main([
             '--ckpt_dir', best_model_checkpoint, '--show_dataset_sample',
-            '-1', '--max_new_tokens', '100', '--use_flash_attn', 'true',
-            '--verbose',
+            str(show_dataset_sample), '--max_new_tokens', '100',
+            '--use_flash_attn', 'true', '--verbose',
             str(not bool_var), '--merge_lora_and_save',
             str(bool_var), '--load_dataset_config',
-            str(bool_var or NO_EVAL_HUMAN)
+            str(load_dataset_config)
         ])
         loss = output['log_history'][-1]['train_loss']
         losses.append(loss)
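One Python subtlety in the hunk above: load_dataset_config is the string 'True' or 'False', and any non-empty string is truthy, so the if branch is taken in both cases and show_dataset_sample is always 2. A standalone demonstration (NO_EVAL_HUMAN stubbed in):

    NO_EVAL_HUMAN = False  # stub for the test module's constant
    for bool_var in (True, False):
        load_dataset_config = str(bool_var or NO_EVAL_HUMAN)
        print(load_dataset_config, bool(load_dataset_config))
    # True True
    # False True  <- 'False' is non-empty, hence truthy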
