@@ -1035,13 +1035,17 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         samples_per_sec_per_replica = samples_per_sec / args.data_parallel_size
         tokens_per_sec = samples_per_sec * seq_len
         tokens_per_sec_per_replica = tokens_per_sec / args.data_parallel_size
+        tokens_per_gpu_per_second = tokens_per_sec / args.world_size
+        tokens_per_gpu_per_second_per_replica = tokens_per_gpu_per_second / args.data_parallel_size
         if wandb is not None and getattr(wandb, 'run', None) is not None:
             tput = {
                 'throughput/iteration-time': elapsed_time_per_iteration,  # 1000 ms / s
                 'throughput/samples_per_sec': samples_per_sec,
                 'throughput/samples_per_sec_per_replica': samples_per_sec_per_replica,
                 'throughput/tokens_per_sec': tokens_per_sec,
                 'throughput/tokens_per_sec_per_replica': tokens_per_sec_per_replica,
+                'throughput/tokens_per_gpu_per_sec': tokens_per_gpu_per_second,
+                'throughput/tokens_per_gpu_per_sec_per_replica': tokens_per_gpu_per_second_per_replica,
                 'throughput/tflops': tflops,
                 'throughput/approx_params_in_billions': approx_parameters_in_billions,
                 'throughput/elapsed_ms_per_iteration': elapsed_time_per_iteration,
@@ -1091,6 +1095,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         log_string += ' number of nan iterations: {:3d} |'.format(
             total_loss_dict[nan_iters_key])
         log_string += ' samples per second: {:.3f} |'.format(samples_per_sec)
+        log_string += ' tokens per gpu per second (tgs): {:.3f} |'.format(tokens_per_gpu_per_second)
         log_string += ' TFLOPs: {:.2f} |'.format(tflops)
         total_loss_dict[advanced_iters_key] = 0
         total_loss_dict[skipped_iters_key] = 0
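
For reference, a minimal standalone sketch of the throughput arithmetic this diff adds, assuming `args.world_size` counts every GPU in the job (data x tensor x pipeline parallel ranks); the concrete batch size, sequence length, timing, and GPU counts below are illustrative, not taken from the source:

# Sketch of the new tokens-per-GPU-per-second ("tgs") metric.
# All numeric values here are made-up examples.
batch_size = 256           # global batch size, samples per iteration
seq_len = 2048             # tokens per sample
elapsed_sec = 1.6          # wall-clock time for one iteration
world_size = 64            # total GPUs across all parallelism dimensions
data_parallel_size = 8     # number of model replicas

samples_per_sec = batch_size / elapsed_sec                    # 160.0
tokens_per_sec = samples_per_sec * seq_len                    # 327680.0
tokens_per_gpu_per_second = tokens_per_sec / world_size       # 5120.0
tokens_per_gpu_per_second_per_replica = (
    tokens_per_gpu_per_second / data_parallel_size)           # 640.0

print(' tokens per gpu per second (tgs): {:.3f} |'.format(
    tokens_per_gpu_per_second))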