Commit e23393f

Fix tflops glu computation (#283)
* Fix tflops glu computation
* Explain GLU TFLOPs difference
* Fix typo
* Specify MLP

Co-authored-by: Thomas Wang <[email protected]>
Co-authored-by: Thomas Wang <[email protected]>
1 parent cb48bd2 commit e23393f

1 file changed: 5 additions, 1 deletion

megatron/training.py

@@ -692,7 +692,11 @@ def add_to_logging(name):
         # The factor of 4 is when used with activation check-pointing,
         # otherwise it will be 3, but for 200B model, activation check-pointing will always be on.
         checkpoint_activations_factor = 4 if args.checkpoint_activations else 3
-        flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. * num_layers * hidden_size)))
+        # GLU activations double the hidden states in the up-scaling feed-forward of each transformer layer.
+        # This leads to 16bsh^2 instead of 8bsh^2 for the first feed-forward layer in the MLP, so we increase the coefficient by 8.
+        # Refer to https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/283#issue-1260805063 for more details.
+        coefficient = 32 if args.glu_activation else 24
+        flops_per_iteration = (coefficient * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. * num_layers * hidden_size)))
         tflops = flops_per_iteration / (elapsed_time_per_iteration * args.world_size * (10**12))

         # only the last rank process has a non-None _GLOBAL_TENSORBOARD_WRITER
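
For context on the coefficient: in the standard Megatron FLOPs model, each transformer layer contributes roughly 24*b*s*h^2 matmul FLOPs per forward pass (6 for the QKV projections, 2 for the attention output projection, and 8 + 8 for the two MLP matmuls). A GLU activation doubles the width of the first MLP projection, raising that term from 8 to 16 and the per-layer coefficient from 24 to 32. Below is a minimal, self-contained sketch of the patched estimate; the function name and the example hyperparameters are hypothetical and purely illustrative, not part of the commit.

def estimate_tflops(batch_size, seq_len, num_layers, hidden_size, vocab_size,
                    elapsed_time_per_iteration, world_size,
                    checkpoint_activations=True, glu_activation=False):
    # Factor of 4 with activation checkpointing (forward + backward + recompute), else 3.
    checkpoint_activations_factor = 4 if checkpoint_activations else 3
    # 24 = 6 (QKV) + 2 (attention output) + 8 + 8 (MLP matmuls) per b*s*h^2;
    # a GLU activation doubles the first MLP matmul (8 -> 16), giving 32.
    coefficient = 32 if glu_activation else 24
    flops_per_iteration = (
        coefficient * checkpoint_activations_factor * batch_size * seq_len
        * num_layers * hidden_size**2
    ) * (1. + seq_len / (6. * hidden_size)
         + vocab_size / (16. * num_layers * hidden_size))
    return flops_per_iteration / (elapsed_time_per_iteration * world_size * 1e12)

# Example with illustrative GPT-3-scale numbers (not taken from the commit):
# estimate_tflops(batch_size=512, seq_len=2048, num_layers=96, hidden_size=12288,
#                 vocab_size=50257, elapsed_time_per_iteration=60.0, world_size=384)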
