
Commit 908dc9c

Fix mixed fused layer norm to mimic nn.LayerNorm for torch>1.11 (#281)
* If pytorch>=1.11 is available we can use nn.LayerNorm instead of MixedFusedLayerNorm
* Add MixedFusedLayerNorm fix
* Turns out LayerNorm for bf16 is slower using torch==1.11
* Test for LayerNorm
1 parent c85b7c2 commit 908dc9c

3 files changed: +77 −38 lines

megatron/fused_kernels/layer_norm_cuda_kernel.cu

1 addition & 1 deletion

```diff
@@ -317,7 +317,7 @@ void cuApplyLayerNorm(
     if (gamma != NULL && beta != NULL) {
       for (int i = thrx; i < n2; i+=numx) {
         U curr = static_cast<U>(lvals[i]);
-        ovals[i] = gamma[i] * static_cast<V>(c_invvar * (curr - mu)) + beta[i];
+        ovals[i] = (curr - mu) * c_invvar * static_cast<U>(gamma[i]) + static_cast<U>(beta[i]);
       }
     } else {
       for (int i = thrx; i < n2; i+=numx) {
```
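The one-line kernel change moves the affine transform into the accumulation type `U` (fp32 for half-precision inputs): previously the normalized value was cast down to the output type `V` before `gamma` and `beta` were applied, adding an extra rounding step; now the whole expression is evaluated in `U` and rounded once on the store, which is what `nn.LayerNorm` does from torch 1.11 onwards. A rough PyTorch sketch of the two rounding orders (illustrative tensors, not the kernel itself):

```python
import torch

torch.manual_seed(0)
x = torch.randn(1024)                            # "curr" values in U = fp32
gamma = torch.randn(1024, dtype=torch.bfloat16)  # affine params stored in V = bf16
beta = torch.randn(1024, dtype=torch.bfloat16)
mu = x.mean()
invvar = torch.rsqrt(((x - mu) ** 2).mean() + 1e-5)

# Old kernel: cast the normalized value down to V first, then apply
# gamma/beta in V precision (two rounding steps).
old = gamma * (invvar * (x - mu)).to(torch.bfloat16) + beta

# New kernel: evaluate everything in U and round to V once, on the store.
new = ((x - mu) * invvar * gamma.float() + beta.float()).to(torch.bfloat16)

print((old != new).sum().item())  # typically nonzero: the extra rounding shifts some values
```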

megatron/model/fused_layer_norm.py

29 additions & 15 deletions

```diff
@@ -18,11 +18,17 @@
 with some changes. """
 
 import numbers
+
+from packaging import version
 import torch
+from torch import nn
 from torch.nn.parameter import Parameter
+import torch.nn.functional as F
 from torch.nn import init
 import importlib
 
+from megatron import get_args
+
 global fused_mix_prec_layer_norm_cuda
 fused_mix_prec_layer_norm_cuda = None
 
@@ -62,19 +68,26 @@ def backward(ctx, grad_output):
 class MixedFusedLayerNorm(torch.nn.Module):
 
   def __init__(self, normalized_shape, eps=1e-5):
-    super(MixedFusedLayerNorm, self).__init__()
+    super(MixedFusedLayerNorm, self).__init__()
+
+    global fused_mix_prec_layer_norm_cuda
+    fused_mix_prec_layer_norm_cuda = importlib.import_module(
+      "fused_mix_prec_layer_norm_cuda")
 
-    global fused_mix_prec_layer_norm_cuda
-    fused_mix_prec_layer_norm_cuda = importlib.import_module(
-      "fused_mix_prec_layer_norm_cuda")
+    if isinstance(normalized_shape, numbers.Integral):
+      normalized_shape = (normalized_shape,)
+    self.normalized_shape = torch.Size(normalized_shape)
+    self.eps = eps
+    self.weight = Parameter(torch.Tensor(*normalized_shape))
+    self.bias = Parameter(torch.Tensor(*normalized_shape))
+    self.reset_parameters()
 
-    if isinstance(normalized_shape, numbers.Integral):
-      normalized_shape = (normalized_shape,)
-    self.normalized_shape = torch.Size(normalized_shape)
-    self.eps = eps
-    self.weight = Parameter(torch.Tensor(*normalized_shape))
-    self.bias = Parameter(torch.Tensor(*normalized_shape))
-    self.reset_parameters()
+    args = get_args()
+
+    self.use_meg_ds_fused_layer_norm = (
+      args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
+      or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920
+    )
 
 
   def reset_parameters(self):
@@ -84,7 +97,8 @@ def reset_parameters(self):
 
 
   def forward(self, input):
-
-    return FusedLayerNormAffineFunction.apply(
-      input, self.weight, self.bias, self.normalized_shape,self.eps)
-
+    if self.use_meg_ds_fused_layer_norm:
+      return FusedLayerNormAffineFunction.apply(
+        input, self.weight, self.bias, self.normalized_shape, self.eps)
+    else:
+      return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
```
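The dispatch condition can be read in isolation: take the Meg-DS kernel when training in bf16 (where it is reported faster than `torch.nn.LayerNorm`) or on torch >= 1.11 (where, after the kernel fix above, both implementations apply the affine in fp32). A minimal standalone sketch, with `get_args()` replaced by a plain `bf16` flag since Megatron's argument plumbing is not available outside the framework:

```python
from packaging import version
import torch

def use_meg_ds_fused_layer_norm(bf16: bool) -> bool:
    # bf16: the Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm.
    # torch >= 1.11: nn.LayerNorm applies gamma/beta in fp32
    # (https://github.com/pytorch/pytorch/pull/66920), which the fixed kernel
    # now mimics, so preferring the fused path is numerically safe.
    return bf16 or version.parse(torch.__version__) >= version.parse("1.11.0")

# fp16/fp32 runs on torch older than 1.11 are the only configuration that
# falls back to F.layer_norm in forward().
print(use_meg_ds_fused_layer_norm(bf16=False))
```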

tests/test_model.py

47 additions & 22 deletions

```diff
@@ -4,9 +4,14 @@
 
 import deepspeed
 import torch
+from torch import nn
+import torch.nn.functional as F
+
+from megatron.model.fused_layer_norm import MixedFusedLayerNorm
+from packaging import version
 
 from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
-from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments
+from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal
 from megatron.training import setup_model_and_optimizer
 from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
 from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe
@@ -51,9 +56,6 @@ def get_default_args():
     }
 
 
-
-
-
 def equal_vectors(tensor1, tensor2, dim=-1):
     """View tensor1 and tensor2 as a list of vectors, and compute equality"""
     return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0
@@ -109,9 +111,7 @@ def test_gpt(self):
                 output_changed = model(input_token_ids_changed, *input_batch[1:])
 
                 # All tokens in the past should be unchanged
-                self.assertTrue(
-                    torch.all(equal_vectors(output[:, :changed_index], output_changed[:, :changed_index]))
-                )
+                torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index])
                 # All tokens in the future should have changed
                 self.assertFalse(
                     torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:]))
@@ -173,23 +173,15 @@ def test_prefix_lm_reset_attention_mask(self):
                 output_changed_target = model(token_ids_changed_target, *input_batch[1:])
 
                 # All tokens in the past should be unchanged
-                self.assertTrue(
-                    torch.all(
-                        equal_vectors(output[0, :changed_target_index], output_changed_target[0, :changed_target_index])
-                    )
-                )
+                torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index])
                 # All tokens in the future should have changed
                 self.assertFalse(
                     torch.any(
                         equal_vectors(output[0, changed_target_index:], output_changed_target[0, changed_target_index:])
                     )
                 )
                 # Unchanged rows should not change either
-                self.assertTrue(
-                    torch.all(
-                        equal_vectors(output[1, :], output_changed_target[1, :])
-                    )
-                )
+                torch_assert_equal(output[1, :], output_changed_target[1, :])
 
                 ## --------------- CHANGE AN INPUT TOKEN ---------------------------
                 # Let's change the last prefix token and make sure that the first token changed
@@ -212,11 +204,7 @@ def test_prefix_lm_reset_attention_mask(self):
                     )
                 )
                 # Unchanged rows should not change either
-                self.assertTrue(
-                    torch.all(
-                        equal_vectors(output[1, :], output_changed_input[1, :])
-                    )
-                )
+                torch_assert_equal(output[1, :], output_changed_input[1, :])
 
     def test_prefix_lm_wo_reset_attention_mask(self):
         """
@@ -282,6 +270,43 @@ def test_gpt_rotary_embeddings(self):
 
                 #TODO: Check all invariants
 
+    def test_fused_layer_norm(self):
+        command_args = get_default_args()
+
+        # Condition to use the custom cuda kernel
+        command_args["--bf16"] = ""
+        del command_args["--fp16"]
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            with mockenv_context(**self.dist_env_1_gpu):
+                initialize_megatron()
+                args = get_args()
+
+                dummy_input = torch.randn(args.micro_batch_size, args.seq_length, args.hidden_size, device="cuda", dtype=torch.bfloat16)
+
+                normalized_shape = (args.hidden_size,)
+                epsilon = 1e-5
+                mfln = MixedFusedLayerNorm(normalized_shape, eps=epsilon)
+
+                self.assertTrue(mfln.use_meg_ds_fused_layer_norm, "Expected model to use Megatron-DeepSpeed custom cuda kernel for LayerNorm.")
+                self.assertTrue(args.bf16, "Test has to be done in half precision.")
+
+                # We set the weight manually to simulate a state that's not the initialisation
+                weight = torch.randn(args.hidden_size, device="cuda", dtype=torch.bfloat16)
+                bias = torch.randn(args.hidden_size, device="cuda", dtype=torch.bfloat16)
+                mfln.weight = nn.Parameter(weight)
+                mfln.bias = nn.Parameter(bias)
+
+                mfln_output = mfln(dummy_input)
+                # We check that our layernorm matches pytorch 1.11 onwards
+                if version.parse(torch.__version__) >= version.parse("1.11.0"):
+                    torch_layer_norm_output = F.layer_norm(dummy_input, normalized_shape, weight, bias, eps=epsilon)
+                else:
+                    # In this case we can check that it corresponds to the fp32 version
+                    torch_layer_norm_output = F.layer_norm(dummy_input.float(), normalized_shape, weight.float(), bias.float(), eps=epsilon).to(torch.bfloat16)
+
+                torch_assert_equal(mfln_output, torch_layer_norm_output)
+
 
 if __name__ == '__main__':
     unittest.main()
```
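The new test compares the fused kernel bitwise against a torch reference, and picks the reference by version: on torch >= 1.11 `F.layer_norm` on the bf16 tensors already applies the affine in fp32 internally, while on older torch the same numbers are obtained by upcasting everything to fp32 explicitly and rounding the result back to bf16 once. A condensed sketch of that comparison outside the test harness (assumes a CUDA device with bf16 support; shapes are illustrative):

```python
import torch
import torch.nn.functional as F

shape, eps = (1024,), 1e-5
x = torch.randn(4, 512, *shape, device="cuda", dtype=torch.bfloat16)
w = torch.randn(*shape, device="cuda", dtype=torch.bfloat16)
b = torch.randn(*shape, device="cuda", dtype=torch.bfloat16)

# torch >= 1.11 reference: affine applied in fp32 inside F.layer_norm.
ref_new = F.layer_norm(x, shape, w, b, eps=eps)

# Pre-1.11-style reference: upcast, normalize in fp32, round once to bf16.
ref_old = F.layer_norm(x.float(), shape, w.float(), b.float(), eps=eps).to(torch.bfloat16)

# The test's premise is that these agree exactly, so either can stand in
# for the fixed Meg-DS kernel's output.
torch.testing.assert_close(ref_new, ref_old, rtol=0, atol=0)
```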
