From 07ccb3db1717f300fd09afaaf9eac57678ca0e5d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 23:23:54 +0100
Subject: [PATCH 01/37] Better

---
 tests/test_training.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index fb72e59c6..bc64ffb73 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -682,6 +682,8 @@ def test_layer_norm_consistent(self, variation):
             execute_subprocess_async(cmd, env=self.get_env())
 
         checkpoints = ["global_step10", "global_step20"]
+
+        # Check transformer layer norm
         keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]]
         for checkpoint in checkpoints:
@@ -693,6 +695,7 @@ def test_layer_norm_consistent(self, variation):
                     for weight in weights[1:]:
                         torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
 
+        # Check embed layer norm
         keys_to_compare = ["word_embeddings.norm.weight"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]]
         for checkpoint in checkpoints:
@@ -702,4 +705,4 @@ def test_layer_norm_consistent(self, variation):
                 weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                 ref = weights[0]
                 for weight in weights[1:]:
-                    torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
\ No newline at end of file
+                    torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)

From 391ed4882546c38dea1e6c37e65e411cab06add5 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 23:36:27 +0100
Subject: [PATCH 02/37] Force synchronize the layer norm parameters across all TP

---
 megatron/model/fused_layer_norm.py | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 8430f528c..9a7b5f06e 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -84,19 +84,11 @@ def reset_parameters(self):
 
 
     def forward(self, input):
-        weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())]
-        torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group())
-        biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())]
-        torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group())
-        if any(torch.any(weight != self.weight) for weight in weights):
-            if mpu.get_tensor_model_parallel_rank() == 0:
-                print("Weight sync failed")
-                print(weights)
-        if any(torch.any(bias != self.bias) for bias in biases):
-            if mpu.get_tensor_model_parallel_rank() == 0:
-                print("Bias sync failed")
-                print(biases)
+        tp_world_size = mpu.get_tensor_model_parallel_world_size()
+        # TODO: hack in order to synchronize all layer norms despite them being unsynched
+        weight = mpu.reduce_from_tensor_model_parallel_region(self.weight) / tp_world_size
+        bias = mpu.reduce_from_tensor_model_parallel_region(self.bias) / tp_world_size
 
         return FusedLayerNormAffineFunction.apply(
-            input, self.weight, self.bias, self.normalized_shape,self.eps)
+            input, weight, bias, self.normalized_shape,self.eps)
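A note on the trick patch 02 introduces: rather than asserting that the tensor-parallel ranks hold identical layer norm weights, it makes them identical by averaging them across the TP group on every forward pass (`reduce_from_tensor_model_parallel_region` is an all-reduce). Below is a minimal standalone sketch of that averaging, outside Megatron's mpu wrappers — the function name and the one-shot, in-place framing are illustrative only, and the clone matters because `all_reduce` mutates its input (the very pitfall patch 10 below fixes):

    import torch
    import torch.distributed as dist

    def sync_layer_norm_params(weight, bias, group):
        # Average each parameter over the tensor-parallel group so that every
        # rank ends up with identical layer norm parameters.
        with torch.no_grad():
            for param in (weight, bias):
                buf = param.detach().clone()   # all_reduce works in place: clone first
                dist.all_reduce(buf, op=dist.ReduceOp.SUM, group=group)
                param.copy_(buf / dist.get_world_size(group))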
From 98d0e7cc0b383d828a23c754010e14aee76404c4 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 18:42:31 -0700
Subject: [PATCH 03/37] import mpu

---
 megatron/model/fused_layer_norm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 9a7b5f06e..410a9bcd4 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -23,6 +23,7 @@
 from torch.nn.parameter import Parameter
 from torch.nn import init
 import importlib
+from megatron import mpu
 
 global fused_mix_prec_layer_norm_cuda
 fused_mix_prec_layer_norm_cuda = None

From 279a77ebf2c878e4aad9392df3188fa2f391665a Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:05:27 -0700
Subject: [PATCH 04/37] use the bf16 branch for testing

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index da76b5e44..08e48cd26 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,8 @@ six
 tensorboard
 torch>=1.7
 transformers
-DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git
+# for now using this branch for bf16 work
+DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git@olruwase/bf16-updates
 # versions from HF transformers
 black==21.4b0
 isort>=5.5.4

From 87a9dba0b9e91c09c410229f2ab97dec9c98afe0 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:12:14 -0700
Subject: [PATCH 05/37] `torch.testing.assert_equal` didn't make it (#273)

looks like pt-1.11 dropped `torch.testing.assert_equal`, so using
`torch.testing.assert_close` instead
---
 megatron/testing_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/megatron/testing_utils.py b/megatron/testing_utils.py
index 2143b610b..9521cb361 100644
--- a/megatron/testing_utils.py
+++ b/megatron/testing_utils.py
@@ -232,9 +232,9 @@ def get_gpu_count():
         return 0
 
 def torch_assert_equal(actual, expected, **kwargs):
-    # assert_equal was added around pt-1.9, it does better checks - e.g will check dimensions match
-    if hasattr(torch.testing, "assert_equal"):
-        return torch.testing.assert_equal(actual, expected, **kwargs)
+    # assert_close was added around pt-1.9, it does better checks - e.g will check dimensions match
+    if hasattr(torch.testing, "assert_close"):
+        return torch.testing.assert_close(actual, expected, rtol=0.0, atol=0.0, **kwargs)
     else:
         return torch.allclose(actual, expected, rtol=0.0, atol=0.0)
 
@@ -886,4 +886,4 @@ def flatten_arguments(args):
     Example: {"arg1": "value1", "arg2": "value2"} -> ["IGNORED", "arg1", "value1", "arg2", "value2"]
     """
-    return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""]
\ No newline at end of file
+    return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""]

From 70f91f8208a9459218391e0e581f908066bb1b86 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:35:12 -0700
Subject: [PATCH 06/37] bf16 comms require pt-1.11

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 08e48cd26..f0ec53a7d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ pybind11
 regex
 six
 tensorboard
-torch>=1.7
+torch>=1.11
 transformers
 # for now using this branch for bf16 work
 DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git@olruwase/bf16-updates
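Since patch 05's helper is what every consistency check in this series leans on, here is a self-contained restatement of it. The version guard and the rtol=atol=0.0 exact comparison mirror the patch; the asserting fallback is an editorial addition, because `torch.allclose` only returns a bool rather than raising:

    import torch

    def torch_assert_equal(actual, expected, **kwargs):
        # assert_close appeared around pt-1.9; with rtol=atol=0 it is an exact
        # equality check that also validates shape and dtype.
        if hasattr(torch.testing, "assert_close"):
            return torch.testing.assert_close(actual, expected, rtol=0.0, atol=0.0, **kwargs)
        # older torch: allclose merely returns a bool, so assert on it
        assert torch.allclose(actual, expected, rtol=0.0, atol=0.0), "tensors differ"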
From 835a3e5c952244da1b54e4b9ff20ba631341e99d Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:36:21 -0700
Subject: [PATCH 07/37] already part of the function

---
 tests/test_training.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index bc64ffb73..65067982e 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -693,7 +693,7 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
-                        torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                        torch_assert_equal(ref, weight, check_device=False)
 
         # Check embed layer norm
         keys_to_compare = ["word_embeddings.norm.weight"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]]
         for checkpoint in checkpoints:
@@ -705,4 +705,4 @@ def test_layer_norm_consistent(self, variation):
                 weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                 ref = weights[0]
                 for weight in weights[1:]:
-                    torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                    torch_assert_equal(ref, weight, check_device=False)

From 37795a9238e86b124c3c611fb4869d7193aa4542 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:42:40 -0700
Subject: [PATCH 08/37] reproduce the crash on resume

---
 tests/test_training.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_training.py b/tests/test_training.py
index 65067982e..3c3db39c1 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -706,3 +706,14 @@ def test_layer_norm_consistent(self, variation):
                 ref = weights[0]
                 for weight in weights[1:]:
                     torch_assert_equal(ref, weight, check_device=False)
+
+        # 2. test training from checkpoint: resume
+        # now do it again, this time resuming from the checkpoint
+        with CaptureStdout() as cs:
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        # test checkpoint loading
+        self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
+
+        # test reports
+        self.assertIn("consumed samples", cs.out)

From 3ec65f7c3b24aa67ff5beccbd23aa728a153c025 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:54:38 -0700
Subject: [PATCH 09/37] run just the test we want for now

---
 .github/workflows/main.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e343df39e..a2786a402 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -186,7 +186,9 @@ jobs:
           pip install pytest-timeout
 
       - name: Run tests
-        run: pytest --timeout=600 tests
+        # run: pytest --timeout=600 tests
+        # run just the test we want for now
+        run: pytest --timeout=600 tests/test_training.py::MegDSTestTraining::test_layer_norm_consistent_0_bf16
 
   stop-runner:
     name: Stop self-hosted EC2 runner

From 8271d419fcb950eb0083ed95db1957a83bfab258 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 25 Mar 2022 10:50:58 +0100
Subject: [PATCH 10/37] all_reduce is an in-place operation

---
 megatron/model/fused_layer_norm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 410a9bcd4..100595d26 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -87,8 +87,10 @@ def reset_parameters(self):
     def forward(self, input):
         tp_world_size = mpu.get_tensor_model_parallel_world_size()
         # TODO: hack in order to synchronize all layer norms despite them being unsynched
-        weight = 
mpu.reduce_from_tensor_model_parallel_region(self.weight) / tp_world_size - bias = mpu.reduce_from_tensor_model_parallel_region(self.bias) / tp_world_size + weight = torch.clone(self.weight) + bias = torch.clone(self.bias) + weight = mpu.reduce_from_tensor_model_parallel_region(weight) / tp_world_size + bias = mpu.reduce_from_tensor_model_parallel_region(bias) / tp_world_size return FusedLayerNormAffineFunction.apply( input, weight, bias, self.normalized_shape,self.eps) From b418b47a6e27c69c9eef36177b9b48e965b198e9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 11:52:08 +0100 Subject: [PATCH 11/37] Make a test that TP reshaping works --- tests/test_tensor_parallel.py | 152 +++++++++++++++++++++++++++++++++- tests/test_training.py | 123 --------------------------- 2 files changed, 150 insertions(+), 125 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index ed383e17a..a117ede61 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -1,3 +1,4 @@ +import os import unittest from random import randint from unittest.mock import patch @@ -9,7 +10,8 @@ import pytest from megatron import initialize_megatron, get_args, get_tokenizer, global_vars -from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, require_torch_multi_gpu +from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, \ + require_torch_multi_gpu, torch_assert_equal, CaptureStdout, execute_subprocess_async from megatron.training import setup_model_and_optimizer from megatron.mpu.mappings import gather_from_tensor_model_parallel_region from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe @@ -64,7 +66,7 @@ def get_default_args(self): #ds args "--deepspeed": "", - "--deepspeed_config":f"{self.test_file_dir_str}/ds_config.json", + "--deepspeed_config": f"{self.test_file_dir_str}/ds_config.json", "--zero-stage": "1", "--deepspeed-activation-checkpointing": "" # DATA_ARGS @@ -293,5 +295,151 @@ def test_tokenizer_raise_error_make_vocab_size_divisible_by(self): self.assertEqual(str(exc_info.value), "5121 is not divisible by 128") + @parameterized.expand(["bf16", "fp16"]) + def test_layer_norm_consistent(self, variation): + src_dir = self.src_dir + output_dir = self.get_auto_remove_tmp_dir() + num_gpus = 2 + seq_len = 128 + data_dir = f"{self.data_dir}/gpt2" + default_args = f""" + --pipeline-model-parallel-size 1 + --distributed-backend nccl + + --log-interval 1 + --save-interval 10 + --eval-interval 10 + --eval-iters 5 + --checkpoint-activations + --partition-activations + --exit-interval {20} + + --merge-file {data_dir}/gpt2-tiny-merges.txt + --vocab-file {data_dir}/gpt2-tiny-vocab.json + --save {output_dir}/checkpoints + --load {output_dir}/checkpoints + --data-path {data_dir}/meg-gpt2-openwebtext_text_document + --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 + --log-timers-to-tensorboard + --log-batch-size-to-tensorboard + --log-validation-ppl-to-tensorboard + + --num-layers 2 + --hidden-size 64 + --num-attention-heads 2 + --seq-length {seq_len} + --max-position-embeddings 1024 + --micro-batch-size 2 + --global-batch-size 16 + + --optimizer adam + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --lr 1e-1 + --clip-grad 1.0 + --weight-decay 1e-1 + --embed-layernorm + + --log-level debug + --log-level-replica info + + --rampup-batch-size 2 2 
200 + --train-samples 200 + + --position-embedding-type alibi + """.split() + + command_args = self.get_default_args() + command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary + command_args["--position-embedding-type"] = "alibi" + command_args["--tensor-model-parallel-size"] = "2" + + if variation == "bf16": + command_args["--bf16"] = "" + command_args["--deepspeed_config"] = f"{self.test_file_dir_str}/ds_config_bf16.json" + command_args["--zero-stage"] = "0" + elif variation == "fp16": + command_args["--fp16"] = "" + command_args["--deepspeed_config"] = f"{self.test_file_dir_str}/ds_config.json" + command_args["--zero-stage"] = "1" + + # args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) + + script = [f"{src_dir}/pretrain_gpt.py"] + launcher = f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() + cmd = launcher + script + " ".join([f"{key} {value}" for key, value in command_args.items()]) + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + checkpoints = ["global_step10", "global_step20"] + + # Check transformer layer norm + keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", + "post_attention_layernorm.bias"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + layer_id in [3, 4]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + + # Check embed layer norm + keys_to_compare = ["word_embeddings.norm.weight"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + layer_id in [1]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + + # # 2. test training from checkpoint: resume + # # now do it again, this time resuming from the checkpoint + # with CaptureStdout() as cs: + # execute_subprocess_async(cmd, env=self.get_env()) + # + # # test checkpoint loading + # self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) + # + # # test reports + # self.assertIn("consumed samples", cs.out) + + # 3. test that inference with changes TP works. 
+ command_args["--tensor-model-parallel-size"] = "1" + + pool = Pool(1) + result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, output_dir, None))]) + pool.close() + pool.join() + + output, tokens = result[0] + logging.getLogger().info("First done!") + + command_args["--tensor-model-parallel-size"] = "2" + + pool = Pool(2) + result = pool.map(MegDSTestTP.infer_model, + [((0, 2, command_args, tokens, None, output_dir)), ((1, 2, command_args, tokens, None, output_dir))]) + pool.close() + pool.join() + + output2, tokens = result[0] + + logging.getLogger().critical(output - output2) + self.assertTrue(np.allclose(output, output2, atol=5e-3, rtol=0), + "Different results when running with TP=1 and TP=2") + if __name__ == '__main__': unittest.main() diff --git a/tests/test_training.py b/tests/test_training.py index 3c3db39c1..bf31bf904 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -594,126 +594,3 @@ def test_skip_train_iteration(self): train_iterations = range(1,10) for i in train_iterations: self.assertTrue(f"iteration {i:8d}/" in cs.out) - - @parameterized.expand(["bf16", "fp16"]) - def test_layer_norm_consistent(self, variation): - src_dir = self.src_dir - output_dir = self.get_auto_remove_tmp_dir() - num_gpus = 2 - seq_len = 128 - data_dir = f"{self.data_dir}/gpt2" - args = f""" - --tensor-model-parallel-size {2} - --pipeline-model-parallel-size {1} - --distributed-backend nccl - - --log-interval 1 - --save-interval 10 - --eval-interval 10 - --eval-iters 5 - --checkpoint-activations - --partition-activations - --exit-interval {20} - - --merge-file {data_dir}/gpt2-tiny-merges.txt - --vocab-file {data_dir}/gpt2-tiny-vocab.json - --save {output_dir}/checkpoints - --load {output_dir}/checkpoints - --data-path {data_dir}/meg-gpt2-openwebtext_text_document - --tensorboard-dir {output_dir}/tensorboard - --tensorboard-queue-size 5 - --log-timers-to-tensorboard - --log-batch-size-to-tensorboard - --log-validation-ppl-to-tensorboard - - --num-layers 2 - --hidden-size 64 - --num-attention-heads 2 - --seq-length {seq_len} - --max-position-embeddings 1024 - --micro-batch-size 2 - --global-batch-size 16 - - --optimizer adam - --adam-beta1 0.9 - --adam-beta2 0.95 - --adam-eps 1e-8 - --lr 1e-1 - --clip-grad 1.0 - --weight-decay 1e-1 - --embed-layernorm - - --log-level debug - --log-level-replica info - - --rampup-batch-size 2 2 200 - --train-samples 200 - - --position-embedding-type alibi - """.split() - - ds_args = f""" - --deepspeed - --deepspeed-activation-checkpointing - """.split() - - if variation == "bf16": - args.append("--bf16") - ds_args += [ - "--zero-stage", "0", - "--deepspeed_config", f"{self.test_file_dir_str}/ds_config_bf16.json" - ] - elif variation == "fp16": - args.append("--fp16") - ds_args += [ - "--zero-stage", "1", - "--deepspeed_config", f"{self.test_file_dir_str}/ds_config.json" - ] - - # args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) - - script = [f"{src_dir}/pretrain_gpt.py"] - launcher = get_launcher(num_gpus) - cmd = launcher + script + args + ds_args - # keep for quick debug - # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die - - with CaptureStdout() as cs: - execute_subprocess_async(cmd, env=self.get_env()) - - checkpoints = ["global_step10", "global_step20"] - - # Check transformer layer norm - keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"] - files_to_compare = 
[[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]] - for checkpoint in checkpoints: - checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - for key in keys_to_compare: - for files in files_to_compare: - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) - - # Check embed layer norm - keys_to_compare = ["word_embeddings.norm.weight"] - files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] - for checkpoint in checkpoints: - checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - for key in keys_to_compare: - for files in files_to_compare: - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) - - # 2. test training from checkpoint: resume - # now do it again, this time resuming from the checkpoint - with CaptureStdout() as cs: - execute_subprocess_async(cmd, env=self.get_env()) - - # test checkpoint loading - self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) - - # test reports - self.assertIn("consumed samples", cs.out) From 4b7207b5c4d7ecf084413ce3a7359d471e82584c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 11:53:13 +0100 Subject: [PATCH 12/37] Woops --- tests/test_tensor_parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index a117ede61..c894ca0b9 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from parameterized import parameterized + from megatron import initialize_megatron, get_args, get_tokenizer, global_vars from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, \ require_torch_multi_gpu, torch_assert_equal, CaptureStdout, execute_subprocess_async From 3bc58243d07e16ba61e12a24fb2f3fec4e893863 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 11:55:37 +0100 Subject: [PATCH 13/37] Woops --- tests/test_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index c894ca0b9..8f77b525d 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -371,7 +371,7 @@ def test_layer_norm_consistent(self, variation): script = [f"{src_dir}/pretrain_gpt.py"] launcher = f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() - cmd = launcher + script + " ".join([f"{key} {value}" for key, value in command_args.items()]) + cmd = launcher + script + [f"{key} {value}" for key, value in command_args.items()] # keep for quick debug # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die From 05c99db6d522c6e98a578f2207603897983e357a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:05:02 +0100 Subject: [PATCH 14/37] Woops --- tests/test_tensor_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 8f77b525d..b46c34e30 100644 --- 
a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -360,6 +360,7 @@ def test_layer_norm_consistent(self, variation): if variation == "bf16": command_args["--bf16"] = "" + del command_args["--fp16"] command_args["--deepspeed_config"] = f"{self.test_file_dir_str}/ds_config_bf16.json" command_args["--zero-stage"] = "0" elif variation == "fp16": @@ -371,7 +372,7 @@ def test_layer_norm_consistent(self, variation): script = [f"{src_dir}/pretrain_gpt.py"] launcher = f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() - cmd = launcher + script + [f"{key} {value}" for key, value in command_args.items()] + cmd = launcher + script + [elt for elts in [f"{key} {value}".split() for key, value in command_args.items()] for elt in elts] # keep for quick debug # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die From 55e10c63e4f487ad3ca58860d52ece9b250282ca Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:10:28 +0100 Subject: [PATCH 15/37] Woops --- tests/test_tensor_parallel.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index b46c34e30..03434f714 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -318,10 +318,7 @@ def test_layer_norm_consistent(self, variation): --merge-file {data_dir}/gpt2-tiny-merges.txt --vocab-file {data_dir}/gpt2-tiny-vocab.json - --save {output_dir}/checkpoints - --load {output_dir}/checkpoints - --data-path {data_dir}/meg-gpt2-openwebtext_text_document - --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard @@ -357,6 +354,11 @@ def test_layer_norm_consistent(self, variation): command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary command_args["--position-embedding-type"] = "alibi" command_args["--tensor-model-parallel-size"] = "2" + command_args["--save"] = f"{output_dir}/checkpoints" + command_args["--load"] = f"{output_dir}/checkpoints" + command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" + command_args["--tensorboard-dir"] = f"{output_dir}/tensorboard" + command_args["--lr"] = "1e-1" if variation == "bf16": command_args["--bf16"] = "" From 2ab8a3ac872396a3029b058e56e1146847342602 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:27:48 +0100 Subject: [PATCH 16/37] Woops --- tests/test_tensor_parallel.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 03434f714..19b3e5d49 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -61,9 +61,9 @@ def get_default_args(self): # OUTPUT_ARGS "--log-interval": "10", - "--save-interval": "500", - "--eval-interval": "100", - "--eval-iters": "10", + "--save-interval": "10", + "--eval-interval": "10", + "--eval-iters": "5", "--checkpoint-activations": "", #ds args @@ -357,6 +357,9 @@ def test_layer_norm_consistent(self, variation): command_args["--save"] = f"{output_dir}/checkpoints" command_args["--load"] = f"{output_dir}/checkpoints" command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" + command_args["--train-samples"] = "200" + del command_args["--train-iters"] + del command_args["--lr-decay-iters"] command_args["--tensorboard-dir"] = 
f"{output_dir}/tensorboard" command_args["--lr"] = "1e-1" From d357839dc280174c492d25c4bccc5beaa20d415d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:40:17 +0100 Subject: [PATCH 17/37] Woops --- tests/test_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 19b3e5d49..a3e3206e5 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -48,7 +48,6 @@ def get_default_args(self): "--merge-file": f"{data_dir}/gpt2-tiny-merges.txt", "--vocab-file": f"{data_dir}/gpt2-tiny-vocab.json", "--data-impl": "mmap", - "--split": "949,50,1", "--distributed-backend": "nccl", "--weight-decay": "1e-2", "--clip-grad": "1.0", @@ -358,6 +357,7 @@ def test_layer_norm_consistent(self, variation): command_args["--load"] = f"{output_dir}/checkpoints" command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" command_args["--train-samples"] = "200" + command_args["--seq-length"] = "128" del command_args["--train-iters"] del command_args["--lr-decay-iters"] command_args["--tensorboard-dir"] = f"{output_dir}/tensorboard" From 5fb231c158d96f1191796af6a04065f3716d2b81 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:42:32 +0100 Subject: [PATCH 18/37] Woops --- tests/test_tensor_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index a3e3206e5..8072ac5eb 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -358,6 +358,7 @@ def test_layer_norm_consistent(self, variation): command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" command_args["--train-samples"] = "200" command_args["--seq-length"] = "128" + command_args["--exit-interval"] = "20" del command_args["--train-iters"] del command_args["--lr-decay-iters"] command_args["--tensorboard-dir"] = f"{output_dir}/tensorboard" From cc7ff45b46dd805dc1c93ba7999f00d5e99d961b Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 14:14:28 +0100 Subject: [PATCH 19/37] Woops --- tests/test_tensor_parallel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 8072ac5eb..1350e3918 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -352,11 +352,13 @@ def test_layer_norm_consistent(self, variation): command_args = self.get_default_args() command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary command_args["--position-embedding-type"] = "alibi" + command_args["--embed-layernorm"] = "" command_args["--tensor-model-parallel-size"] = "2" command_args["--save"] = f"{output_dir}/checkpoints" command_args["--load"] = f"{output_dir}/checkpoints" command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" command_args["--train-samples"] = "200" + command_args["--rampup-batch-size"] = "4 4 200" command_args["--seq-length"] = "128" command_args["--exit-interval"] = "20" del command_args["--train-iters"] @@ -426,6 +428,8 @@ def test_layer_norm_consistent(self, variation): # self.assertIn("consumed samples", cs.out) # 3. test that inference with changes TP works. 
+ mp.set_start_method('spawn', force=True) + del command_args["--rampup-batch-size"] command_args["--tensor-model-parallel-size"] = "1" pool = Pool(1) From 7cdb1be83fe865d378f8eb3a0423c3fd5ce68f71 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 14:29:12 +0100 Subject: [PATCH 20/37] Woops --- tests/test_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 1350e3918..26491b6df 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -433,7 +433,7 @@ def test_layer_norm_consistent(self, variation): command_args["--tensor-model-parallel-size"] = "1" pool = Pool(1) - result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, output_dir, None))]) + result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, None, output_dir))]) pool.close() pool.join() From 4574ec978b36d01f1b4a083f650906d3a22d9d46 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 14:43:55 +0100 Subject: [PATCH 21/37] Fix load issue --- tests/test_tensor_parallel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 26491b6df..b4b04d426 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -118,8 +118,6 @@ def create_model_inputs(tokens): tokenizer = get_tokenizer() - model, _, _ = setup_model_and_optimizer(gpt_model_provider) - model = model[0] if load is not None: # Hack (same as in eval_harness/evaluate.py) # Loading pipelined models in deepspeed with different TP than it was originally trained on fails @@ -130,6 +128,10 @@ def create_model_inputs(tokens): # Deepspeed does however manage to load the model if we just turn off this sanity check. 
deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None + model, _, _ = setup_model_and_optimizer(gpt_model_provider) + model = model[0] + + if load is not None: zero_enabled = model._config.zero_enabled model._config.zero_enabled = False _, _ = model.load_checkpoint(load, load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True) From 04e89d1413924121c37ea7d044c7136504ffa551 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 15:12:31 +0100 Subject: [PATCH 22/37] Woops --- tests/test_tensor_parallel.py | 47 ++--------------------------------- 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index b4b04d426..97468ecf4 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -305,51 +305,6 @@ def test_layer_norm_consistent(self, variation): num_gpus = 2 seq_len = 128 data_dir = f"{self.data_dir}/gpt2" - default_args = f""" - --pipeline-model-parallel-size 1 - --distributed-backend nccl - - --log-interval 1 - --save-interval 10 - --eval-interval 10 - --eval-iters 5 - --checkpoint-activations - --partition-activations - --exit-interval {20} - - --merge-file {data_dir}/gpt2-tiny-merges.txt - --vocab-file {data_dir}/gpt2-tiny-vocab.json - - --tensorboard-queue-size 5 - --log-timers-to-tensorboard - --log-batch-size-to-tensorboard - --log-validation-ppl-to-tensorboard - - --num-layers 2 - --hidden-size 64 - --num-attention-heads 2 - --seq-length {seq_len} - --max-position-embeddings 1024 - --micro-batch-size 2 - --global-batch-size 16 - - --optimizer adam - --adam-beta1 0.9 - --adam-beta2 0.95 - --adam-eps 1e-8 - --lr 1e-1 - --clip-grad 1.0 - --weight-decay 1e-1 - --embed-layernorm - - --log-level debug - --log-level-replica info - - --rampup-batch-size 2 2 200 - --train-samples 200 - - --position-embedding-type alibi - """.split() command_args = self.get_default_args() command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary @@ -433,6 +388,8 @@ def test_layer_norm_consistent(self, variation): mp.set_start_method('spawn', force=True) del command_args["--rampup-batch-size"] command_args["--tensor-model-parallel-size"] = "1" + del command_args["--load"] + del command_args["--save"] pool = Pool(1) result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, None, output_dir))]) From e9431002a20262172d5a97ebfa70fb7027d207e8 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 16:28:52 +0100 Subject: [PATCH 23/37] Fix checkpoint path --- tests/test_tensor_parallel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 97468ecf4..62d4a4775 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -195,7 +195,7 @@ def test_alibi_tp(self): output2, tokens = result[0] logging.getLogger().critical(output-output2) - self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2") + self.assertTrue(np.allclose(output, output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2") @@ -391,8 +391,9 @@ def test_layer_norm_consistent(self, variation): del command_args["--load"] del command_args["--save"] + checkpoints_path = os.path.join(output_dir, "checkpoints") pool = 
Pool(1) - result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, None, output_dir))]) + result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, None, checkpoints_path))]) pool.close() pool.join() @@ -403,7 +404,7 @@ def test_layer_norm_consistent(self, variation): pool = Pool(2) result = pool.map(MegDSTestTP.infer_model, - [((0, 2, command_args, tokens, None, output_dir)), ((1, 2, command_args, tokens, None, output_dir))]) + [((0, 2, command_args, tokens, None, checkpoints_path)), ((1, 2, command_args, tokens, None, checkpoints_path))]) pool.close() pool.join() From 09cead38c3dd0447b3473bb5efbbf180e1cebd05 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 17:57:04 +0100 Subject: [PATCH 24/37] Test that force sync will allow TP changes --- megatron/arguments.py | 1 + megatron/model/fused_layer_norm.py | 20 +++++++++++++------- tests/test_tensor_parallel.py | 1 + 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 2be64b77d..194a518ba 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -738,6 +738,7 @@ def _add_distributed_args(parser): group.add_argument('--use-cpu-initialization', action='store_true', default=None, help='If set, affine parallel weights ' 'initialization uses CPU' ) + group.add_argument('--force-sync-layer-norm-parameters', action="store_true") return parser diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 100595d26..cd10e5b77 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -19,7 +19,7 @@ import numbers import torch -from megatron import mpu +from megatron import mpu, get_args from torch.nn.parameter import Parameter from torch.nn import init import importlib @@ -64,6 +64,7 @@ class MixedFusedLayerNorm(torch.nn.Module): def __init__(self, normalized_shape, eps=1e-5): super(MixedFusedLayerNorm, self).__init__() + args = get_args() global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = importlib.import_module( @@ -76,6 +77,7 @@ def __init__(self, normalized_shape, eps=1e-5): self.weight = Parameter(torch.Tensor(*normalized_shape)) self.bias = Parameter(torch.Tensor(*normalized_shape)) self.reset_parameters() + self.force_sync_layer_norm_parameters = args.force_sync_layer_norm_parameters def reset_parameters(self): @@ -85,12 +87,16 @@ def reset_parameters(self): def forward(self, input): - tp_world_size = mpu.get_tensor_model_parallel_world_size() - # TODO: hack in order to synchronize all layer norms despite them being unsynched - weight = torch.clone(self.weight) - bias = torch.clone(self.bias) - weight = mpu.reduce_from_tensor_model_parallel_region(weight) / tp_world_size - bias = mpu.reduce_from_tensor_model_parallel_region(bias) / tp_world_size + if self.force_sync_layer_norm_parameters: + tp_world_size = mpu.get_tensor_model_parallel_world_size() + # TODO: hack in order to synchronize all layer norms despite them being unsynched + weight = torch.clone(self.weight) + bias = torch.clone(self.bias) + weight = mpu.reduce_from_tensor_model_parallel_region(weight) / tp_world_size + bias = mpu.reduce_from_tensor_model_parallel_region(bias) / tp_world_size + else: + weight = self.weight + bias = self.bias return FusedLayerNormAffineFunction.apply( input, weight, bias, self.normalized_shape,self.eps) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 62d4a4775..d9a0f7fd0 100644 --- 
a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -390,6 +390,7 @@ def test_layer_norm_consistent(self, variation): command_args["--tensor-model-parallel-size"] = "1" del command_args["--load"] del command_args["--save"] + command_args["--force-sync-layer-norm-parameters"] = "" checkpoints_path = os.path.join(output_dir, "checkpoints") pool = Pool(1) From 77abee61db2da4945a029df246fd00b71a0c3d79 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 18:19:13 +0100 Subject: [PATCH 25/37] Nit --- tests/test_tensor_parallel.py | 57 ++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index d9a0f7fd0..cbb67ca0a 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -344,34 +344,35 @@ def test_layer_norm_consistent(self, variation): with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) - checkpoints = ["global_step10", "global_step20"] - - # Check transformer layer norm - keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", - "post_attention_layernorm.bias"] - files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for - layer_id in [3, 4]] - for checkpoint in checkpoints: - checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - for key in keys_to_compare: - for files in files_to_compare: - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) - - # Check embed layer norm - keys_to_compare = ["word_embeddings.norm.weight"] - files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for - layer_id in [1]] - for checkpoint in checkpoints: - checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - for key in keys_to_compare: - for files in files_to_compare: - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) + # # 1. 
test that the layer norm weights and biases are synchronized + # checkpoints = ["global_step10", "global_step20"] + + # # Check transformer layer norm + # keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", + # "post_attention_layernorm.bias"] + # files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + # layer_id in [3, 4]] + # for checkpoint in checkpoints: + # checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + # for key in keys_to_compare: + # for files in files_to_compare: + # weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + # ref = weights[0] + # for weight in weights[1:]: + # torch_assert_equal(ref, weight, check_device=False) + # + # # Check embed layer norm + # keys_to_compare = ["word_embeddings.norm.weight"] + # files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + # layer_id in [1]] + # for checkpoint in checkpoints: + # checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + # for key in keys_to_compare: + # for files in files_to_compare: + # weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + # ref = weights[0] + # for weight in weights[1:]: + # torch_assert_equal(ref, weight, check_device=False) # # 2. test training from checkpoint: resume # # now do it again, this time resuming from the checkpoint From 64a62c8081d5e1c9184c169e4f50372f4e82d92f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:03:00 +0200 Subject: [PATCH 26/37] Now that we have a force sync mechanism, let's try to reproduce --- tests/test_tensor_parallel.py | 69 ++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index cbb67ca0a..17ac9dab6 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -344,35 +344,46 @@ def test_layer_norm_consistent(self, variation): with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) - # # 1. 
test that the layer norm weights and biases are synchronized - # checkpoints = ["global_step10", "global_step20"] - - # # Check transformer layer norm - # keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", - # "post_attention_layernorm.bias"] - # files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for - # layer_id in [3, 4]] - # for checkpoint in checkpoints: - # checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - # for key in keys_to_compare: - # for files in files_to_compare: - # weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - # ref = weights[0] - # for weight in weights[1:]: - # torch_assert_equal(ref, weight, check_device=False) - # - # # Check embed layer norm - # keys_to_compare = ["word_embeddings.norm.weight"] - # files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for - # layer_id in [1]] - # for checkpoint in checkpoints: - # checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - # for key in keys_to_compare: - # for files in files_to_compare: - # weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - # ref = weights[0] - # for weight in weights[1:]: - # torch_assert_equal(ref, weight, check_device=False) + # 1. test that the layer norm weights and biases are synchronized + checkpoints = ["global_step10", "global_step20"] + + # Check transformer layer norm + keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", + "post_attention_layernorm.bias"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + layer_id in [3, 4]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + + # Check embed layer norm + keys_to_compare = ["word_embeddings.norm.weight"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + layer_id in [1]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + + keys_to_compare = ["random_rng_state", "np_rng_state", "torch_rng_state", "cuda_rng_state", "rng_tracker_states"] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + print(weights) + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) # # 2. 
test training from checkpoint: resume # # now do it again, this time resuming from the checkpoint From 0b7afcc943f972f0f0e3c78efc8d6fe160350485 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:15:00 +0200 Subject: [PATCH 27/37] Compare model_states_rank --- tests/test_tensor_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 17ac9dab6..bca25c6ae 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -375,6 +375,7 @@ def test_layer_norm_consistent(self, variation): torch_assert_equal(ref, weight, check_device=False) keys_to_compare = ["random_rng_state", "np_rng_state", "torch_rng_state", "cuda_rng_state", "rng_tracker_states"] + files_to_compare = [[f"mp_rank_{tp:02d}_model_states.pt" for tp in range(num_gpus)]] for checkpoint in checkpoints: checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) for key in keys_to_compare: From ce017338f400da2cee5b127ede22999b8726ea09 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:17:07 +0200 Subject: [PATCH 28/37] test --- tests/test_tensor_parallel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index bca25c6ae..4577f0e8c 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -379,12 +379,13 @@ def test_layer_norm_consistent(self, variation): for checkpoint in checkpoints: checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) for key in keys_to_compare: + print(key) for files in files_to_compare: weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] print(weights) - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) + # ref = weights[0] + # for weight in weights[1:]: + # assert ref == weight # # 2. 
test training from checkpoint: resume # # now do it again, this time resuming from the checkpoint From 89ab0b72b58b3fb8bd4a361f4211bfc5eeef07fb Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 14:47:57 +0200 Subject: [PATCH 29/37] Row column bias should be synchronized as well --- tests/test_tensor_parallel.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 4577f0e8c..1f1822c1f 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -348,8 +348,14 @@ def test_layer_norm_consistent(self, variation): checkpoints = ["global_step10", "global_step20"] # Check transformer layer norm - keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", - "post_attention_layernorm.bias"] + keys_to_compare = [ + "input_layernorm.weight", + "input_layernorm.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + "self_attention.dense.bias", + "mlp.dense_4h_to_h.bias" + ] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3, 4]] for checkpoint in checkpoints: @@ -426,7 +432,7 @@ def test_layer_norm_consistent(self, variation): output2, tokens = result[0] logging.getLogger().critical(output - output2) - self.assertTrue(np.allclose(output, output2, atol=5e-3, rtol=0), + self.assertTrue(np.allclose(output, output2, atol=0, rtol=0), "Different results when running with TP=1 and TP=2") if __name__ == '__main__': From 42997b2a93438a67147588e80b8afa4688b67d93 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 14:59:43 +0200 Subject: [PATCH 30/37] New list of matching embeddings --- tests/test_tensor_parallel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 1f1822c1f..284a51057 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -368,7 +368,11 @@ def test_layer_norm_consistent(self, variation): torch_assert_equal(ref, weight, check_device=False) # Check embed layer norm - keys_to_compare = ["word_embeddings.norm.weight"] + keys_to_compare = [ + "word_embeddings.norm.weight", + "tokentype_embeddings.weight" + "position_embeddings.weight" + ] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] for checkpoint in checkpoints: From e0ef1683a4b7336d609179f015ece06c704a6f8c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 18:53:53 +0200 Subject: [PATCH 31/37] Figure out why state differs --- tests/test_tensor_parallel.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 284a51057..f490feede 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -384,18 +384,16 @@ def test_layer_norm_consistent(self, variation): for weight in weights[1:]: torch_assert_equal(ref, weight, check_device=False) - keys_to_compare = ["random_rng_state", "np_rng_state", "torch_rng_state", "cuda_rng_state", "rng_tracker_states"] + keys_to_compare = ["torch_rng_state"] files_to_compare = [[f"mp_rank_{tp:02d}_model_states.pt" for tp in range(num_gpus)]] for checkpoint in checkpoints: checkpoint_path = 
os.path.join(output_dir, "checkpoints", checkpoint) for key in keys_to_compare: - print(key) for files in files_to_compare: weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - print(weights) - # ref = weights[0] - # for weight in weights[1:]: - # assert ref == weight + ref = weights[0] + for weight in weights[1:]: + assert ref == weight, f"key: {key} ref: {ref}, weight: {weight}" # # 2. test training from checkpoint: resume # # now do it again, this time resuming from the checkpoint From 1fc4fe82787b31366f8ac2ba9ac4ebcb7ebee756 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 18:57:52 +0200 Subject: [PATCH 32/37] Test for final weight --- tests/test_tensor_parallel.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index f490feede..712df08d7 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -370,8 +370,7 @@ def test_layer_norm_consistent(self, variation): # Check embed layer norm keys_to_compare = [ "word_embeddings.norm.weight", - "tokentype_embeddings.weight" - "position_embeddings.weight" + "word_embeddings.norm.bias" ] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] @@ -384,6 +383,23 @@ def test_layer_norm_consistent(self, variation): for weight in weights[1:]: torch_assert_equal(ref, weight, check_device=False) + # Final layer norm + keys_to_compare = [ + "weight", + "bias" + ] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] + for + layer_id in [6]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + keys_to_compare = ["torch_rng_state"] files_to_compare = [[f"mp_rank_{tp:02d}_model_states.pt" for tp in range(num_gpus)]] for checkpoint in checkpoints: From 7ebbed16ce52e8c88597595621293ada9da8b01c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 19:23:32 +0200 Subject: [PATCH 33/37] Test that torch_rng_state --- tests/test_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 712df08d7..3fbd4b225 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -409,7 +409,7 @@ def test_layer_norm_consistent(self, variation): weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] ref = weights[0] for weight in weights[1:]: - assert ref == weight, f"key: {key} ref: {ref}, weight: {weight}" + assert (ref == weight).all(), f"key: {key} ref: {ref}, weight: {weight}" # # 2. 
test training from checkpoint: resume
         # # now do it again, this time resuming from the checkpoint
         # with CaptureStdout() as cs:
         #     execute_subprocess_async(cmd, env=self.get_env())
         #
         # # test checkpoint loading
         # self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
         #
         # # test reports
         # self.assertIn("consumed samples", cs.out)

From 2c49216a092ab1d97ca4895c6ee66a8458341b03 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 31 Mar 2022 01:59:36 +0200
Subject: [PATCH 34/37] Fix non-matching torch_rng_state for tp_rank=0

---
 megatron/data/data_samplers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py
index 1cbeac312..b933ff34e 100644
--- a/megatron/data/data_samplers.py
+++ b/megatron/data/data_samplers.py
@@ -52,6 +52,7 @@ def build_pretraining_data_loader(dataset, consumed_samples):
     return torch.utils.data.DataLoader(dataset,
                                        batch_sampler=batch_sampler,
                                        num_workers=args.num_workers,
+                                       generator=torch.Generator().manual_seed(args.seed),
                                        pin_memory=True)
 
 class MegatronPretrainingSampler:
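Patch 34 deserves a short explanation: a DataLoader built without an explicit generator draws its per-iterator base seed from torch's global RNG, so merely creating and consuming the loader advances the global `torch_rng_state` — and in Megatron's data pipeline only tp_rank=0 builds the loader, which is presumably why its saved RNG state drifted from the other ranks. Handing the loader a private, seed-derived generator leaves the global state untouched. A minimal sketch of the effect in stock PyTorch (behavior as observed around pt-1.11, no Megatron pieces involved):

    import torch
    from torch.utils.data import DataLoader

    data = list(range(8))
    torch.manual_seed(1234)
    before = torch.get_rng_state()

    # No explicit generator: creating the iterator draws a base seed from the
    # global RNG, silently advancing the global state.
    next(iter(DataLoader(data, shuffle=True)))
    print(torch.equal(before, torch.get_rng_state()))   # False

    # With a private generator, as in the patch, the global state is untouched.
    torch.set_rng_state(before)
    gen = torch.Generator().manual_seed(1234)
    next(iter(DataLoader(data, shuffle=True, generator=gen)))
    print(torch.equal(before, torch.get_rng_state()))   # True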
From 007ecb4bb83465a20adc3644cc7419e01b092cd7 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 31 Mar 2022 14:17:35 +0200
Subject: [PATCH 35/37] Update test

---
 tests/test_tensor_parallel.py | 41 ++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py
index 3fbd4b225..5e821cf37 100644
--- a/tests/test_tensor_parallel.py
+++ b/tests/test_tensor_parallel.py
@@ -302,7 +302,9 @@ def test_tokenizer_raise_error_make_vocab_size_divisible_by(self):
     def test_layer_norm_consistent(self, variation):
         src_dir = self.src_dir
         output_dir = self.get_auto_remove_tmp_dir()
-        num_gpus = 2
+        tp_size = 2
+        pp_size = 1
+        num_gpus = tp_size * pp_size  # dp = 1
         seq_len = 128
         data_dir = f"{self.data_dir}/gpt2"
         command_args["--pad-vocab-size-to"] = "5120"  # This is equal to 128 * 40 which is above the len of gpt2-tiny vocabulary
         command_args["--position-embedding-type"] = "alibi"
         command_args["--embed-layernorm"] = ""
-        command_args["--tensor-model-parallel-size"] = "2"
+        command_args["--tensor-model-parallel-size"] = f"{tp_size}"
+        command_args["--pipeline-model-parallel-size"] = f"{pp_size}"
         command_args["--save"] = f"{output_dir}/checkpoints"
         command_args["--load"] = f"{output_dir}/checkpoints"
         command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document"
         command_args["--train-samples"] = "200"
         command_args["--rampup-batch-size"] = "4 4 200"
-        command_args["--seq-length"] = "128"
+        command_args["--seq-length"] = f"{seq_len}"
         command_args["--exit-interval"] = "20"
         del command_args["--train-iters"]
         del command_args["--lr-decay-iters"]
@@ -356,7 +359,7 @@ def test_layer_norm_consistent(self, variation):
             "self_attention.dense.bias",
             "mlp.dense_4h_to_h.bias"
         ]
-        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(tp_size)] for
                             layer_id in [3, 4]]
         for checkpoint in checkpoints:
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
@@ -372,7 +375,7 @@ def test_layer_norm_consistent(self, variation):
             "word_embeddings.norm.weight",
             "word_embeddings.norm.bias"
         ]
-        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(tp_size)] for
                             layer_id in [1]]
         for checkpoint in checkpoints:
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
@@ -388,7 +391,7 @@ def test_layer_norm_consistent(self, variation):
             "weight",
             "bias"
         ]
-        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)]
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(tp_size)]
                             for
                             layer_id in [6]]
         for checkpoint in checkpoints:
@@ -401,7 +404,7 @@ def test_layer_norm_consistent(self, variation):
                         torch_assert_equal(ref, weight, check_device=False)
 
         keys_to_compare = ["torch_rng_state"]
-        files_to_compare = [[f"mp_rank_{tp:02d}_model_states.pt" for tp in range(num_gpus)]]
+        files_to_compare = [[f"mp_rank_{tp + pp*tp_size:02d}_model_states.pt" for tp in range(tp_size)] for pp in range(pp_size)]
         for checkpoint in checkpoints:
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
             for key in keys_to_compare:
@@ -411,16 +414,20 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
                         assert (ref == weight).all(), f"key: {key} ref: {ref}, weight: {weight}"
-        # # 2. test training from checkpoint: resume
-        # # now do it again, this time resuming from the checkpoint
-        # with CaptureStdout() as cs:
-        #     execute_subprocess_async(cmd, env=self.get_env())
-        #
-        # # test checkpoint loading
-        # self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
-        #
-        # # test reports
-        # self.assertIn("consumed samples", cs.out)
+
+        # 2. test training from checkpoint: resume
+        command_args["--exit-interval"] = "30"
+        cmd = launcher + script + [elt for elts in [f"{key} {value}".split() for key, value in command_args.items()] for elt in elts]
+
+        # now do it again, this time resuming from the checkpoint
+        with CaptureStdout() as cs:
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        # test checkpoint loading
+        self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
+
+        # test reports
+        self.assertIn("consumed samples", cs.out)
 
         # 3. test that inference with changed TP works.
         mp.set_start_method('spawn', force=True)

From c3844b5c3d23790bfb4f0f1cb9839e69c1d664ce Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 31 Mar 2022 15:22:38 +0200
Subject: [PATCH 36/37] I'm surprised one can apply an in-place operation here

---
 megatron/mpu/layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py
index 4d94156ac..e649a1259 100644
--- a/megatron/mpu/layers.py
+++ b/megatron/mpu/layers.py
@@ -241,7 +241,7 @@ def forward(self, input_):
                                      self.sparse)
         # Mask the output embedding.
         if self.tensor_model_parallel_size > 1:
-            output_parallel[input_mask, :] = 0.0
+            output_parallel = output_parallel.masked_fill(input_mask[..., None], 0.0)
         # Reduce across all the model parallel GPUs.
         output = reduce_from_tensor_model_parallel_region(output_parallel)
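On PATCH 36: the indexed in-place store (`output_parallel[input_mask, :] = 0.0`) happens to be legal under autograd, hence the subject line's surprise, but it mutates a tensor that is still part of the graph; `masked_fill` is the out-of-place equivalent and returns a fresh tensor. A toy sketch of the replacement (shapes and mask values are illustrative; `True` marks tokens owned by another tensor-parallel shard):

    import torch

    emb = torch.nn.Embedding(10, 4)
    input_ = torch.tensor([[1, 7, 3]])
    input_mask = torch.tensor([[False, True, False]])

    out = emb(input_)
    # The mask is broadcast over the hidden dimension by the trailing None,
    # zeroing whole rows without touching `out` itself:
    masked = out.masked_fill(input_mask[..., None], 0.0)

    masked.sum().backward()
    # Masked positions contribute zero gradient, as the later all-reduce
    # across tensor-parallel ranks requires:
    print(emb.weight.grad[7])  # tensor([0., 0., 0., 0.])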
From 189f0547f711104661328de07233f5e09a873026 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 4 Apr 2022 11:01:51 +0200
Subject: [PATCH 37/37] Test out the loss from the fp32 weights and optimizer states

---
 megatron/checkpointing.py | 63 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
index d9a30f468..3fe5dafeb 100644
--- a/megatron/checkpointing.py
+++ b/megatron/checkpointing.py
@@ -169,6 +169,69 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
         # Trim off the filename and mp_rank_* directory.
         for _ in range(3):
             checkpoint_name = os.path.dirname(checkpoint_name)
+
+        # Debug
+        layer_norms_params_end_with = [
+            "word_embeddings.norm.weight", "word_embeddings.norm.bias",
+            "input_layernorm.weight", "input_layernorm.bias",
+            "post_attention_layernorm.weight", "post_attention_layernorm.bias",
+            "self_attention.dense.bias", "mlp.dense_4h_to_h.bias",
+        ]
+        for n, p in model[0].named_parameters():
+            # Here is how you can access the fp32 version of a bf16 param and its fp32 optim states
+            #
+            # Note that an all_reduce is called on all dp ranks whenever `get_full_hp_param`
+            # is called, so it's not free
+            #
+            # a. fp32 param
+            for end in layer_norms_params_end_with:
+                if n.endswith(end):
+                    fp32_param = p.get_full_hp_param()
+
+                    # gather_list may only be set on the destination rank
+                    fp32_params_accumulator = [
+                        torch.zeros_like(fp32_param)
+                        for _ in range(mpu.get_tensor_model_parallel_world_size())
+                    ] if mpu.get_tensor_model_parallel_rank() == 0 else None
+                    torch.distributed.gather(
+                        fp32_param,
+                        fp32_params_accumulator,
+                        dst=0,
+                        group=mpu.get_tensor_model_parallel_group()
+                    )
+                    if mpu.get_tensor_model_parallel_rank() == 0:
+                        square = torch.tensor([
+                            [
+                                torch.max(torch.abs(c1 - c2))
+                                for c2 in fp32_params_accumulator
+                            ] for c1 in fp32_params_accumulator
+                        ])
+                        print(f"Parameter name = {n}")
+                        print(square)
+
+                    # b. fp32 optim states
+                    for key in ['exp_avg', 'exp_avg_sq']:
+                        full_optim_state = p.get_full_hp_param(optim_state_key=key)
+
+                        full_optim_state_accumulator = [
+                            torch.zeros_like(full_optim_state)
+                            for _ in range(mpu.get_tensor_model_parallel_world_size())
+                        ] if mpu.get_tensor_model_parallel_rank() == 0 else None
+                        torch.distributed.gather(
+                            full_optim_state,
+                            full_optim_state_accumulator,
+                            dst=0,
+                            group=mpu.get_tensor_model_parallel_group()
+                        )
+                        if mpu.get_tensor_model_parallel_rank() == 0:
+                            square = torch.tensor([
+                                [
+                                    torch.max(torch.abs(c1 - c2))
+                                    for c2 in full_optim_state_accumulator
+                                ] for c1 in full_optim_state_accumulator
+                            ])
+                            print(f"Optimizer state: parameter name = {n}, key = {key}")
+                            print(square)
+
         model[0].save_checkpoint(checkpoint_name, client_state=state_dict)
 
         # Wait so everyone is done (necessary)
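The debug block in PATCH 37 reduces to one idea: gather every tensor-parallel rank's copy of a value and print the matrix of pairwise max absolute differences, which is all zeros exactly when the copies agree. The comparison extracted as a standalone sketch (hypothetical inputs in place of gathered fp32 shards):

    import torch

    def pairwise_max_abs_diff(copies):
        # m[i][j] = max |copies[i] - copies[j]|; an all-zero matrix means
        # every rank holds an identical copy of the tensor.
        return torch.tensor([[(a - b).abs().max().item() for b in copies]
                             for a in copies])

    # Stand-ins for one layer-norm weight gathered from two TP ranks:
    copies = [torch.ones(4), torch.ones(4)]
    print(pairwise_max_abs_diff(copies))
    # tensor([[0., 0.],
    #         [0., 0.]])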