From 07ccb3db1717f300fd09afaaf9eac57678ca0e5d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 23:23:54 +0100
Subject: [PATCH 01/37] Better

---
 tests/test_training.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index fb72e59c6..bc64ffb73 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -682,6 +682,8 @@ def test_layer_norm_consistent(self, variation):
             execute_subprocess_async(cmd, env=self.get_env())
 
         checkpoints = ["global_step10", "global_step20"]
+
+        # Check transformer layer norm
         keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]]
         for checkpoint in checkpoints:
@@ -693,6 +695,7 @@ def test_layer_norm_consistent(self, variation):
                     for weight in weights[1:]:
                         torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
 
+        # Check embed layer norm
         keys_to_compare = ["word_embeddings.norm.weight"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]]
         for checkpoint in checkpoints:
@@ -702,4 +705,4 @@ def test_layer_norm_consistent(self, variation):
                 weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                 ref = weights[0]
                 for weight in weights[1:]:
-                    torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
\ No newline at end of file
+                    torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)

From 391ed4882546c38dea1e6c37e65e411cab06add5 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 24 Mar 2022 23:36:27 +0100
Subject: [PATCH 02/37] Force synchronize the layer norm parameters across all TP

---
 megatron/model/fused_layer_norm.py | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 8430f528c..9a7b5f06e 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -84,19 +84,11 @@ def reset_parameters(self):
 
 
     def forward(self, input):
-        weights = [torch.empty_like(self.weight) for tp in range(mpu.get_tensor_model_parallel_world_size())]
-        torch.distributed.all_gather(weights, self.weight, group=mpu.get_tensor_model_parallel_group())
-        biases = [torch.empty_like(self.bias) for tp in range(mpu.get_tensor_model_parallel_world_size())]
-        torch.distributed.all_gather(biases, self.bias, group=mpu.get_tensor_model_parallel_group())
-        if any(torch.any(weight != self.weight) for weight in weights):
-            if mpu.get_tensor_model_parallel_rank() == 0:
-                print("Weight sync failed")
-                print(weights)
-        if any(torch.any(bias != self.bias) for bias in biases):
-            if mpu.get_tensor_model_parallel_rank() == 0:
-                print("Bias sync failed")
-                print(biases)
+        tp_world_size = mpu.get_tensor_model_parallel_world_size()
+        # TODO: hack in order to synchronize all layer norms despite them being unsynched
+        weight = mpu.reduce_from_tensor_model_parallel_region(self.weight) / tp_world_size
+        bias = mpu.reduce_from_tensor_model_parallel_region(self.bias) / tp_world_size
 
         return FusedLayerNormAffineFunction.apply(
-            input, self.weight, self.bias, self.normalized_shape,self.eps)
+            input, weight, bias, self.normalized_shape,self.eps)
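A note on the trick patch 02 introduces: rather than asserting that the tensor-parallel ranks hold identical layer norm weights, it makes them identical by averaging them across the TP group on every forward pass (`reduce_from_tensor_model_parallel_region` is an all-reduce). Below is a minimal standalone sketch of that averaging, outside Megatron's mpu wrappers — the function name and the one-shot, in-place framing are illustrative only, and the clone matters because `all_reduce` mutates its input (the very pitfall patch 10 below fixes):

    import torch
    import torch.distributed as dist

    def sync_layer_norm_params(weight, bias, group):
        # Average each parameter over the tensor-parallel group so that every
        # rank ends up with identical layer norm parameters.
        with torch.no_grad():
            for param in (weight, bias):
                buf = param.detach().clone()   # all_reduce works in place: clone first
                dist.all_reduce(buf, op=dist.ReduceOp.SUM, group=group)
                param.copy_(buf / dist.get_world_size(group))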
From 98d0e7cc0b383d828a23c754010e14aee76404c4 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 18:42:31 -0700
Subject: [PATCH 03/37] import mpu

---
 megatron/model/fused_layer_norm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 9a7b5f06e..410a9bcd4 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -23,6 +23,7 @@
 from torch.nn.parameter import Parameter
 from torch.nn import init
 import importlib
+from megatron import mpu
 
 global fused_mix_prec_layer_norm_cuda
 fused_mix_prec_layer_norm_cuda = None

From 279a77ebf2c878e4aad9392df3188fa2f391665a Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:05:27 -0700
Subject: [PATCH 04/37] use the bf16 branch for testing

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index da76b5e44..08e48cd26 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,8 @@ six
 tensorboard
 torch>=1.7
 transformers
-DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git
+# for now using this branch for bf16 work
+DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git@olruwase/bf16-updates
 # versions from HF transformers
 black==21.4b0
 isort>=5.5.4

From 87a9dba0b9e91c09c410229f2ab97dec9c98afe0 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:12:14 -0700
Subject: [PATCH 05/37] `torch.testing.assert_equal` didn't make it (#273)

looks like pt-1.11 dropped `torch.testing.assert_equal`, so using
`torch.testing.assert_close` instead
---
 megatron/testing_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/megatron/testing_utils.py b/megatron/testing_utils.py
index 2143b610b..9521cb361 100644
--- a/megatron/testing_utils.py
+++ b/megatron/testing_utils.py
@@ -232,9 +232,9 @@ def get_gpu_count():
         return 0
 
 def torch_assert_equal(actual, expected, **kwargs):
-    # assert_equal was added around pt-1.9, it does better checks - e.g will check dimensions match
-    if hasattr(torch.testing, "assert_equal"):
-        return torch.testing.assert_equal(actual, expected, **kwargs)
+    # assert_close was added around pt-1.9, it does better checks - e.g will check dimensions match
+    if hasattr(torch.testing, "assert_close"):
+        return torch.testing.assert_close(actual, expected, rtol=0.0, atol=0.0, **kwargs)
     else:
         return torch.allclose(actual, expected, rtol=0.0, atol=0.0)
 
@@ -886,4 +886,4 @@ def flatten_arguments(args):
     Example: {"arg1": "value1", "arg2": "value2"} -> ["IGNORED", "arg1", "value1", "arg2", "value2"]
     """
-    return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""]
\ No newline at end of file
+    return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""]

From 70f91f8208a9459218391e0e581f908066bb1b86 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:35:12 -0700
Subject: [PATCH 06/37] bf16 comms require pt-1.11

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 08e48cd26..f0ec53a7d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ pybind11
 regex
 six
 tensorboard
-torch>=1.7
+torch>=1.11
 transformers
 # for now using this branch for bf16 work
 DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git@olruwase/bf16-updates
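Since patch 05's helper is what every consistency check in this series leans on, here is a self-contained restatement of it. The version guard and the rtol=atol=0.0 exact comparison mirror the patch; the asserting fallback is an editorial addition, because `torch.allclose` only returns a bool rather than raising:

    import torch

    def torch_assert_equal(actual, expected, **kwargs):
        # assert_close appeared around pt-1.9; with rtol=atol=0 it is an exact
        # equality check that also validates shape and dtype.
        if hasattr(torch.testing, "assert_close"):
            return torch.testing.assert_close(actual, expected, rtol=0.0, atol=0.0, **kwargs)
        # older torch: allclose merely returns a bool, so assert on it
        assert torch.allclose(actual, expected, rtol=0.0, atol=0.0), "tensors differ"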
From 835a3e5c952244da1b54e4b9ff20ba631341e99d Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:36:21 -0700
Subject: [PATCH 07/37] already part of the function

---
 tests/test_training.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_training.py b/tests/test_training.py
index bc64ffb73..65067982e 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -693,7 +693,7 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
-                        torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                        torch_assert_equal(ref, weight, check_device=False)
 
         # Check embed layer norm
         keys_to_compare = ["word_embeddings.norm.weight"]
         files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]]
         for checkpoint in checkpoints:
@@ -705,4 +705,4 @@ def test_layer_norm_consistent(self, variation):
                 weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                 ref = weights[0]
                 for weight in weights[1:]:
-                    torch_assert_equal(ref, weight, rtol=0.0, atol=0.0, check_device=False)
+                    torch_assert_equal(ref, weight, check_device=False)

From 37795a9238e86b124c3c611fb4869d7193aa4542 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:42:40 -0700
Subject: [PATCH 08/37] reproduce the crash on resume

---
 tests/test_training.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_training.py b/tests/test_training.py
index 65067982e..3c3db39c1 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -706,3 +706,14 @@ def test_layer_norm_consistent(self, variation):
                 ref = weights[0]
                 for weight in weights[1:]:
                     torch_assert_equal(ref, weight, check_device=False)
+
+        # 2. test training from checkpoint: resume
+        # now do it again, this time resuming from the checkpoint
+        with CaptureStdout() as cs:
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        # test checkpoint loading
+        self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
+
+        # test reports
+        self.assertIn("consumed samples", cs.out)

From 3ec65f7c3b24aa67ff5beccbd23aa728a153c025 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 24 Mar 2022 19:54:38 -0700
Subject: [PATCH 09/37] run just the test we want for now

---
 .github/workflows/main.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e343df39e..a2786a402 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -186,7 +186,9 @@ jobs:
           pip install pytest-timeout
 
       - name: Run tests
-        run: pytest --timeout=600 tests
+        # run: pytest --timeout=600 tests
+        # run just the test we want for now
+        run: pytest --timeout=600 tests/test_training.py::MegDSTestTraining::test_layer_norm_consistent_0_bf16
 
   stop-runner:
     name: Stop self-hosted EC2 runner

From 8271d419fcb950eb0083ed95db1957a83bfab258 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 25 Mar 2022 10:50:58 +0100
Subject: [PATCH 10/37] all_reduce is an in-place operation

---
 megatron/model/fused_layer_norm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py
index 410a9bcd4..100595d26 100644
--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -87,8 +87,10 @@ def reset_parameters(self):
     def forward(self, input):
         tp_world_size = mpu.get_tensor_model_parallel_world_size()
         # TODO: hack in order to synchronize all layer norms despite them being unsynched
-        weight = 
mpu.reduce_from_tensor_model_parallel_region(self.weight) / tp_world_size - bias = mpu.reduce_from_tensor_model_parallel_region(self.bias) / tp_world_size + weight = torch.clone(self.weight) + bias = torch.clone(self.bias) + weight = mpu.reduce_from_tensor_model_parallel_region(weight) / tp_world_size + bias = mpu.reduce_from_tensor_model_parallel_region(bias) / tp_world_size return FusedLayerNormAffineFunction.apply( input, weight, bias, self.normalized_shape,self.eps) From b418b47a6e27c69c9eef36177b9b48e965b198e9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 11:52:08 +0100 Subject: [PATCH 11/37] Make a test that TP reshaping works --- tests/test_tensor_parallel.py | 152 +++++++++++++++++++++++++++++++++- tests/test_training.py | 123 --------------------------- 2 files changed, 150 insertions(+), 125 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index ed383e17a..a117ede61 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -1,3 +1,4 @@ +import os import unittest from random import randint from unittest.mock import patch @@ -9,7 +10,8 @@ import pytest from megatron import initialize_megatron, get_args, get_tokenizer, global_vars -from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, require_torch_multi_gpu +from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, \ + require_torch_multi_gpu, torch_assert_equal, CaptureStdout, execute_subprocess_async from megatron.training import setup_model_and_optimizer from megatron.mpu.mappings import gather_from_tensor_model_parallel_region from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe @@ -64,7 +66,7 @@ def get_default_args(self): #ds args "--deepspeed": "", - "--deepspeed_config":f"{self.test_file_dir_str}/ds_config.json", + "--deepspeed_config": f"{self.test_file_dir_str}/ds_config.json", "--zero-stage": "1", "--deepspeed-activation-checkpointing": "" # DATA_ARGS @@ -293,5 +295,151 @@ def test_tokenizer_raise_error_make_vocab_size_divisible_by(self): self.assertEqual(str(exc_info.value), "5121 is not divisible by 128") + @parameterized.expand(["bf16", "fp16"]) + def test_layer_norm_consistent(self, variation): + src_dir = self.src_dir + output_dir = self.get_auto_remove_tmp_dir() + num_gpus = 2 + seq_len = 128 + data_dir = f"{self.data_dir}/gpt2" + default_args = f""" + --pipeline-model-parallel-size 1 + --distributed-backend nccl + + --log-interval 1 + --save-interval 10 + --eval-interval 10 + --eval-iters 5 + --checkpoint-activations + --partition-activations + --exit-interval {20} + + --merge-file {data_dir}/gpt2-tiny-merges.txt + --vocab-file {data_dir}/gpt2-tiny-vocab.json + --save {output_dir}/checkpoints + --load {output_dir}/checkpoints + --data-path {data_dir}/meg-gpt2-openwebtext_text_document + --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 + --log-timers-to-tensorboard + --log-batch-size-to-tensorboard + --log-validation-ppl-to-tensorboard + + --num-layers 2 + --hidden-size 64 + --num-attention-heads 2 + --seq-length {seq_len} + --max-position-embeddings 1024 + --micro-batch-size 2 + --global-batch-size 16 + + --optimizer adam + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --lr 1e-1 + --clip-grad 1.0 + --weight-decay 1e-1 + --embed-layernorm + + --log-level debug + --log-level-replica info + + --rampup-batch-size 2 2 
200 + --train-samples 200 + + --position-embedding-type alibi + """.split() + + command_args = self.get_default_args() + command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary + command_args["--position-embedding-type"] = "alibi" + command_args["--tensor-model-parallel-size"] = "2" + + if variation == "bf16": + command_args["--bf16"] = "" + command_args["--deepspeed_config"] = f"{self.test_file_dir_str}/ds_config_bf16.json" + command_args["--zero-stage"] = "0" + elif variation == "fp16": + command_args["--fp16"] = "" + command_args["--deepspeed_config"] = f"{self.test_file_dir_str}/ds_config.json" + command_args["--zero-stage"] = "1" + + # args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) + + script = [f"{src_dir}/pretrain_gpt.py"] + launcher = f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() + cmd = launcher + script + " ".join([f"{key} {value}" for key, value in command_args.items()]) + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + checkpoints = ["global_step10", "global_step20"] + + # Check transformer layer norm + keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", + "post_attention_layernorm.bias"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + layer_id in [3, 4]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + + # Check embed layer norm + keys_to_compare = ["word_embeddings.norm.weight"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + layer_id in [1]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + + # # 2. test training from checkpoint: resume + # # now do it again, this time resuming from the checkpoint + # with CaptureStdout() as cs: + # execute_subprocess_async(cmd, env=self.get_env()) + # + # # test checkpoint loading + # self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) + # + # # test reports + # self.assertIn("consumed samples", cs.out) + + # 3. test that inference with changes TP works. 
+ command_args["--tensor-model-parallel-size"] = "1" + + pool = Pool(1) + result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, output_dir, None))]) + pool.close() + pool.join() + + output, tokens = result[0] + logging.getLogger().info("First done!") + + command_args["--tensor-model-parallel-size"] = "2" + + pool = Pool(2) + result = pool.map(MegDSTestTP.infer_model, + [((0, 2, command_args, tokens, None, output_dir)), ((1, 2, command_args, tokens, None, output_dir))]) + pool.close() + pool.join() + + output2, tokens = result[0] + + logging.getLogger().critical(output - output2) + self.assertTrue(np.allclose(output, output2, atol=5e-3, rtol=0), + "Different results when running with TP=1 and TP=2") + if __name__ == '__main__': unittest.main() diff --git a/tests/test_training.py b/tests/test_training.py index 3c3db39c1..bf31bf904 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -594,126 +594,3 @@ def test_skip_train_iteration(self): train_iterations = range(1,10) for i in train_iterations: self.assertTrue(f"iteration {i:8d}/" in cs.out) - - @parameterized.expand(["bf16", "fp16"]) - def test_layer_norm_consistent(self, variation): - src_dir = self.src_dir - output_dir = self.get_auto_remove_tmp_dir() - num_gpus = 2 - seq_len = 128 - data_dir = f"{self.data_dir}/gpt2" - args = f""" - --tensor-model-parallel-size {2} - --pipeline-model-parallel-size {1} - --distributed-backend nccl - - --log-interval 1 - --save-interval 10 - --eval-interval 10 - --eval-iters 5 - --checkpoint-activations - --partition-activations - --exit-interval {20} - - --merge-file {data_dir}/gpt2-tiny-merges.txt - --vocab-file {data_dir}/gpt2-tiny-vocab.json - --save {output_dir}/checkpoints - --load {output_dir}/checkpoints - --data-path {data_dir}/meg-gpt2-openwebtext_text_document - --tensorboard-dir {output_dir}/tensorboard - --tensorboard-queue-size 5 - --log-timers-to-tensorboard - --log-batch-size-to-tensorboard - --log-validation-ppl-to-tensorboard - - --num-layers 2 - --hidden-size 64 - --num-attention-heads 2 - --seq-length {seq_len} - --max-position-embeddings 1024 - --micro-batch-size 2 - --global-batch-size 16 - - --optimizer adam - --adam-beta1 0.9 - --adam-beta2 0.95 - --adam-eps 1e-8 - --lr 1e-1 - --clip-grad 1.0 - --weight-decay 1e-1 - --embed-layernorm - - --log-level debug - --log-level-replica info - - --rampup-batch-size 2 2 200 - --train-samples 200 - - --position-embedding-type alibi - """.split() - - ds_args = f""" - --deepspeed - --deepspeed-activation-checkpointing - """.split() - - if variation == "bf16": - args.append("--bf16") - ds_args += [ - "--zero-stage", "0", - "--deepspeed_config", f"{self.test_file_dir_str}/ds_config_bf16.json" - ] - elif variation == "fp16": - args.append("--fp16") - ds_args += [ - "--zero-stage", "1", - "--deepspeed_config", f"{self.test_file_dir_str}/ds_config.json" - ] - - # args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200) - - script = [f"{src_dir}/pretrain_gpt.py"] - launcher = get_launcher(num_gpus) - cmd = launcher + script + args + ds_args - # keep for quick debug - # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die - - with CaptureStdout() as cs: - execute_subprocess_async(cmd, env=self.get_env()) - - checkpoints = ["global_step10", "global_step20"] - - # Check transformer layer norm - keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", "post_attention_layernorm.bias"] - files_to_compare = 
[[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3,4]] - for checkpoint in checkpoints: - checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - for key in keys_to_compare: - for files in files_to_compare: - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) - - # Check embed layer norm - keys_to_compare = ["word_embeddings.norm.weight"] - files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] - for checkpoint in checkpoints: - checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - for key in keys_to_compare: - for files in files_to_compare: - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) - - # 2. test training from checkpoint: resume - # now do it again, this time resuming from the checkpoint - with CaptureStdout() as cs: - execute_subprocess_async(cmd, env=self.get_env()) - - # test checkpoint loading - self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) - - # test reports - self.assertIn("consumed samples", cs.out) From 4b7207b5c4d7ecf084413ce3a7359d471e82584c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 11:53:13 +0100 Subject: [PATCH 12/37] Woops --- tests/test_tensor_parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index a117ede61..c894ca0b9 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -9,6 +9,8 @@ import numpy as np import pytest +from parameterized import parameterized + from megatron import initialize_megatron, get_args, get_tokenizer, global_vars from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, \ require_torch_multi_gpu, torch_assert_equal, CaptureStdout, execute_subprocess_async From 3bc58243d07e16ba61e12a24fb2f3fec4e893863 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 11:55:37 +0100 Subject: [PATCH 13/37] Woops --- tests/test_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index c894ca0b9..8f77b525d 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -371,7 +371,7 @@ def test_layer_norm_consistent(self, variation): script = [f"{src_dir}/pretrain_gpt.py"] launcher = f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() - cmd = launcher + script + " ".join([f"{key} {value}" for key, value in command_args.items()]) + cmd = launcher + script + [f"{key} {value}" for key, value in command_args.items()] # keep for quick debug # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die From 05c99db6d522c6e98a578f2207603897983e357a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:05:02 +0100 Subject: [PATCH 14/37] Woops --- tests/test_tensor_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 8f77b525d..b46c34e30 100644 --- 
a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -360,6 +360,7 @@ def test_layer_norm_consistent(self, variation): if variation == "bf16": command_args["--bf16"] = "" + del command_args["--fp16"] command_args["--deepspeed_config"] = f"{self.test_file_dir_str}/ds_config_bf16.json" command_args["--zero-stage"] = "0" elif variation == "fp16": @@ -371,7 +372,7 @@ def test_layer_norm_consistent(self, variation): script = [f"{src_dir}/pretrain_gpt.py"] launcher = f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() - cmd = launcher + script + [f"{key} {value}" for key, value in command_args.items()] + cmd = launcher + script + [elt for elts in [f"{key} {value}".split() for key, value in command_args.items()] for elt in elts] # keep for quick debug # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die From 55e10c63e4f487ad3ca58860d52ece9b250282ca Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:10:28 +0100 Subject: [PATCH 15/37] Woops --- tests/test_tensor_parallel.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index b46c34e30..03434f714 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -318,10 +318,7 @@ def test_layer_norm_consistent(self, variation): --merge-file {data_dir}/gpt2-tiny-merges.txt --vocab-file {data_dir}/gpt2-tiny-vocab.json - --save {output_dir}/checkpoints - --load {output_dir}/checkpoints - --data-path {data_dir}/meg-gpt2-openwebtext_text_document - --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard @@ -357,6 +354,11 @@ def test_layer_norm_consistent(self, variation): command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary command_args["--position-embedding-type"] = "alibi" command_args["--tensor-model-parallel-size"] = "2" + command_args["--save"] = f"{output_dir}/checkpoints" + command_args["--load"] = f"{output_dir}/checkpoints" + command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" + command_args["--tensorboard-dir"] = f"{output_dir}/tensorboard" + command_args["--lr"] = "1e-1" if variation == "bf16": command_args["--bf16"] = "" From 2ab8a3ac872396a3029b058e56e1146847342602 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:27:48 +0100 Subject: [PATCH 16/37] Woops --- tests/test_tensor_parallel.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 03434f714..19b3e5d49 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -61,9 +61,9 @@ def get_default_args(self): # OUTPUT_ARGS "--log-interval": "10", - "--save-interval": "500", - "--eval-interval": "100", - "--eval-iters": "10", + "--save-interval": "10", + "--eval-interval": "10", + "--eval-iters": "5", "--checkpoint-activations": "", #ds args @@ -357,6 +357,9 @@ def test_layer_norm_consistent(self, variation): command_args["--save"] = f"{output_dir}/checkpoints" command_args["--load"] = f"{output_dir}/checkpoints" command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" + command_args["--train-samples"] = "200" + del command_args["--train-iters"] + del command_args["--lr-decay-iters"] command_args["--tensorboard-dir"] = 
f"{output_dir}/tensorboard" command_args["--lr"] = "1e-1" From d357839dc280174c492d25c4bccc5beaa20d415d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:40:17 +0100 Subject: [PATCH 17/37] Woops --- tests/test_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 19b3e5d49..a3e3206e5 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -48,7 +48,6 @@ def get_default_args(self): "--merge-file": f"{data_dir}/gpt2-tiny-merges.txt", "--vocab-file": f"{data_dir}/gpt2-tiny-vocab.json", "--data-impl": "mmap", - "--split": "949,50,1", "--distributed-backend": "nccl", "--weight-decay": "1e-2", "--clip-grad": "1.0", @@ -358,6 +357,7 @@ def test_layer_norm_consistent(self, variation): command_args["--load"] = f"{output_dir}/checkpoints" command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" command_args["--train-samples"] = "200" + command_args["--seq-length"] = "128" del command_args["--train-iters"] del command_args["--lr-decay-iters"] command_args["--tensorboard-dir"] = f"{output_dir}/tensorboard" From 5fb231c158d96f1191796af6a04065f3716d2b81 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 12:42:32 +0100 Subject: [PATCH 18/37] Woops --- tests/test_tensor_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index a3e3206e5..8072ac5eb 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -358,6 +358,7 @@ def test_layer_norm_consistent(self, variation): command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" command_args["--train-samples"] = "200" command_args["--seq-length"] = "128" + command_args["--exit-interval"] = "20" del command_args["--train-iters"] del command_args["--lr-decay-iters"] command_args["--tensorboard-dir"] = f"{output_dir}/tensorboard" From cc7ff45b46dd805dc1c93ba7999f00d5e99d961b Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 14:14:28 +0100 Subject: [PATCH 19/37] Woops --- tests/test_tensor_parallel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 8072ac5eb..1350e3918 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -352,11 +352,13 @@ def test_layer_norm_consistent(self, variation): command_args = self.get_default_args() command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary command_args["--position-embedding-type"] = "alibi" + command_args["--embed-layernorm"] = "" command_args["--tensor-model-parallel-size"] = "2" command_args["--save"] = f"{output_dir}/checkpoints" command_args["--load"] = f"{output_dir}/checkpoints" command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document" command_args["--train-samples"] = "200" + command_args["--rampup-batch-size"] = "4 4 200" command_args["--seq-length"] = "128" command_args["--exit-interval"] = "20" del command_args["--train-iters"] @@ -426,6 +428,8 @@ def test_layer_norm_consistent(self, variation): # self.assertIn("consumed samples", cs.out) # 3. test that inference with changes TP works. 
+ mp.set_start_method('spawn', force=True) + del command_args["--rampup-batch-size"] command_args["--tensor-model-parallel-size"] = "1" pool = Pool(1) From 7cdb1be83fe865d378f8eb3a0423c3fd5ce68f71 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 14:29:12 +0100 Subject: [PATCH 20/37] Woops --- tests/test_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 1350e3918..26491b6df 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -433,7 +433,7 @@ def test_layer_norm_consistent(self, variation): command_args["--tensor-model-parallel-size"] = "1" pool = Pool(1) - result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, output_dir, None))]) + result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, None, output_dir))]) pool.close() pool.join() From 4574ec978b36d01f1b4a083f650906d3a22d9d46 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 14:43:55 +0100 Subject: [PATCH 21/37] Fix load issue --- tests/test_tensor_parallel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 26491b6df..b4b04d426 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -118,8 +118,6 @@ def create_model_inputs(tokens): tokenizer = get_tokenizer() - model, _, _ = setup_model_and_optimizer(gpt_model_provider) - model = model[0] if load is not None: # Hack (same as in eval_harness/evaluate.py) # Loading pipelined models in deepspeed with different TP than it was originally trained on fails @@ -130,6 +128,10 @@ def create_model_inputs(tokens): # Deepspeed does however manage to load the model if we just turn off this sanity check. 
deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None + model, _, _ = setup_model_and_optimizer(gpt_model_provider) + model = model[0] + + if load is not None: zero_enabled = model._config.zero_enabled model._config.zero_enabled = False _, _ = model.load_checkpoint(load, load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True) From 04e89d1413924121c37ea7d044c7136504ffa551 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 15:12:31 +0100 Subject: [PATCH 22/37] Woops --- tests/test_tensor_parallel.py | 47 ++--------------------------------- 1 file changed, 2 insertions(+), 45 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index b4b04d426..97468ecf4 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -305,51 +305,6 @@ def test_layer_norm_consistent(self, variation): num_gpus = 2 seq_len = 128 data_dir = f"{self.data_dir}/gpt2" - default_args = f""" - --pipeline-model-parallel-size 1 - --distributed-backend nccl - - --log-interval 1 - --save-interval 10 - --eval-interval 10 - --eval-iters 5 - --checkpoint-activations - --partition-activations - --exit-interval {20} - - --merge-file {data_dir}/gpt2-tiny-merges.txt - --vocab-file {data_dir}/gpt2-tiny-vocab.json - - --tensorboard-queue-size 5 - --log-timers-to-tensorboard - --log-batch-size-to-tensorboard - --log-validation-ppl-to-tensorboard - - --num-layers 2 - --hidden-size 64 - --num-attention-heads 2 - --seq-length {seq_len} - --max-position-embeddings 1024 - --micro-batch-size 2 - --global-batch-size 16 - - --optimizer adam - --adam-beta1 0.9 - --adam-beta2 0.95 - --adam-eps 1e-8 - --lr 1e-1 - --clip-grad 1.0 - --weight-decay 1e-1 - --embed-layernorm - - --log-level debug - --log-level-replica info - - --rampup-batch-size 2 2 200 - --train-samples 200 - - --position-embedding-type alibi - """.split() command_args = self.get_default_args() command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary @@ -433,6 +388,8 @@ def test_layer_norm_consistent(self, variation): mp.set_start_method('spawn', force=True) del command_args["--rampup-batch-size"] command_args["--tensor-model-parallel-size"] = "1" + del command_args["--load"] + del command_args["--save"] pool = Pool(1) result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, None, output_dir))]) From e9431002a20262172d5a97ebfa70fb7027d207e8 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 16:28:52 +0100 Subject: [PATCH 23/37] Fix checkpoint path --- tests/test_tensor_parallel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 97468ecf4..62d4a4775 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -195,7 +195,7 @@ def test_alibi_tp(self): output2, tokens = result[0] logging.getLogger().critical(output-output2) - self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2") + self.assertTrue(np.allclose(output, output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2") @@ -391,8 +391,9 @@ def test_layer_norm_consistent(self, variation): del command_args["--load"] del command_args["--save"] + checkpoints_path = os.path.join(output_dir, "checkpoints") pool = 
Pool(1) - result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, None, output_dir))]) + result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, None, checkpoints_path))]) pool.close() pool.join() @@ -403,7 +404,7 @@ def test_layer_norm_consistent(self, variation): pool = Pool(2) result = pool.map(MegDSTestTP.infer_model, - [((0, 2, command_args, tokens, None, output_dir)), ((1, 2, command_args, tokens, None, output_dir))]) + [((0, 2, command_args, tokens, None, checkpoints_path)), ((1, 2, command_args, tokens, None, checkpoints_path))]) pool.close() pool.join() From 09cead38c3dd0447b3473bb5efbbf180e1cebd05 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 17:57:04 +0100 Subject: [PATCH 24/37] Test that force sync will allow TP changes --- megatron/arguments.py | 1 + megatron/model/fused_layer_norm.py | 20 +++++++++++++------- tests/test_tensor_parallel.py | 1 + 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 2be64b77d..194a518ba 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -738,6 +738,7 @@ def _add_distributed_args(parser): group.add_argument('--use-cpu-initialization', action='store_true', default=None, help='If set, affine parallel weights ' 'initialization uses CPU' ) + group.add_argument('--force-sync-layer-norm-parameters', action="store_true") return parser diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 100595d26..cd10e5b77 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -19,7 +19,7 @@ import numbers import torch -from megatron import mpu +from megatron import mpu, get_args from torch.nn.parameter import Parameter from torch.nn import init import importlib @@ -64,6 +64,7 @@ class MixedFusedLayerNorm(torch.nn.Module): def __init__(self, normalized_shape, eps=1e-5): super(MixedFusedLayerNorm, self).__init__() + args = get_args() global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = importlib.import_module( @@ -76,6 +77,7 @@ def __init__(self, normalized_shape, eps=1e-5): self.weight = Parameter(torch.Tensor(*normalized_shape)) self.bias = Parameter(torch.Tensor(*normalized_shape)) self.reset_parameters() + self.force_sync_layer_norm_parameters = args.force_sync_layer_norm_parameters def reset_parameters(self): @@ -85,12 +87,16 @@ def reset_parameters(self): def forward(self, input): - tp_world_size = mpu.get_tensor_model_parallel_world_size() - # TODO: hack in order to synchronize all layer norms despite them being unsynched - weight = torch.clone(self.weight) - bias = torch.clone(self.bias) - weight = mpu.reduce_from_tensor_model_parallel_region(weight) / tp_world_size - bias = mpu.reduce_from_tensor_model_parallel_region(bias) / tp_world_size + if self.force_sync_layer_norm_parameters: + tp_world_size = mpu.get_tensor_model_parallel_world_size() + # TODO: hack in order to synchronize all layer norms despite them being unsynched + weight = torch.clone(self.weight) + bias = torch.clone(self.bias) + weight = mpu.reduce_from_tensor_model_parallel_region(weight) / tp_world_size + bias = mpu.reduce_from_tensor_model_parallel_region(bias) / tp_world_size + else: + weight = self.weight + bias = self.bias return FusedLayerNormAffineFunction.apply( input, weight, bias, self.normalized_shape,self.eps) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 62d4a4775..d9a0f7fd0 100644 --- 
a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -390,6 +390,7 @@ def test_layer_norm_consistent(self, variation): command_args["--tensor-model-parallel-size"] = "1" del command_args["--load"] del command_args["--save"] + command_args["--force-sync-layer-norm-parameters"] = "" checkpoints_path = os.path.join(output_dir, "checkpoints") pool = Pool(1) From 77abee61db2da4945a029df246fd00b71a0c3d79 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 25 Mar 2022 18:19:13 +0100 Subject: [PATCH 25/37] Nit --- tests/test_tensor_parallel.py | 57 ++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index d9a0f7fd0..cbb67ca0a 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -344,34 +344,35 @@ def test_layer_norm_consistent(self, variation): with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) - checkpoints = ["global_step10", "global_step20"] - - # Check transformer layer norm - keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", - "post_attention_layernorm.bias"] - files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for - layer_id in [3, 4]] - for checkpoint in checkpoints: - checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - for key in keys_to_compare: - for files in files_to_compare: - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) - - # Check embed layer norm - keys_to_compare = ["word_embeddings.norm.weight"] - files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for - layer_id in [1]] - for checkpoint in checkpoints: - checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - for key in keys_to_compare: - for files in files_to_compare: - weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) + # # 1. 
test that the layer norm weights and biases are synchronized + # checkpoints = ["global_step10", "global_step20"] + + # # Check transformer layer norm + # keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", + # "post_attention_layernorm.bias"] + # files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + # layer_id in [3, 4]] + # for checkpoint in checkpoints: + # checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + # for key in keys_to_compare: + # for files in files_to_compare: + # weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + # ref = weights[0] + # for weight in weights[1:]: + # torch_assert_equal(ref, weight, check_device=False) + # + # # Check embed layer norm + # keys_to_compare = ["word_embeddings.norm.weight"] + # files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + # layer_id in [1]] + # for checkpoint in checkpoints: + # checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + # for key in keys_to_compare: + # for files in files_to_compare: + # weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + # ref = weights[0] + # for weight in weights[1:]: + # torch_assert_equal(ref, weight, check_device=False) # # 2. test training from checkpoint: resume # # now do it again, this time resuming from the checkpoint From 64a62c8081d5e1c9184c169e4f50372f4e82d92f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:03:00 +0200 Subject: [PATCH 26/37] Now that we have a force sync mechanism, let's try to reproduce --- tests/test_tensor_parallel.py | 69 ++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index cbb67ca0a..17ac9dab6 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -344,35 +344,46 @@ def test_layer_norm_consistent(self, variation): with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) - # # 1. 
test that the layer norm weights and biases are synchronized - # checkpoints = ["global_step10", "global_step20"] - - # # Check transformer layer norm - # keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", - # "post_attention_layernorm.bias"] - # files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for - # layer_id in [3, 4]] - # for checkpoint in checkpoints: - # checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - # for key in keys_to_compare: - # for files in files_to_compare: - # weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - # ref = weights[0] - # for weight in weights[1:]: - # torch_assert_equal(ref, weight, check_device=False) - # - # # Check embed layer norm - # keys_to_compare = ["word_embeddings.norm.weight"] - # files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for - # layer_id in [1]] - # for checkpoint in checkpoints: - # checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) - # for key in keys_to_compare: - # for files in files_to_compare: - # weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - # ref = weights[0] - # for weight in weights[1:]: - # torch_assert_equal(ref, weight, check_device=False) + # 1. test that the layer norm weights and biases are synchronized + checkpoints = ["global_step10", "global_step20"] + + # Check transformer layer norm + keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", + "post_attention_layernorm.bias"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + layer_id in [3, 4]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + + # Check embed layer norm + keys_to_compare = ["word_embeddings.norm.weight"] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for + layer_id in [1]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + + keys_to_compare = ["random_rng_state", "np_rng_state", "torch_rng_state", "cuda_rng_state", "rng_tracker_states"] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + print(weights) + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) # # 2. 
test training from checkpoint: resume # # now do it again, this time resuming from the checkpoint From 0b7afcc943f972f0f0e3c78efc8d6fe160350485 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:15:00 +0200 Subject: [PATCH 27/37] Compare model_states_rank --- tests/test_tensor_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 17ac9dab6..bca25c6ae 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -375,6 +375,7 @@ def test_layer_norm_consistent(self, variation): torch_assert_equal(ref, weight, check_device=False) keys_to_compare = ["random_rng_state", "np_rng_state", "torch_rng_state", "cuda_rng_state", "rng_tracker_states"] + files_to_compare = [[f"mp_rank_{tp:02d}_model_states.pt" for tp in range(num_gpus)]] for checkpoint in checkpoints: checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) for key in keys_to_compare: From ce017338f400da2cee5b127ede22999b8726ea09 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:17:07 +0200 Subject: [PATCH 28/37] test --- tests/test_tensor_parallel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index bca25c6ae..4577f0e8c 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -379,12 +379,13 @@ def test_layer_norm_consistent(self, variation): for checkpoint in checkpoints: checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) for key in keys_to_compare: + print(key) for files in files_to_compare: weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] print(weights) - ref = weights[0] - for weight in weights[1:]: - torch_assert_equal(ref, weight, check_device=False) + # ref = weights[0] + # for weight in weights[1:]: + # assert ref == weight # # 2. 
test training from checkpoint: resume # # now do it again, this time resuming from the checkpoint From 89ab0b72b58b3fb8bd4a361f4211bfc5eeef07fb Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 14:47:57 +0200 Subject: [PATCH 29/37] Row column bias should be synchronized as well --- tests/test_tensor_parallel.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 4577f0e8c..1f1822c1f 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -348,8 +348,14 @@ def test_layer_norm_consistent(self, variation): checkpoints = ["global_step10", "global_step20"] # Check transformer layer norm - keys_to_compare = ["input_layernorm.weight", "input_layernorm.bias", "post_attention_layernorm.weight", - "post_attention_layernorm.bias"] + keys_to_compare = [ + "input_layernorm.weight", + "input_layernorm.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + "self_attention.dense.bias", + "mlp.dense_4h_to_h.bias" + ] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [3, 4]] for checkpoint in checkpoints: @@ -426,7 +432,7 @@ def test_layer_norm_consistent(self, variation): output2, tokens = result[0] logging.getLogger().critical(output - output2) - self.assertTrue(np.allclose(output, output2, atol=5e-3, rtol=0), + self.assertTrue(np.allclose(output, output2, atol=0, rtol=0), "Different results when running with TP=1 and TP=2") if __name__ == '__main__': From 42997b2a93438a67147588e80b8afa4688b67d93 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 14:59:43 +0200 Subject: [PATCH 30/37] New list of matching embeddings --- tests/test_tensor_parallel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 1f1822c1f..284a51057 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -368,7 +368,11 @@ def test_layer_norm_consistent(self, variation): torch_assert_equal(ref, weight, check_device=False) # Check embed layer norm - keys_to_compare = ["word_embeddings.norm.weight"] + keys_to_compare = [ + "word_embeddings.norm.weight", + "tokentype_embeddings.weight" + "position_embeddings.weight" + ] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] for checkpoint in checkpoints: From e0ef1683a4b7336d609179f015ece06c704a6f8c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 18:53:53 +0200 Subject: [PATCH 31/37] Figure out why state differs --- tests/test_tensor_parallel.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 284a51057..f490feede 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -384,18 +384,16 @@ def test_layer_norm_consistent(self, variation): for weight in weights[1:]: torch_assert_equal(ref, weight, check_device=False) - keys_to_compare = ["random_rng_state", "np_rng_state", "torch_rng_state", "cuda_rng_state", "rng_tracker_states"] + keys_to_compare = ["torch_rng_state"] files_to_compare = [[f"mp_rank_{tp:02d}_model_states.pt" for tp in range(num_gpus)]] for checkpoint in checkpoints: checkpoint_path = 
os.path.join(output_dir, "checkpoints", checkpoint) for key in keys_to_compare: - print(key) for files in files_to_compare: weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] - print(weights) - # ref = weights[0] - # for weight in weights[1:]: - # assert ref == weight + ref = weights[0] + for weight in weights[1:]: + assert ref == weight, f"key: {key} ref: {ref}, weight: {weight}" # # 2. test training from checkpoint: resume # # now do it again, this time resuming from the checkpoint From 1fc4fe82787b31366f8ac2ba9ac4ebcb7ebee756 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 18:57:52 +0200 Subject: [PATCH 32/37] Test for final weight --- tests/test_tensor_parallel.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index f490feede..712df08d7 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -370,8 +370,7 @@ def test_layer_norm_consistent(self, variation): # Check embed layer norm keys_to_compare = [ "word_embeddings.norm.weight", - "tokentype_embeddings.weight" - "position_embeddings.weight" + "word_embeddings.norm.bias" ] files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for layer_id in [1]] @@ -384,6 +383,23 @@ def test_layer_norm_consistent(self, variation): for weight in weights[1:]: torch_assert_equal(ref, weight, check_device=False) + # Final layer norm + keys_to_compare = [ + "weight", + "bias" + ] + files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] + for + layer_id in [6]] + for checkpoint in checkpoints: + checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint) + for key in keys_to_compare: + for files in files_to_compare: + weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] + ref = weights[0] + for weight in weights[1:]: + torch_assert_equal(ref, weight, check_device=False) + keys_to_compare = ["torch_rng_state"] files_to_compare = [[f"mp_rank_{tp:02d}_model_states.pt" for tp in range(num_gpus)]] for checkpoint in checkpoints: From 7ebbed16ce52e8c88597595621293ada9da8b01c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 29 Mar 2022 19:23:32 +0200 Subject: [PATCH 33/37] Test that torch_rng_state --- tests/test_tensor_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py index 712df08d7..3fbd4b225 100644 --- a/tests/test_tensor_parallel.py +++ b/tests/test_tensor_parallel.py @@ -409,7 +409,7 @@ def test_layer_norm_consistent(self, variation): weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files] ref = weights[0] for weight in weights[1:]: - assert ref == weight, f"key: {key} ref: {ref}, weight: {weight}" + assert (ref == weight).all(), f"key: {key} ref: {ref}, weight: {weight}" # # 2. 
test training from checkpoint: resume
         # # now do it again, this time resuming from the checkpoint
         # with CaptureStdout() as cs:
         #     execute_subprocess_async(cmd, env=self.get_env())
         #
         # # test checkpoint loading
         # self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
         #
         # # test reports
         # self.assertIn("consumed samples", cs.out)

From 2c49216a092ab1d97ca4895c6ee66a8458341b03 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 31 Mar 2022 01:59:36 +0200
Subject: [PATCH 34/37] Fix non-matching torch_rng_state for tp_rank=0

---
 megatron/data/data_samplers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py
index 1cbeac312..b933ff34e 100644
--- a/megatron/data/data_samplers.py
+++ b/megatron/data/data_samplers.py
@@ -52,6 +52,7 @@ def build_pretraining_data_loader(dataset, consumed_samples):
     return torch.utils.data.DataLoader(dataset,
                                        batch_sampler=batch_sampler,
                                        num_workers=args.num_workers,
+                                       generator=torch.Generator().manual_seed(args.seed),
                                        pin_memory=True)
 
 class MegatronPretrainingSampler:
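Patch 34 deserves a short explanation: a DataLoader built without an explicit generator draws its per-iterator base seed from torch's global RNG, so merely creating and consuming the loader advances the global `torch_rng_state` — and in Megatron's data pipeline only tp_rank=0 builds the loader, which is presumably why its saved RNG state drifted from the other ranks. Handing the loader a private, seed-derived generator leaves the global state untouched. A minimal sketch of the effect in stock PyTorch (behavior as observed around pt-1.11, no Megatron pieces involved):

    import torch
    from torch.utils.data import DataLoader

    data = list(range(8))
    torch.manual_seed(1234)
    before = torch.get_rng_state()

    # No explicit generator: creating the iterator draws a base seed from the
    # global RNG, silently advancing the global state.
    next(iter(DataLoader(data, shuffle=True)))
    print(torch.equal(before, torch.get_rng_state()))   # False

    # With a private generator, as in the patch, the global state is untouched.
    torch.set_rng_state(before)
    gen = torch.Generator().manual_seed(1234)
    next(iter(DataLoader(data, shuffle=True, generator=gen)))
    print(torch.equal(before, torch.get_rng_state()))   # True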
From 007ecb4bb83465a20adc3644cc7419e01b092cd7 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 31 Mar 2022 14:17:35 +0200
Subject: [PATCH 35/37] Update test

---
 tests/test_tensor_parallel.py | 41 ++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py
index 3fbd4b225..5e821cf37 100644
--- a/tests/test_tensor_parallel.py
+++ b/tests/test_tensor_parallel.py
@@ -302,7 +302,9 @@ def test_tokenizer_raise_error_make_vocab_size_divisible_by(self):
     def test_layer_norm_consistent(self, variation):
         src_dir = self.src_dir
         output_dir = self.get_auto_remove_tmp_dir()
-        num_gpus = 2
+        tp_size = 2
+        pp_size = 1
+        num_gpus = tp_size * pp_size  # dp = 1
         seq_len = 128
         data_dir = f"{self.data_dir}/gpt2"
         command_args["--pad-vocab-size-to"] = "5120"  # This is equal to 128 * 40 which is above the len of gpt2-tiny vocabulary
         command_args["--position-embedding-type"] = "alibi"
         command_args["--embed-layernorm"] = ""
-        command_args["--tensor-model-parallel-size"] = "2"
+        command_args["--tensor-model-parallel-size"] = f"{tp_size}"
+        command_args["--pipeline-model-parallel-size"] = f"{pp_size}"
         command_args["--save"] = f"{output_dir}/checkpoints"
         command_args["--load"] = f"{output_dir}/checkpoints"
         command_args["--data-path"] = f"{data_dir}/meg-gpt2-openwebtext_text_document"
         command_args["--train-samples"] = "200"
         command_args["--rampup-batch-size"] = "4 4 200"
-        command_args["--seq-length"] = "128"
+        command_args["--seq-length"] = f"{seq_len}"
         command_args["--exit-interval"] = "20"
         del command_args["--train-iters"]
         del command_args["--lr-decay-iters"]
@@ -356,7 +359,7 @@ def test_layer_norm_consistent(self, variation):
             "self_attention.dense.bias",
             "mlp.dense_4h_to_h.bias"
         ]
-        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(tp_size)] for
                             layer_id in [3, 4]]
         for checkpoint in checkpoints:
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
@@ -372,7 +375,7 @@ def test_layer_norm_consistent(self, variation):
             "word_embeddings.norm.weight",
             "word_embeddings.norm.bias"
         ]
-        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)] for
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(tp_size)] for
                             layer_id in [1]]
         for checkpoint in checkpoints:
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
@@ -388,7 +391,7 @@ def test_layer_norm_consistent(self, variation):
             "weight",
             "bias"
         ]
-        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(num_gpus)]
+        files_to_compare = [[f"layer_{layer_id:02d}-model_{tp:02d}-model_states.pt" for tp in range(tp_size)]
                             for
                             layer_id in [6]]
         for checkpoint in checkpoints:
@@ -401,7 +404,7 @@ def test_layer_norm_consistent(self, variation):
                         torch_assert_equal(ref, weight, check_device=False)
 
         keys_to_compare = ["torch_rng_state"]
-        files_to_compare = [[f"mp_rank_{tp:02d}_model_states.pt" for tp in range(num_gpus)]]
+        files_to_compare = [[f"mp_rank_{tp + pp*tp_size:02d}_model_states.pt" for tp in range(tp_size)] for pp in range(pp_size)]
         for checkpoint in checkpoints:
             checkpoint_path = os.path.join(output_dir, "checkpoints", checkpoint)
             for key in keys_to_compare:
@@ -411,16 +414,20 @@ def test_layer_norm_consistent(self, variation):
                     weights = [torch.load(os.path.join(checkpoint_path, file))[key] for file in files]
                     ref = weights[0]
                     for weight in weights[1:]:
                         assert (ref == weight).all(), f"key: {key} ref: {ref}, weight: {weight}"
-        # # 2. test training from checkpoint: resume
-        # # now do it again, this time resuming from the checkpoint
-        # with CaptureStdout() as cs:
-        #     execute_subprocess_async(cmd, env=self.get_env())
-        #
-        # # test checkpoint loading
-        # self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
-        #
-        # # test reports
-        # self.assertIn("consumed samples", cs.out)
+
+        # 2. test training from checkpoint: resume
+        command_args["--exit-interval"] = "30"
+        cmd = launcher + script + [elt for elts in [f"{key} {value}".split() for key, value in command_args.items()] for elt in elts]
+
+        # now do it again, this time resuming from the checkpoint
+        with CaptureStdout() as cs:
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        # test checkpoint loading
+        self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
+
+        # test reports
+        self.assertIn("consumed samples", cs.out)
 
         # 3. test that inference with changed TP works.
         mp.set_start_method('spawn', force=True)

From c3844b5c3d23790bfb4f0f1cb9839e69c1d664ce Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 31 Mar 2022 15:22:38 +0200
Subject: [PATCH 36/37] I'm surprised one can apply an in-place operation here

---
 megatron/mpu/layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py
index 4d94156ac..e649a1259 100644
--- a/megatron/mpu/layers.py
+++ b/megatron/mpu/layers.py
@@ -241,7 +241,7 @@ def forward(self, input_):
                                      self.sparse)
         # Mask the output embedding.
         if self.tensor_model_parallel_size > 1:
-            output_parallel[input_mask, :] = 0.0
+            output_parallel = output_parallel.masked_fill(input_mask[..., None], 0.0)
         # Reduce across all the model parallel GPUs.
         output = reduce_from_tensor_model_parallel_region(output_parallel)
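On PATCH 36: the indexed in-place store (`output_parallel[input_mask, :] = 0.0`) happens to be legal under autograd, hence the subject line's surprise, but it mutates a tensor that is still part of the graph; `masked_fill` is the out-of-place equivalent and returns a fresh tensor. A toy sketch of the replacement (shapes and mask values are illustrative; `True` marks tokens owned by another tensor-parallel shard):

    import torch

    emb = torch.nn.Embedding(10, 4)
    input_ = torch.tensor([[1, 7, 3]])
    input_mask = torch.tensor([[False, True, False]])

    out = emb(input_)
    # The mask is broadcast over the hidden dimension by the trailing None,
    # zeroing whole rows without touching `out` itself:
    masked = out.masked_fill(input_mask[..., None], 0.0)

    masked.sum().backward()
    # Masked positions contribute zero gradient, as the later all-reduce
    # across tensor-parallel ranks requires:
    print(emb.weight.grad[7])  # tensor([0., 0., 0., 0.])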
From 189f0547f711104661328de07233f5e09a873026 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 4 Apr 2022 11:01:51 +0200
Subject: [PATCH 37/37] Test out the loss from the fp32 weights and optimizer states

---
 megatron/checkpointing.py | 63 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py
index d9a30f468..3fe5dafeb 100644
--- a/megatron/checkpointing.py
+++ b/megatron/checkpointing.py
@@ -169,6 +169,69 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
         # Trim off the filename and mp_rank_* directory.
         for _ in range(3):
             checkpoint_name = os.path.dirname(checkpoint_name)
+
+        # Debug
+        layer_norms_params_end_with = [
+            "word_embeddings.norm.weight", "word_embeddings.norm.bias",
+            "input_layernorm.weight", "input_layernorm.bias",
+            "post_attention_layernorm.weight", "post_attention_layernorm.bias",
+            "self_attention.dense.bias", "mlp.dense_4h_to_h.bias",
+        ]
+        for n, p in model[0].named_parameters():
+            # Here is how you can access the fp32 version of a bf16 param and its fp32 optim states
+            #
+            # Note that an all_reduce is called on all dp ranks whenever `get_full_hp_param`
+            # is called, so it's not free
+            #
+            # a. fp32 param
+            for end in layer_norms_params_end_with:
+                if n.endswith(end):
+                    fp32_param = p.get_full_hp_param()
+
+                    # gather_list may only be set on the destination rank
+                    fp32_params_accumulator = [
+                        torch.zeros_like(fp32_param)
+                        for _ in range(mpu.get_tensor_model_parallel_world_size())
+                    ] if mpu.get_tensor_model_parallel_rank() == 0 else None
+                    torch.distributed.gather(
+                        fp32_param,
+                        fp32_params_accumulator,
+                        dst=0,
+                        group=mpu.get_tensor_model_parallel_group()
+                    )
+                    if mpu.get_tensor_model_parallel_rank() == 0:
+                        square = torch.tensor([
+                            [
+                                torch.max(torch.abs(c1 - c2))
+                                for c2 in fp32_params_accumulator
+                            ] for c1 in fp32_params_accumulator
+                        ])
+                        print(f"Parameter name = {n}")
+                        print(square)
+
+                    # b. fp32 optim states
+                    for key in ['exp_avg', 'exp_avg_sq']:
+                        full_optim_state = p.get_full_hp_param(optim_state_key=key)
+
+                        full_optim_state_accumulator = [
+                            torch.zeros_like(full_optim_state)
+                            for _ in range(mpu.get_tensor_model_parallel_world_size())
+                        ] if mpu.get_tensor_model_parallel_rank() == 0 else None
+                        torch.distributed.gather(
+                            full_optim_state,
+                            full_optim_state_accumulator,
+                            dst=0,
+                            group=mpu.get_tensor_model_parallel_group()
+                        )
+                        if mpu.get_tensor_model_parallel_rank() == 0:
+                            square = torch.tensor([
+                                [
+                                    torch.max(torch.abs(c1 - c2))
+                                    for c2 in full_optim_state_accumulator
+                                ] for c1 in full_optim_state_accumulator
+                            ])
+                            print(f"Optimizer state: parameter name = {n}, key = {key}")
+                            print(square)
+
         model[0].save_checkpoint(checkpoint_name, client_state=state_dict)
 
         # Wait so everyone is done (necessary)
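The debug block in PATCH 37 reduces to one idea: gather every tensor-parallel rank's copy of a value and print the matrix of pairwise max absolute differences, which is all zeros exactly when the copies agree. The comparison extracted as a standalone sketch (hypothetical inputs in place of gathered fp32 shards):

    import torch

    def pairwise_max_abs_diff(copies):
        # m[i][j] = max |copies[i] - copies[j]|; an all-zero matrix means
        # every rank holds an identical copy of the tensor.
        return torch.tensor([[(a - b).abs().max().item() for b in copies]
                             for a in copies])

    # Stand-ins for one layer-norm weight gathered from two TP ranks:
    copies = [torch.ones(4), torch.ones(4)]
    print(pairwise_max_abs_diff(copies))
    # tensor([[0., 0.],
    #         [0., 0.]])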