Skip to content

Commit 8e1ff89

Browse files
tjruwase and mrwyattii
authored
Universal Checkpoint: BC for older DeepSpeed (bigscience-workshop#271)
* Enable universal ckpting * Update run scripts * Address PR feedback * Remove line * Fix white lines * Remove redudant changes * Apply to gpt_model only * Code cleanup * Code cleanup * Update training.py Co-authored-by: Michael Wyatt <[email protected]> * Update training.py Co-authored-by: Michael Wyatt <[email protected]> * Log loss_scale only valid for fp16 * Add README and bf16 scripts * Visualization docsts * Support older DS * Handle uni_ckpt import error * Revert changes --------- Co-authored-by: Michael Wyatt <[email protected]>
1 parent ad0e1fd commit 8e1ff89

File tree

2 files changed

+35
-34
lines changed

2 files changed

+35
-34
lines changed

examples_deepspeed/universal_checkpointing/ds_config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"train_batch_size" : 16,
3-
"train_micro_batch_size_per_gpu": 1,
3+
"train_micro_batch_size_per_gpu": 16,
44
"steps_per_print": 1,
55

66
"zero_optimization": {
@@ -11,7 +11,7 @@
1111
"enabled": true
1212
},
1313

14-
"data_types": {
14+
"data_types": {
1515
"grad_accum_dtype": "fp32"
1616
},
1717

megatron/model/gpt_model.py

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@
3333
)
3434
DS_UNIVERSAL_CHECKPOINT_INFO = True
3535
except ImportError:
36-
DS_UNIVERSAL_CHECKPOINT_INFO = None
36+
DS_UNIVERSAL_CHECKPOINT_INFO = False
37+
3738

3839
def post_language_model_processing(lm_output, labels, logit_weights,
3940
parallel_output,
@@ -341,37 +342,37 @@ def _logits_helper(embedding, lm_output):
341342

342343
def universal_checkpoint_info(self):
343344
info = dict()
345+
if DS_UNIVERSAL_CHECKPOINT_INFO:
346+
# Vocabulary parameters (embeddings) that require special handling due to padding.
347+
info[VOCABULARY_PARAMETER_PATTERNS] = [
348+
r"tied_modules.embed.word_embeddings.weight"
349+
]
350+
351+
# Replicated (shared) parameters on the pipeline dimension
352+
info[PIPELINE_REPLICATED_PARAMETER_PATTERNS] = [
353+
r"tied_modules.embed.word_embeddings.weight",
354+
r"tied_modules.embed.position_embeddings.weight"
355+
]
356+
357+
# Parameter slices that should be averaged not concatenated.
358+
info[TP_REPLICATED_PARAMETER_PATTERNS] = [
359+
r"tied_modules.embed.word_embeddings.norm.weight",
360+
r"tied_modules.embed.word_embeddings.norm.bias",
361+
r"tied_modules.embed.position_embeddings.weight",
362+
r"\d+.input_layernorm.weight",
363+
r"\d+.input_layernorm.bias",
364+
r"\d+.post_attention_layernorm.weight",
365+
r"\d+.post_attention_layernorm.bias",
366+
r"\d+.self_attention.dense.bias",
367+
r"\d+.mlp.dense_4h_to_h.bias",
368+
r"\d+.weight",
369+
r"\d+.bias",
370+
]
344371

345-
# Vocabulary parameters (embeddings) that require special handling due to padding.
346-
info[VOCABULARY_PARAMETER_PATTERNS] = [
347-
r"tied_modules.embed.word_embeddings.weight"
348-
]
349-
350-
# Replicated (shared) parameters on the pipeline dimension
351-
info[PIPELINE_REPLICATED_PARAMETER_PATTERNS] = [
352-
r"tied_modules.embed.word_embeddings.weight",
353-
r"tied_modules.embed.position_embeddings.weight"
354-
]
355-
356-
# Parameter slices that should be averaged not concatenated.
357-
info[TP_REPLICATED_PARAMETER_PATTERNS] = [
358-
r"tied_modules.embed.word_embeddings.norm.weight",
359-
r"tied_modules.embed.word_embeddings.norm.bias",
360-
r"tied_modules.embed.position_embeddings.weight",
361-
r"\d+.input_layernorm.weight",
362-
r"\d+.input_layernorm.bias",
363-
r"\d+.post_attention_layernorm.weight",
364-
r"\d+.post_attention_layernorm.bias",
365-
r"\d+.self_attention.dense.bias",
366-
r"\d+.mlp.dense_4h_to_h.bias",
367-
r"\d+.weight",
368-
r"\d+.bias",
369-
]
370-
371-
# Parameter that are sliced on the row dimension
372-
info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = [
373-
r"\d+.mlp.dense_4h_to_h.weight",
374-
r"\d+.mlp.self_attention.dense.weight",
375-
]
372+
# Parameter that are sliced on the row dimension
373+
info[PARAMETER_WITH_ROW_PARALLELISM_PATTERNS] = [
374+
r"\d+.mlp.dense_4h_to_h.weight",
375+
r"\d+.mlp.self_attention.dense.weight",
376+
]
376377

377378
return info

0 commit comments

Comments (0)