
Commit be45088

feat: support moe hf chkpt
Signed-off-by: Mehant Kammakomati <mehant.kammakomati2@ibm.com>
1 parent 791bdd9 commit be45088

File tree

8 files changed (+157 additions, -65 deletions)


plugins/accelerated-moe/src/fms_acceleration_moe/framework_plugin_scattermoe.py

Lines changed: 85 additions & 2 deletions
@@ -14,18 +14,30 @@
 
 # Standard
 from typing import Dict, Tuple
+import asyncio
+import os
 
 # Third Party
+from accelerate import Accelerator
 from fms_acceleration import AccelerationPlugin
 from peft import LoraConfig
-from transformers import TrainingArguments
+from transformers import (
+    Trainer,
+    TrainerCallback,
+    TrainerControl,
+    TrainerState,
+    TrainingArguments,
+)
+from transformers.trainer import TRAINING_ARGS_NAME
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
 import torch
 
 # Local
 from .utils import (
     patch_huggingface_save_and_load_for_dtensors,
     patch_torch_optim_foreach_to_not_apply_to_dtensors,
     prepare_scattermoe,
+    recover_safetensors_from_dcp,
 )
 
 
@@ -90,10 +102,81 @@ def augmentation(
         return model, modifiable_args
 
     def get_callbacks_and_ready_for_train(
-        self, model: torch.nn.Module = None, accelerator=None
+        self,
+        model: torch.nn.Module = None,
+        accelerator: Accelerator = None,
+        trainer: Trainer = None,
+        pretrained_module_name_or_path: str = None,
     ):
 
         callbacks = []
+
+        class ConvertAndSaveHFCheckpointAtEverySave(TrainerCallback):
+            def __init__(self, pretrained_model_name_or_path: str, trainer: Trainer):
+                self.pretrained_model_name_or_path = pretrained_model_name_or_path
+                self.trainer = trainer
+
+            def on_save(
+                self,
+                args: TrainingArguments,
+                state: TrainerState,
+                control: TrainerControl,
+                **kwargs,
+            ):
+                """
+                Save all HF files and convert the dcp checkpoint to safetensors at every save operation.
+                """
+
+                async def checkpoint():
+                    checkpoint_dir = os.path.join(
+                        args.output_dir,
+                        f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
+                    )
+                    hf_converted_output_dir = os.path.join(
+                        checkpoint_dir, "hf_converted_checkpoint"
+                    )
+                    if os.path.exists(hf_converted_output_dir):
+                        # the folder may already exist, e.g. when the final
+                        # save at the end of training re-saves the same step,
+                        # so return early in that case
+                        return
+                    os.mkdir(hf_converted_output_dir)
+                    try:
+                        recover_safetensors_from_dcp(
+                            checkpoint_dir,
+                            self.pretrained_model_name_or_path,
+                            hf_converted_output_dir,
+                        )
+                        # save tokenizer
+                        if self.trainer.processing_class:
+                            self.trainer.processing_class.save_pretrained(
+                                hf_converted_output_dir
+                            )
+                        # save training args
+                        torch.save(
+                            args,
+                            os.path.join(
+                                hf_converted_output_dir, TRAINING_ARGS_NAME
+                            ),
+                        )
+                        # save model config files
+                        self.trainer.model.config.save_pretrained(
+                            hf_converted_output_dir
+                        )
+
+                    except Exception as e:
+                        raise ValueError(
+                            f"Failed to convert the checkpoint {checkpoint_dir} to a HF compatible checkpoint"
+                        ) from e
+
+                if state.is_world_process_zero:
+                    asyncio.run(checkpoint())
+
+        callbacks.append(
+            ConvertAndSaveHFCheckpointAtEverySave(
+                pretrained_model_name_or_path=pretrained_module_name_or_path,
+                trainer=trainer,
+            )
+        )
         if (
             accelerator is not None
             and getattr(accelerator.state, "fsdp_plugin", None) is not None
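
A note on the pattern above: the new callback hooks the transformers on_save event, resolves the checkpoint folder the Trainer just wrote from PREFIX_CHECKPOINT_DIR and state.global_step, and does its work on the main process only. Below is that skeleton in isolation, as a minimal sketch with a hypothetical class name and no fms_acceleration dependencies; it is an illustration, not part of this commit.

    import os

    from transformers import (
        TrainerCallback,
        TrainerControl,
        TrainerState,
        TrainingArguments,
    )
    from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR


    class SaveTimeHook(TrainerCallback):
        """Illustrative skeleton: run extra post-processing after every checkpoint save."""

        def on_save(
            self,
            args: TrainingArguments,
            state: TrainerState,
            control: TrainerControl,
            **kwargs,
        ):
            # folder the Trainer just wrote, e.g. <output_dir>/checkpoint-500
            checkpoint_dir = os.path.join(
                args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
            )
            # only the main process should touch the shared filesystem
            if state.is_world_process_zero:
                converted_dir = os.path.join(checkpoint_dir, "hf_converted_checkpoint")
                if not os.path.exists(converted_dir):
                    os.makedirs(converted_dir, exist_ok=True)
                    # a real hook (like the one added in this commit) would
                    # convert the dcp shards to safetensors into converted_dir here


    # typical wiring: trainer.add_callback(SaveTimeHook()) before trainer.train()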

plugins/accelerated-moe/src/fms_acceleration_moe/utils/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -13,7 +13,10 @@
 # limitations under the License.
 
 # Local
-from .checkpoint_utils import patch_huggingface_save_and_load_for_dtensors
+from .checkpoint_utils import (
+    patch_huggingface_save_and_load_for_dtensors,
+    recover_safetensors_from_dcp,
+)
 from .scattermoe_prepare import prepare_scattermoe
 
 # this is a special patch function to disable foreach for

plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py

Lines changed: 54 additions & 51 deletions
@@ -457,75 +457,38 @@ def save_sharded_safetensors(
 # --------------------------- SCRIPT -------------------------
 
 
-# have it serve as a conversion script
-if __name__ == "__main__":
-    # Standard
-    import argparse
-
-    parser = argparse.ArgumentParser(
-        description=(
-            "Utility for converting ScatterMoE checkpoint back to the "
-            "original state dict format. "
-            "The ScatterMoE checkpoint was saved after the pretrained model "
-            "had been converted by a module swap, hence the state dict will "
-            "no longer resemble the original. This utility creates"
-        )
-    )
-
-    parser.add_argument(
-        "checkpoint_dir",
-        help="Path to the checkpoint.",
-    )
-
-    parser.add_argument(
-        "output_dir", help="Path to the location to write the converted checkpoint."
-    )
-
-    parser.add_argument(
-        "pretrained_model_name_or_path",
-        help=(
-            "In order to reconstruct the state dict, we require hints from "
-            "the original pretrained model checkpoint (from which this "
-            "checkpoint is obtained)."
-        ),
-        default=None,
-    )
-
-    args = parser.parse_args()
-
-    # search for an FSDP checkpoint. If it is an FSDP checkpoint, it must
-    # start with FSDP_MODEL_NAME
-    if args.checkpoint_dir.startswith(FSDP_MODEL_NAME):
-        checkpoint_dir = args.checkpoint_dir
+def recover_safetensors_from_dcp(
+    checkpoint_dir, pretrained_model_name_or_path, output_dir
+):
+    if checkpoint_dir.startswith(FSDP_MODEL_NAME):
         loader = get_state_dict_from_dcp_checkpoint
     else:
-        checkpoint_dir = [
+        fsdp_checkpoint_dirs = [
             x
-            for x in os.listdir(args.checkpoint_dir)
-            if os.path.isdir(os.path.join(args.checkpoint_dir, x))
+            for x in os.listdir(checkpoint_dir)
+            if os.path.isdir(os.path.join(checkpoint_dir, x))
             and x.startswith(FSDP_MODEL_NAME)
         ]
-        if len(checkpoint_dir) == 1:
-            checkpoint_dir = os.path.join(args.checkpoint_dir, checkpoint_dir[0])
+        if len(fsdp_checkpoint_dirs) == 1:
+            checkpoint_dir = os.path.join(checkpoint_dir, fsdp_checkpoint_dirs[0])
             loader = get_state_dict_from_dcp_checkpoint
-        elif len(checkpoint_dir) > 1:
+        elif len(fsdp_checkpoint_dirs) > 1:
             raise ValueError(
-                f"Found > 1 dirs in dcp checkpoint dir {args.checkpoint_dir} "
+                f"Found > 1 dirs in dcp checkpoint dir {checkpoint_dir} "
                 f"that starts with {FSDP_MODEL_NAME}. Please specify the exact dir."
             )
         else:
             # then take it as a safetensors checkpoint
             # - do not support .bin checkpoints
-            checkpoint_dir = args.checkpoint_dir
             loader = get_state_dict_from_safe_checkpoint
 
     # - pretrained model name
-    _name_or_path = args.pretrained_model_name_or_path
+    _name_or_path = pretrained_model_name_or_path
 
     # assume output directory exists, we do not create it
     # - copy the config file if exists
     config_file = os.path.join(checkpoint_dir, CONFIG_NAME)
-    target_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+    target_config_file = os.path.join(output_dir, CONFIG_NAME)
     if os.path.exists(config_file):
         shutil.copyfile(config_file, target_config_file)
 
@@ -544,6 +507,46 @@ def save_sharded_safetensors(
     # save it as a safetensors file
     save_sharded_safetensors(
         {k: v.contiguous() for k, v in state_dict.items()},
-        args.output_dir,
+        output_dir,
         metadata={"format": "pt"},
     )
+
+
+# have it serve as a conversion script
+if __name__ == "__main__":
+    # Standard
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description=(
+            "Utility for converting ScatterMoE checkpoint back to the "
+            "original state dict format. "
+            "The ScatterMoE checkpoint was saved after the pretrained model "
+            "had been converted by a module swap, hence the state dict will "
+            "no longer resemble the original. This utility creates"
+        )
+    )
+
+    parser.add_argument(
+        "checkpoint_dir",
+        help="Path to the checkpoint.",
+    )
+
+    parser.add_argument(
+        "output_dir", help="Path to the location to write the converted checkpoint."
+    )
+
+    parser.add_argument(
+        "pretrained_model_name_or_path",
+        help=(
+            "In order to reconstruct the state dict, we require hints from "
+            "the original pretrained model checkpoint (from which this "
+            "checkpoint is obtained)."
+        ),
+        default=None,
+    )
+
+    args = parser.parse_args()
+    recover_safetensors_from_dcp(
+        args.checkpoint_dir, args.pretrained_model_name_or_path, args.output_dir
+    )
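
For reference, a usage sketch of the extracted function (not part of the commit; the paths and model id below are placeholders). Per the code above, the function assumes the output directory already exists, copies the config from the checkpoint if present, and writes sharded safetensors.

    import os

    # re-exported from fms_acceleration_moe.utils in this commit
    from fms_acceleration_moe.utils import recover_safetensors_from_dcp

    checkpoint_dir = "output/checkpoint-500"  # placeholder: a Trainer checkpoint folder
    output_dir = os.path.join(checkpoint_dir, "hf_converted_checkpoint")
    os.makedirs(output_dir, exist_ok=True)    # the function does not create it

    recover_safetensors_from_dcp(
        checkpoint_dir,
        "mistralai/Mixtral-8x7B-v0.1",  # placeholder; hints for rebuilding the state dict
        output_dir,
    )

    # roughly equivalent CLI, using the argparse block above
    # (positional order: checkpoint_dir, output_dir, pretrained_model_name_or_path):
    #   python -m fms_acceleration_moe.utils.checkpoint_utils \
    #       output/checkpoint-500 output/checkpoint-500/hf_converted_checkpoint \
    #       mistralai/Mixtral-8x7B-v0.1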

plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py

Lines changed: 3 additions & 2 deletions
@@ -27,9 +27,10 @@
 from fms_acceleration.model_patcher import patch_target_module
 from peft import LoraConfig, prepare_model_for_kbit_training
 from peft.tuners.lora.model import LoraModel
-from transformers import AutoModelForCausalLM, TrainingArguments
+from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
 from transformers.modeling_utils import is_fsdp_enabled
 from transformers.utils.import_utils import _is_package_available
+from accelerate import Accelerator
 import torch
 import torch.distributed
 
@@ -355,7 +356,7 @@ def augmentation(
         return model, modifiable_args
 
     def get_callbacks_and_ready_for_train(
-        self, model: torch.nn.Module = None, accelerator=None
+        self, model: torch.nn.Module = None, accelerator: Accelerator = None, trainer: Trainer = None, pretrained_module_name_or_path: str = None
     ):
         callbacks = []
         if (

plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py

Lines changed: 3 additions & 2 deletions
@@ -24,8 +24,9 @@
 # Third Party
 from fms_acceleration import AccelerationPlugin
 from peft import LoraConfig, get_peft_model
-from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
 from transformers.utils.import_utils import _is_package_available
+from accelerate import Accelerator
 import torch
 
 # Local
@@ -218,7 +219,7 @@ def augmentation(
         return model, modifiable_args
 
     def get_callbacks_and_ready_for_train(
-        self, model: torch.nn.Module = None, accelerator=None
+        self, model: torch.nn.Module = None, accelerator: Accelerator = None, trainer: Trainer = None, pretrained_module_name_or_path: str = None
     ):
         callbacks = []
         if (

plugins/framework/src/fms_acceleration/framework.py

Lines changed: 3 additions & 3 deletions
@@ -18,7 +18,7 @@
 
 # Third Party
 from accelerate import Accelerator
-from transformers import PreTrainedModel, TrainingArguments
+from transformers import PreTrainedModel, TrainingArguments, Trainer
 from transformers.utils import logging
 from transformers.utils.import_utils import _is_package_available
 import torch
@@ -218,7 +218,7 @@ def requires_augmentation(self):
         return any(x.requires_augmentation for _, x in self.active_plugins)
 
     def get_callbacks_and_ready_for_train(
-        self, model: torch.nn.Module = None, accelerator: Accelerator = None
+        self, model: torch.nn.Module = None, accelerator: Accelerator = None, trainer: Trainer = None, pretrained_module_name_or_path: str = None
     ):
 
         # Local
@@ -257,5 +257,5 @@ def get_callbacks_and_ready_for_train(
 
         cbks = []
         for _, plugin in self.active_plugins:
-            cbks.extend(plugin.get_callbacks_and_ready_for_train(model, accelerator))
+            cbks.extend(plugin.get_callbacks_and_ready_for_train(model, accelerator, trainer, pretrained_module_name_or_path))
         return cbks
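
A caller-side sketch of how the two new arguments are expected to flow through this framework method (an assumption based on the signature change above; the helper name is hypothetical, and trainer.accelerator is the standard transformers Trainer attribute):

    from typing import Optional

    from transformers import Trainer


    def attach_plugin_callbacks(framework, trainer: Trainer, model_name: Optional[str] = None):
        """Hypothetical helper: forward the new args and register the returned callbacks."""
        callbacks = framework.get_callbacks_and_ready_for_train(
            model=trainer.model,
            accelerator=trainer.accelerator,            # Accelerator owned by the Trainer
            trainer=trainer,                            # new argument in this commit
            pretrained_module_name_or_path=model_name,  # new argument in this commit
        )
        for cb in callbacks:
            trainer.add_callback(cb)
        return callbacks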

plugins/framework/src/fms_acceleration/framework_plugin.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@
 # Third Party
 from accelerate import Accelerator
 from peft import LoraConfig
-from transformers import TrainingArguments
+from transformers import TrainingArguments, Trainer
 import torch
 
 
@@ -186,7 +186,7 @@ def augmentation(
        raise NotImplementedError
 
     def get_callbacks_and_ready_for_train(
-        self, model: torch.nn.Module = None, accelerator: Accelerator = None
+        self, model: torch.nn.Module = None, accelerator: Accelerator = None, trainer: Trainer = None, pretrained_module_name_or_path: str = None
     ):
         return []

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py

Lines changed: 3 additions & 2 deletions
@@ -19,7 +19,8 @@
 from fms_acceleration import AccelerationPlugin, AccelerationPluginConfigError
 from peft import LoraConfig
 from peft.tuners.lora.layer import LoraLayer
-from transformers import PretrainedConfig, TrainingArguments
+from transformers import PretrainedConfig, TrainingArguments, Trainer
+from accelerate import Accelerator
 import torch
 
 # Local
@@ -184,7 +185,7 @@ def augmentation(
         return model, modifiable_args
 
     def get_callbacks_and_ready_for_train(
-        self, model: torch.nn.Module = None, accelerator=None
+        self, model: torch.nn.Module = None, accelerator: Accelerator = None, trainer: Trainer = None, pretrained_module_name_or_path: str = None
     ):
         # This callback applies only for qpeft
         # should not install this for full FT and standard peft
