Skip to content

Commit d98b2c9

Browse files
committed
fix: logic for lora
Signed-off-by: Will Johnson <mwjohnson728@gmail.com>
1 parent f9176c5 commit d98b2c9

File tree

3 files changed

+26
-23
lines changed

3 files changed

+26
-23
lines changed

plugins/accelerated-moe/src/fms_acceleration_moe/utils/checkpoint_utils.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,19 @@ def recover_original_state_dict_from_checkpoint(
343343
# config
344344
config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path)
345345

346+
# if lora, check whether input/output (expert) layers and a router layer are present
347+
ip_op_layers = False
348+
router_layer = False
349+
if lora:
350+
for name, _ in sd.items():
351+
if "w1" in name:
352+
ip_op_layers = True
353+
break
354+
for name, _ in sd.items():
355+
if "router" in name:
356+
router_layer = True
357+
break
358+
346359
(
347360
_,
348361
router_name,
@@ -412,7 +425,8 @@ def _infer_prefixes_and_module_names(
412425
module_name,
413426
router_name,
414427
expert_name,
415-
lora_utils=lora,
428+
ip_op_layers=ip_op_layers,
429+
router_layer=router_layer,
416430
)
417431

418432
model2scatter = defaultdict(dict)

plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_prepare.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,6 @@ def prepare_scattermoe(
128128
# pylint: disable=import-outside-toplevel
129129
from .scattermoe import ScatterMoE
130130

131-
lora = False
132-
if lora_config:
133-
lora = True
134-
135131
if disable_distributed and ep_degree > 1:
136132
raise ValueError(
137133
"expert sharding can not be deferred to top level sharding"
@@ -255,7 +251,6 @@ def prepare_scattermoe(
255251
module_name,
256252
router_name,
257253
"|".join(expert_name),
258-
lora_start=lora,
259254
target_modules=lora_config.target_modules,
260255
)
261256

plugins/accelerated-moe/src/fms_acceleration_moe/utils/scattermoe_state_dict.py

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@ def get_checkpoint_meta_from_sharded_safetensor(
8888
router_name: str = "gate", # e.g., named "gate" within block_sparse_moe
8989
expert_name: str = "experts", # e.g., named "experts" within block_sparse_moe
9090
expert_map: Dict = None, # map -> [w1,w2,w3]
91-
lora_start: bool = False, # if lora is detected in prepare_scattermoe.py
92-
lora_utils: bool = False, # if lora is detected in checkpoint_utils.py
91+
ip_op_layers: bool = False, # if input/output layers are detected in utils
92+
router_layer: bool = False, # if router layer is detected in utils
9393
target_modules: Dict = None, # target modules from prepare_scattermoe.py
9494
) -> Dict[str, List[Tuple]]:
9595
"""
@@ -111,6 +111,8 @@ def get_checkpoint_meta_from_sharded_safetensor(
111111
e.g., input_linear|output_linear|input_linear
112112
expert_map (dict): This is used with pattern ii) described above in expert_name.
113113
If not specified, will be the identity map, e.g., w1 -> w1
114+
ip_op_layers (bool): True if LoRA input/output (expert) layers were detected in the checkpoint state dict.
115+
router_layer (bool): True if a LoRA router layer was detected in the checkpoint state dict.
114116
"""
115117

116118
# insert in order
@@ -171,34 +173,26 @@ def _insert(L: List, i: int, v):
171173
f"'{router_name}' or expert_name '{expert_name}'"
172174
)
173175
if m.group(1) == router_name:
174-
if lora_utils:
176+
if router_layer:
175177
_map[KEY_SCATTERMOE_LORA_A_ROUTER].append((k, stfile))
176178
_map[KEY_SCATTERMOE_LORA_B_ROUTER].append((k, stfile))
177179
else:
178180
_map[KEY_SCATTERMOE_ROUTER].append((k, stfile))
179181
elif m.group(1) in expert_name:
180-
index = m.group(2)
181-
index = 0 if index is None else int(index)
182-
mod = None
183-
184-
# LoRA case
185182
if (
186183
"input_linear" in target_modules and "output_linear" in target_modules
187-
) or lora_utils:
188-
if not lora_utils:
184+
) or ip_op_layers:
185+
index = m.group(2)
186+
index = 0 if index is None else int(index)
187+
mod = None
188+
if not ip_op_layers:
189189
for mod in expert_map.get(m.group(1), expert_map.get(m.group(3))):
190190
_insert(_map[f"{mod}.weight"], index, (k, stfile))
191191
else:
192192
for mod in expert_map.get(m.group(1), expert_map.get(m.group(3))):
193193
_insert(_map[f"{mod}.lora_A"], index, (k, stfile))
194194
_insert(_map[f"{mod}.lora_B"], index, (k, stfile))
195-
196-
# Fine-tuning case
197-
elif not lora_utils and not lora_start:
198-
for mod in expert_map.get(m.group(1), expert_map.get(m.group(3))):
199-
_insert(_map[f"{mod}.weight"], index, (k, stfile))
200-
201-
assert mod is not None, f"cannot map '{rel_k}'"
195+
assert mod is not None, f"cannot map '{rel_k}'"
202196

203197
if len(_map) == 0:
204198
raise ValueError(

0 commit comments

Comments
 (0)