
Commit bb6e04e

add cross ent fix for llama, mistral, mixtral
Signed-off-by: Anh Uong <[email protected]>
1 parent: 2769736 · commit: bb6e04e

File tree: 3 files changed, +64 -28 lines changed

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py

Lines changed: 21 additions & 9 deletions
@@ -33,7 +33,7 @@
 
 # Local
 from ..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward
-from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss
+from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss, replace_custom_loss_when_triggered
 from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm
 from ..kernels.unsloth.rope_embedding import fast_rope_embedding
 from ..utils import filter_mp_rules
@@ -44,6 +44,7 @@
     build_lora_fused_ops,
     get_hidden_activation_fn_key,
     trigger_fused_ops,
+    get_transformers_version,
 )
 
 
@@ -122,14 +123,25 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None):
             trigger=ModelPatcherTrigger(check=LlamaForCausalLM),
             forward=lce_forward,
         ),
-        ModelPatcherRule(
-            rule_id="llama-cross-ent",
-            import_and_maybe_reload=(
-                "torch.nn.CrossEntropyLoss",
-                FastCrossEntropyLoss,
-                "transformers.models.llama.modeling_llama",
-            ),
-        ),
+        *[
+            ModelPatcherRule(
+                rule_id="llama-custom-loss",
+                trigger=ModelPatcherTrigger(
+                    check=replace_custom_loss_when_triggered(
+                        LlamaForCausalLM, custom_loss_type="llama-custom-loss"
+                    )
+                ),
+            )
+            if get_transformers_version() >= "4.46" else
+            ModelPatcherRule(
+                rule_id="llama-cross-ent",
+                import_and_maybe_reload=(
+                    "torch.nn.CrossEntropyLoss",
+                    FastCrossEntropyLoss,
+                    "transformers.models.llama.modeling_llama",
+                ),
+            )
+        ],
         # TODO: have a generic version of this rule
         # - get the module name
         # - check if "apply_rotary_pos_emb" exists
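All three files make the same change: the unconditional cross-ent rule, which swapped torch.nn.CrossEntropyLoss for FastCrossEntropyLoss via import_and_maybe_reload, is now used only on transformers versions below 4.46; on 4.46 and newer a trigger-based rule built from replace_custom_loss_when_triggered is registered instead. The 4.46 cutoff lines up with the transformers release that moved causal-LM loss computation out of the model forward into a pluggable loss function, which is presumably why reloading CrossEntropyLoss inside the modeling module no longer takes effect there. The sketch below shows only the list-splat pattern that contributes exactly one of the two rules; Rule and transformers_version() are illustrative stand-ins, not the FOAK ModelPatcherRule or the get_transformers_version helper.

# Illustrative sketch only: Rule and transformers_version() stand in for
# ModelPatcherRule and get_transformers_version; the point is the
# *[A if cond else B] splat, which adds exactly one rule to the list.
from dataclasses import dataclass

try:
    import transformers
    _TRANSFORMERS_VERSION = transformers.__version__
except ImportError:
    _TRANSFORMERS_VERSION = "4.47.0"  # fallback so the sketch runs standalone


@dataclass
class Rule:
    rule_id: str


def transformers_version() -> str:
    return _TRANSFORMERS_VERSION


rules = [
    Rule("llama-fused-lce"),
    # exactly one of the two rules ends up in the list, chosen at import time
    *[
        Rule("llama-custom-loss")
        if transformers_version() >= "4.46"
        else Rule("llama-cross-ent")
    ],
]
print([r.rule_id for r in rules])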

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py

Lines changed: 21 additions & 9 deletions
@@ -33,7 +33,7 @@
 
 # Local
 from ..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward
-from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss
+from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss, replace_custom_loss_when_triggered
 from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm
 from ..kernels.unsloth.rope_embedding import fast_rope_embedding
 from ..utils import filter_mp_rules
@@ -44,6 +44,7 @@
     build_lora_fused_ops,
     get_hidden_activation_fn_key,
     trigger_fused_ops,
+    get_transformers_version,
 )
 
 
@@ -114,14 +115,25 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None):
                 base_type=base_type,
             ),
         ),
-        ModelPatcherRule(
-            rule_id="mistral-cross-ent",
-            import_and_maybe_reload=(
-                "torch.nn.CrossEntropyLoss",
-                FastCrossEntropyLoss,
-                "transformers.models.mistral.modeling_mistral",
-            ),
-        ),
+        *[
+            ModelPatcherRule(
+                rule_id="mistral-custom-loss",
+                trigger=ModelPatcherTrigger(
+                    check=replace_custom_loss_when_triggered(
+                        MistralForCausalLM, custom_loss_type="mistral-custom-loss"
+                    )
+                ),
+            )
+            if get_transformers_version() >= "4.46" else
+            ModelPatcherRule(
+                rule_id="mistral-cross-ent",
+                import_and_maybe_reload=(
+                    "torch.nn.CrossEntropyLoss",
+                    FastCrossEntropyLoss,
+                    "transformers.models.mistral.modeling_mistral",
+                ),
+            )
+        ],
         ModelPatcherRule(
             rule_id="mistral-fused-lce",
             trigger=ModelPatcherTrigger(check=MistralForCausalLM),

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py

Lines changed: 22 additions & 10 deletions
@@ -25,13 +25,14 @@
 from transformers.models.mixtral.modeling_mixtral import (
     MixtralAttention,
     MixtralRMSNorm,
+    MixtralForCausalLM,
 )
 
 # Local
-from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss
+from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss, replace_custom_loss_when_triggered
 from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm
 from ..kernels.unsloth.rope_embedding import fast_rope_embedding
-from .utils import KEY_O, KEY_QKV, build_lora_fused_ops, trigger_fused_ops
+from .utils import KEY_O, KEY_QKV, build_lora_fused_ops, trigger_fused_ops, get_transformers_version
 
 
 def get_mp_rules(base_type):
@@ -85,14 +86,25 @@ def get_mp_rules(base_type):
                 logic="APPEND",
             ),
         ),
-        ModelPatcherRule(
-            rule_id="mixtral-cross-ent",
-            import_and_maybe_reload=(
-                "torch.nn.CrossEntropyLoss",
-                FastCrossEntropyLoss,
-                "transformers.models.mixtral.modeling_mixtral",
-            ),
-        ),
+        *[
+            ModelPatcherRule(
+                rule_id="mixtral-custom-loss",
+                trigger=ModelPatcherTrigger(
+                    check=replace_custom_loss_when_triggered(
+                        MixtralForCausalLM, custom_loss_type="mixtral-custom-loss"
+                    )
+                ),
+            )
+            if get_transformers_version() >= "4.46" else
+            ModelPatcherRule(
+                rule_id="mixtral-cross-ent",
+                import_and_maybe_reload=(
+                    "torch.nn.CrossEntropyLoss",
+                    FastCrossEntropyLoss,
+                    "transformers.models.mixtral.modeling_mixtral",
+                ),
+            )
+        ],
         ModelPatcherRule(
             rule_id="mixtral-rope",
             import_and_maybe_reload=(

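A small aside on the gate itself: each file compares get_transformers_version() >= "4.46". If that helper returns the raw transformers version string, the comparison is lexicographic, which happens to work for this cutoff but does not order versions numerically in general. Purely as an illustration (this is not the helper imported from models/utils.py in the diff), a numeric gate could be written with packaging:

# Illustrative only, not the get_transformers_version helper from this repo.
from packaging.version import Version

try:
    import transformers
    _installed = transformers.__version__
except ImportError:
    _installed = "4.47.0"  # fallback so the sketch runs without transformers


def transformers_at_least(minimum: str) -> bool:
    # Version compares release tuples numerically, so "4.5" < "4.46" here,
    # whereas plain string comparison would put "4.5" after "4.46".
    return Version(_installed) >= Version(minimum)


print(transformers_at_least("4.46"))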