Skip to content

Commit 2769736

Browse files
committed
trigger-only pattern for custom loss
Signed-off-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
1 parent 8787ca1 commit 2769736

File tree

5 files changed

+88
-16
lines changed

5 files changed

+88
-16
lines changed

plugins/framework/src/fms_acceleration/model_patcher.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,10 +184,10 @@ def __post_init__(self):
184184
self.import_and_maybe_reload is not None,
185185
]
186186
)
187-
!= 1
187+
> 1
188188
):
189189
raise ValueError(
190-
f"Rule '{self.rule_id}' must only have only one of forward, "
190+
f"Rule '{self.rule_id}' must only have at most one of forward, "
191191
"foward builder, or import_and_maybe_reload, specified."
192192
)
193193

@@ -425,7 +425,7 @@ def _patch_forwards(
425425
# otherwise triggered
426426
if rule.forward is not None:
427427
forward = rule.forward
428-
else:
428+
elif rule.forward_builder is not None:
429429
fba = {}
430430
if rule.forward_builder_args is not None:
431431
fba = {
@@ -434,6 +434,9 @@ def _patch_forwards(
434434
if rule.forward_builder_args
435435
}
436436
forward = rule.forward_builder(mod, **fba)
437+
else:
438+
# trigger-only case
439+
forward = None
437440

438441
if isinstance(forward, list):
439442
# this will be list of tuples case
@@ -468,7 +471,8 @@ def _patch_forwards(
468471
continue
469472

470473
# otherwise
471-
mod.forward = MethodType(forward, mod)
474+
if forward is not None:
475+
mod.forward = MethodType(forward, mod)
472476
ModelPatcher.history.append(
473477
ModelPatcherHistory(
474478
instance=mod_id,

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def register_foak_model_patch_rules(
7373
FILTER_MAP = {
7474
"fused_lora": {"qkvo", "mlp"},
7575
"fast_loss": {
76-
True: "cross-ent",
76+
True: {"cross-ent", "custom-loss"},
7777
"fused_ce_liger": "fused-lce",
7878
},
7979
"fast_rms_layernorm": "rms",

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/kernels/unsloth/cross_entropy_loss.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import triton.language as tl
1717
import torch
1818
from .utils import calculate_settings, MAX_FUSED_SIZE
19+
from typing import Type
1920

2021

2122
@triton.jit
@@ -290,3 +291,55 @@ def forward(self, input, target):
290291
)
291292
n_items = torch.count_nonzero(target != -100)
292293
return loss.sum() / n_items
294+
295+
296+
# added by flim@sg.ibm.com
297+
298+
# adapted from transformers.loss.loss_utils.ForCausalLMLoss
299+
def FastForCausalLMLoss(
    logits, labels, vocab_size: int, num_items_in_batch: int = None, ignore_index: int = -100, **kwargs
):
    """Drop-in replacement for HF ``ForCausalLMLoss`` backed by the fused
    Triton ``Fast_CrossEntropyLoss`` kernel.

    Args:
        logits: model output logits of shape ``(..., seq_len, vocab_size)``.
        labels: target token ids with the same leading shape as ``logits``;
            positions equal to ``ignore_index`` do not contribute to the loss.
        vocab_size: vocabulary size (last dimension of ``logits``).
        num_items_in_batch: when provided, the summed loss is divided by this
            count (HF passes it for correct scaling under gradient
            accumulation); otherwise the mean over non-ignored tokens is used.
        ignore_index: label value to ignore. Only ``-100`` is supported since
            the Triton kernel hardcodes that value.
        **kwargs: ignored; present for signature compatibility with the HF
            loss-function registry.

    Returns:
        Scalar loss tensor.

    Raises:
        ValueError: if ``ignore_index`` is not ``-100``.
    """
    # Fail fast on unsupported config. A plain `assert` would be stripped
    # under `python -O`, so raise explicitly instead.
    if ignore_index != -100:
        raise ValueError(
            "FastForCausalLMLoss currently supports only hardcoded ignore index -100."
        )

    # Upcast to float if we need to compute the loss to avoid potential precision issues
    logits = logits.float()
    labels = labels.to(logits.device)
    # Shift so that tokens < n predict n
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # Flatten the tokens
    shift_logits = shift_logits.view(-1, vocab_size)
    shift_labels = shift_labels.view(-1)
    # Enable model parallelism
    shift_labels = shift_labels.to(shift_logits.device)

    loss = Fast_CrossEntropyLoss.apply(
        shift_logits, shift_labels
    )

    # HF convention: with "sum"-style reduction, normalize by the caller's
    # token count; otherwise average over the non-ignored tokens here.
    # NOTE(review): if every label equals ignore_index the divisor is 0 and
    # the result is nan — same as upstream ForCausalLMLoss behavior.
    if num_items_in_batch is not None:
        n_items = num_items_in_batch
    else:
        n_items = torch.count_nonzero(shift_labels != ignore_index)
    return loss.sum() / n_items
325+
326+
327+
def replace_custom_loss_when_triggered(
    module_cls: Type,
    custom_loss_type: str,
):
    """Build a ModelPatcher trigger check that installs the fast loss.

    The returned callable reports a match for instances of ``module_cls``
    that expose a ``loss_function`` attribute; on a match it registers
    ``FastForCausalLMLoss`` under ``custom_loss_type`` in HF's
    ``LOSS_MAPPING`` and points the module at it via ``mod.loss_type``.
    """

    def _check(mod):
        # guard clauses: only patch a matching module that has a loss_function
        if not isinstance(mod, module_cls):
            return False
        if not hasattr(mod, "loss_function"):
            return False

        # import kept inside the match branch so it only runs when the
        # transformers version actually provides loss_utils
        from transformers.loss.loss_utils import LOSS_MAPPING

        LOSS_MAPPING[custom_loss_type] = FastForCausalLMLoss
        mod.loss_type = custom_loss_type
        return True

    return _check
344+
345+

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/granite.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
# Local
2929
from ..fused_ops.liger_ce.fused_linear_cross_entropy_loss import lce_forward
30-
from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss
30+
from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss, replace_custom_loss_when_triggered
3131
from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm
3232
from ..kernels.unsloth.rope_embedding import fast_rope_embedding
3333
from ..utils import filter_mp_rules
@@ -38,6 +38,7 @@
3838
build_lora_fused_ops,
3939
get_hidden_activation_fn_key,
4040
trigger_fused_ops,
41+
get_transformers_version,
4142
)
4243

4344

@@ -122,16 +123,25 @@ def get_mp_rules(base_type: str, config: PretrainedConfig = None):
122123
base_type=base_type,
123124
),
124125
),
125-
# TODO: have a generic version of this rule
126-
# - get the module_name and reload on that
127-
ModelPatcherRule(
128-
rule_id="granite-cross-ent",
129-
import_and_maybe_reload=(
130-
"torch.nn.CrossEntropyLoss",
131-
FastCrossEntropyLoss,
132-
"transformers.models.granite.modeling_granite",
133-
),
134-
),
126+
*[
127+
ModelPatcherRule(
128+
rule_id="granite-custom-loss",
129+
trigger=ModelPatcherTrigger(
130+
check=replace_custom_loss_when_triggered(
131+
GraniteForCausalLM, custom_loss_type="granite-custom-loss"
132+
)
133+
),
134+
)
135+
if get_transformers_version() >= "4.46" else
136+
ModelPatcherRule(
137+
rule_id="granite-cross-ent",
138+
import_and_maybe_reload=(
139+
"torch.nn.CrossEntropyLoss",
140+
FastCrossEntropyLoss,
141+
"transformers.models.granite.modeling_granite",
142+
),
143+
)
144+
],
135145
ModelPatcherRule(
136146
rule_id="granite-fused-lce",
137147
trigger=ModelPatcherTrigger(check=GraniteForCausalLM),

plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# Third Party
77
from fms_acceleration.model_patcher import ModelPatcherTrigger
88
from transformers import PretrainedConfig
9+
from transformers.utils.import_utils import _is_package_available
910
import torch
1011

1112
# Local
@@ -214,3 +215,7 @@ def get_hidden_activation_fn_key(config: PretrainedConfig):
214215
"Unable to determine activation function key for "
215216
f"architecture {config.architectures}."
216217
)
218+
219+
def get_transformers_version():
    """Return the installed ``transformers`` version as a string.

    NOTE(review): callers compare this lexicographically (e.g.
    ``>= "4.46"``), which misorders versions such as "4.100" — consider
    ``packaging.version.parse`` at the comparison sites.
    """
    availability = _is_package_available("transformers", return_version=True)
    return availability[1]

0 commit comments

Comments
 (0)