Commit 6a8baf8

feat: add sonicmoe (#3411)
* feat: add sonicmoe
* feat: add torch compile for routing
* feat: add routing smoke test
* feat: add qwen3_5_moe, qwen3_vl_moe, qwen3_omni_moe
* fix: disable mlp kernel for sonicmoe too
* feat: update to sonicmoe release
* chore: update import following new sonicmoe changes
* feat: update handling for blackwell
* feat: add sonicmoe e2e test
* fix: installation for updated sonicmoe
* fix: git commit
* fix: ignore py req and fix metadata
* fix: increase min hidden size to match sonicmoe kernel min
* fix: attempt properly interleave and handle unpatch mid-test
* chore: refactor teardown better
* chore: refactor to re-use rearrange
* fix: add idempotency guard
* fix: address comments on CI memory and interleave
* fix: tests grad, param doublewrapped
1 parent 1eaf4d7 commit 6a8baf8

File tree

12 files changed: +1698 −42 lines


src/axolotl/integrations/kernels/README.md

Lines changed: 37 additions & 5 deletions

````diff
@@ -10,7 +10,7 @@ class ExpertsInterface(GeneralInterface):
 }
 ```
 
-In our custom integration, we add support for **ScatterMoE**, which is even more efficient and faster than `grouped_mm`.
+In our custom integration, we add support for **ScatterMoE** and **SonicMoE**, which are more efficient and faster than `grouped_mm`.
 
 ## Usage
 
@@ -21,23 +21,55 @@ plugins:
   - axolotl.integrations.kernels.KernelsPlugin
 
 use_kernels: true
+
+# Choose one (mutually exclusive):
 use_scattermoe: true
+# OR
+use_sonicmoe: true
+```
+
+**Important:** Setting `experts_implementation` is incompatible with custom kernel options.
+
+### SonicMoE installation
+
+**Prerequisites:**
+- NVIDIA Hopper (H100, H200) or Blackwell (B200, GB200) GPU
+- CUDA 12.9+ (13.0+ for B300)
+- PyTorch 2.7+ (2.9.1 recommended)
+- For B300: Triton 3.6.0
+
+```bash
+pip install --ignore-requires-python --no-deps "sonic-moe @ git+https://github.com/Dao-AILab/sonic-moe.git@116e2df0a41874f77fa0ad269ce7df3f0cfcb956" && pip install nvidia-cutlass-dsl==4.4.0 quack-kernels==0.2.5
 ```
 
-**Important:** Setting `experts_implementation` is incompatible with `use_scattermoe`.
+See the [SonicMoE installation guide](https://github.com/Dao-AILab/sonic-moe?tab=readme-ov-file#-installation) for the latest prerequisite details.
+
+**Note:** Blackwell support is in upstream beta. On Blackwell GPUs, Axolotl automatically sets `USE_QUACK_GEMM=1` to enable the Blackwell kernels.
 
 ## How It Works
 
 The `KernelsPlugin` runs before model loading and:
 
-1. Registers the ScatterMoE kernel from the [`axolotl-ai-co/scattermoe`](https://huggingface.co/axolotl-ai-co/scattermoe) Hub repo.
+### ScatterMoE
+1. Registers the ScatterMoE kernel from the local `libs/scattermoe_lora` package (includes fused LoRA support via Triton kernels).
 2. Patches the model's `SparseMoeBlock` forward method with the optimized ScatterMoE implementation.
 
-This works for any MoE model in transformers that uses a `SparseMoeBlock` class (Mixtral, Qwen2-MoE, OLMoE, etc.).
+### SonicMoE
+1. Resolves the model's MoE block class(es) from `constants.py`.
+2. Patches the forward method with SonicMoE's optimized kernels and registers a weight converter for the interleaved gate/up projection format.
+3. Supports both softmax -> topk and sigmoid -> topk routing strategies.
+
+Both paths use the shared `resolve_moe_block_classes` utility in `constants.py` for model-type-to-class resolution.
+
+#### Supported Models
+
+See `constants.py` for the full list of supported model types (Qwen2-MoE, Qwen3-MoE, OLMoE, Mixtral, DeepSeek-V3, GLM-MoE, MiniMax, etc.).
 
 ## Limitations
 
-ScatterMoE uses a softmax -> topk routing, so results may be different for some model arch as baseline (GPT-OSS, GLM_MOE_DSA).
+ScatterMoE uses softmax -> topk routing, so results may differ from the baseline for some model architectures (GPT-OSS, etc.). It is currently incompatible with `GLM_MOE_DSA` (GLM 5) and `GLM4_MOE_LITE` (GLM 4.7 Flash).
+
+SonicMoE supports both softmax -> topk and sigmoid -> topk routing, covering a wider range of architectures.
 
 ScatterMoE does not work for GLM4.7 Flash (glm4_moe_lite) atm.
````
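The README's routing distinction (softmax -> topk for ScatterMoE, softmax or sigmoid -> topk for SonicMoE) comes down to how per-expert scores are computed before top-k selection. A toy pure-Python sketch for intuition — `route` is a hypothetical helper for illustration, not part of either kernel:

```python
import math


def route(logits: list[float], top_k: int, strategy: str = "softmax") -> list[tuple[int, float]]:
    """Toy router: score each expert, then keep the top_k (index, weight) pairs.

    strategy="softmax": scores are normalized across all experts before selection
    strategy="sigmoid": each expert is scored independently (DeepSeek-V3-style gating)
    """
    if strategy == "softmax":
        m = max(logits)
        exps = [math.exp(x - m) for x in logits]  # subtract max for numerical stability
        total = sum(exps)
        scores = [e / total for e in exps]
    elif strategy == "sigmoid":
        scores = [1.0 / (1.0 + math.exp(-x)) for x in logits]
    else:
        raise ValueError(f"unknown routing strategy: {strategy}")

    # top-k after scoring, highest score first
    ranked = sorted(enumerate(scores), key=lambda p: p[1], reverse=True)
    return ranked[:top_k]


# Example: 4 experts, pick 2 (experts 1 and 3 have the largest logits)
print(route([0.1, 2.0, -1.0, 1.5], top_k=2, strategy="softmax"))
```

Note the selection order is the same under both strategies (both scoring functions are monotonic in the logit); what differs is the magnitude of the routing weights applied to the chosen experts.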

src/axolotl/integrations/kernels/args.py

Lines changed: 15 additions & 4 deletions

```diff
@@ -6,7 +6,18 @@
 
 
 class KernelsArgs(BaseModel):
-    use_scattermoe: bool | None = True
+    use_scattermoe: bool | None = None
+    use_sonicmoe: bool | None = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_mutually_exclusive(cls, data):
+        if data.get("use_scattermoe") and data.get("use_sonicmoe"):
+            raise ValueError(
+                "Cannot use both ScatterMoE and SonicMoE simultaneously. "
+                "Please set only one of `use_scattermoe` or `use_sonicmoe` to true."
+            )
+        return data
 
     @model_validator(mode="before")
     @classmethod
@@ -36,11 +47,11 @@ def check_experts_implementation(cls, data):
 
     @model_validator(mode="before")
     @classmethod
-    def disable_mlp_kernel_scattermoe(cls, data):
-        if data.get("use_scattermoe") is True:
+    def disable_mlp_kernel(cls, data):
+        if data.get("use_scattermoe") is True or data.get("use_sonicmoe") is True:
             if data.get("lora_mlp_kernel") is True:
                 LOG.warning(
-                    "Disabling lora_mlp_kernel when using scattermoe due to compatibility issues."
+                    "Disabling lora_mlp_kernel when using custom MoE kernels due to compatibility issues."
                 )
                 data["lora_mlp_kernel"] = False
             data["mlp_kernel"] = False
```
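The `check_mutually_exclusive` validator above runs with `mode="before"`, i.e. against the raw config dict before field coercion, so unset flags arrive as missing keys or `None` and are falsy. A minimal dependency-free sketch of that same check as a standalone function (pydantic plumbing omitted):

```python
def check_mutually_exclusive(data: dict) -> dict:
    """Mirror of the before-mode validator: reject configs enabling both MoE kernels."""
    if data.get("use_scattermoe") and data.get("use_sonicmoe"):
        raise ValueError(
            "Cannot use both ScatterMoE and SonicMoE simultaneously. "
            "Please set only one of `use_scattermoe` or `use_sonicmoe` to true."
        )
    return data


# Single-kernel and unset configs pass through unchanged
assert check_mutually_exclusive({"use_sonicmoe": True}) == {"use_sonicmoe": True}
assert check_mutually_exclusive({}) == {}
```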
src/axolotl/integrations/kernels/constants.py (new file)

Lines changed: 68 additions & 0 deletions

```python
"""
Supported MoE block mappings for kernel integrations.

Maps model_type to the SparseMoeBlock class name(s) in transformers.
Used by both ScatterMoE and SonicMoE kernel paths.

Values can be a single class name (str) or a list of class names for models
with multiple MoE block types (e.g. qwen3_omni_moe has Thinker + Talker).
"""

import importlib

SPARSE_MOE_BLOCK = {
    # softmax -> topk routing
    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
    "qwen3_5_moe": "Qwen3_5MoeSparseMoeBlock",
    "qwen3_next": "Qwen3NextSparseMoeBlock",
    "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
    # qwen3_omni_moe: Thinker (standard) + Talker (shared experts + shared_expert_gate)
    "qwen3_omni_moe": [
        "Qwen3OmniMoeThinkerTextSparseMoeBlock",
        "Qwen3OmniMoeTalkerTextSparseMoeBlock",
    ],
    "olmoe": "OlmoeSparseMoeBlock",
    "mixtral": "MixtralSparseMoeBlock",
    "minimax": "MiniMaxSparseMoeBlock",
    # sigmoid -> topk routing (with group-based expert selection)
    "glm_moe_dsa": "GlmMoeDsaMoE",
    "deepseek_v3": "DeepseekV3MoE",
    "glm4_moe": "Glm4MoeMoE",
    "glm4_moe_lite": "Glm4MoeLiteMoE",
    "glm4v_moe": "Glm4vMoeTextMoE",
    # sigmoid -> topk routing (no group selection)
    "minimax_m2": "MiniMaxM2SparseMoeBlock",
    # Models below need custom routing (not yet implemented):
    # "ernie4_5_moe": "Ernie4_5_MoeSparseMoeBlock",  # softmax->topk, e_score_correction_bias between softmax and topk
    # "deepseek_v2": "DeepseekV2Moe",  # softmax->topk, group_limited_greedy, different attr names (num_group)
    # "hunyuan_v1_moe": "HunYuanMoEV1Moe",  # softmax->topk, gate.wg (not gate.weight), scatter routing
    # "gpt_oss": "GptOssMLP",  # topk->softmax, transposed layout [E,H,2*I], custom GLU, expert biases
}


def resolve_moe_block_classes(model_type: str):
    """Resolve all MoE block classes from transformers for the given model type.

    Returns a list of classes (one for most models, multiple for models with
    distinct MoE block types like qwen3_omni_moe).
    """
    entry = SPARSE_MOE_BLOCK.get(model_type)
    if entry is None:
        raise ValueError(
            f"Unsupported MoE model type '{model_type}'. "
            f"Supported types: {list(SPARSE_MOE_BLOCK.keys())}"
        )

    cls_names = entry if isinstance(entry, list) else [entry]
    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
    module = importlib.import_module(module_path)

    classes = []
    for cls_name in cls_names:
        moe_cls = getattr(module, cls_name, None)
        if moe_cls is None:
            raise ValueError(f"Could not find class '{cls_name}' in '{module_path}'")
        classes.append(moe_cls)

    return classes
```
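`resolve_moe_block_classes` normalizes a str-or-list mapping entry into a list of class names before importing anything. That normalization can be exercised without transformers installed; a small sketch against a stub table (`resolve_names` and the stub entries are illustrative, not the module's API):

```python
# Stub subset of the SPARSE_MOE_BLOCK table, for demonstration only
SPARSE_MOE_BLOCK_STUB = {
    "mixtral": "MixtralSparseMoeBlock",
    "qwen3_omni_moe": [
        "Qwen3OmniMoeThinkerTextSparseMoeBlock",
        "Qwen3OmniMoeTalkerTextSparseMoeBlock",
    ],
}


def resolve_names(model_type: str) -> list[str]:
    """Same str-or-list normalization as resolve_moe_block_classes, minus the import step."""
    entry = SPARSE_MOE_BLOCK_STUB.get(model_type)
    if entry is None:
        raise ValueError(f"Unsupported MoE model type '{model_type}'.")
    return entry if isinstance(entry, list) else [entry]


# Single-block models yield a one-element list; multi-block models keep all entries
assert resolve_names("mixtral") == ["MixtralSparseMoeBlock"]
assert len(resolve_names("qwen3_omni_moe")) == 2
```

Returning a list in both cases lets the caller (e.g. `_kernelize_model` in plugin.py) loop uniformly instead of special-casing multi-block models.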

src/axolotl/integrations/kernels/plugin.py

Lines changed: 81 additions & 26 deletions

```diff
@@ -1,14 +1,59 @@
+import importlib
+import os
 from pathlib import Path
 
-from kernels import (
-    LocalLayerRepository,
-    Mode,
-    register_kernel_mapping,
-    replace_kernel_forward_from_hub,
-)
+import torch
 
 from axolotl.integrations.base import BasePlugin
-from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def _check_sonicmoe_gpu_compat():
+    """Validate GPU compute capability for SonicMoE and configure env.
+
+    Supported: Hopper (sm_90), Blackwell (sm_100 - sm_103).
+    B300 (sm_103) additionally requires Triton 3.6.0.
+    """
+    if not torch.cuda.is_available():
+        return
+
+    cc = torch.cuda.get_device_capability()
+
+    if cc < (9, 0):
+        raise RuntimeError(
+            f"SonicMoE requires Hopper (sm_90) or Blackwell (sm_100+) GPU, "
+            f"but detected sm_{cc[0]}{cc[1]}."
+        )
+
+    if cc > (10, 3):
+        raise RuntimeError(
+            f"SonicMoE does not yet support sm_{cc[0]}{cc[1]}. "
+            f"Supported: Hopper (sm_90) and Blackwell (sm_100 - sm_103)."
+        )
+
+    # Blackwell (sm_100+): enable QuACK GEMM kernels
+    if cc >= (10, 0):
+        os.environ.setdefault("USE_QUACK_GEMM", "1")
+        LOG.info(
+            f"Blackwell GPU (sm_{cc[0]}{cc[1]}) detected, enabling USE_QUACK_GEMM=1"
+        )
+
+    # B300 (sm_103): requires Triton 3.6.0
+    if cc == (10, 3):
+        triton_spec = importlib.util.find_spec("triton")
+        if triton_spec is None:
+            raise RuntimeError(
+                "B300 (sm_103) requires Triton 3.6.0, but Triton is not installed."
+            )
+        import triton
+
+        triton_version = tuple(int(x) for x in triton.__version__.split(".")[:2])
+        if triton_version != (3, 6):
+            raise RuntimeError(
+                f"B300 (sm_103) requires Triton 3.6.x, but found {triton.__version__}."
+            )
 
 
 class KernelsPlugin(BasePlugin):
@@ -19,8 +64,32 @@ def pre_model_load(self, cfg):
         if cfg.use_scattermoe:
             self._register_kernels()
             self._kernelize_model(cfg.model_config_type)
+        elif cfg.use_sonicmoe:
+            if not importlib.util.find_spec("sonicmoe"):
+                raise RuntimeError(
+                    "SonicMoE is not installed. See installation instructions at "
+                    "https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/integrations/kernels/README.md#sonicmoe-installation"
+                )
+
+            _check_sonicmoe_gpu_compat()
+
+            from axolotl.integrations.kernels.sonicmoe import patch_sonicmoe
+
+            LOG.info(
+                f"Applying SonicMoE patches for model type: {cfg.model_config_type}"
+            )
+            patch_sonicmoe(
+                cfg.model_config_type,
+                torch_compile=bool(getattr(cfg, "torch_compile", False)),
+            )
 
     def _register_kernels(self):
+        from kernels import (
+            LocalLayerRepository,
+            Mode,
+            register_kernel_mapping,
+        )
+
         plugin_root = Path(__file__).parent
         register_kernel_mapping(
             {
@@ -42,25 +111,11 @@ def _register_kernels(self):
         )
 
     def _kernelize_model(self, model_type: str):
-        if model_type == "olmoe":
-            from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock
+        from kernels import replace_kernel_forward_from_hub
+
+        from axolotl.integrations.kernels.constants import resolve_moe_block_classes
 
+        for model_moe_cls in resolve_moe_block_classes(model_type):
             replace_kernel_forward_from_hub(
-                OlmoeSparseMoeBlock, "HFScatterMoEParallelExperts"
+                model_moe_cls, "HFScatterMoEParallelExperts"
             )
-        else:
-            try:
-                model_moe_cls = get_model_moe_block(model_type)
-                replace_kernel_forward_from_hub(
-                    model_moe_cls, "HFScatterMoEParallelExperts"
-                )
-            except Exception as err:
-                raise ValueError(f"Unsupported model type: {model_type}") from err
-
-
-def get_model_moe_block(model_type: str):
-    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-    model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
-    module = __import__(module_path, fromlist=[f"{model_cls_prefix}SparseMoeBlock"])
-    model_cls = getattr(module, f"{model_cls_prefix}SparseMoeBlock")
-    return model_cls
```
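The capability gate in `_check_sonicmoe_gpu_compat` reduces to tuple comparisons on the `(major, minor)` pair returned by `torch.cuda.get_device_capability()`. A GPU-free sketch of the same bounds — `classify_compute_capability` is an illustrative name, not the plugin's API:

```python
def classify_compute_capability(cc: tuple[int, int]) -> str:
    """Apply the same sm_XX bounds as the plugin's GPU check.

    Tuples compare lexicographically, so (9, 0) <= cc <= (10, 3) expresses
    "Hopper through Blackwell B300" directly.
    """
    if cc < (9, 0):
        raise RuntimeError(f"SonicMoE requires sm_90+, detected sm_{cc[0]}{cc[1]}.")
    if cc > (10, 3):
        raise RuntimeError(f"sm_{cc[0]}{cc[1]} not yet supported (max sm_103).")
    if cc >= (10, 0):
        # The plugin additionally sets USE_QUACK_GEMM=1 on this path
        return "blackwell"
    return "hopper"


assert classify_compute_capability((9, 0)) == "hopper"      # H100/H200
assert classify_compute_capability((10, 0)) == "blackwell"  # B200
```

Isolating the comparisons like this is roughly what the commit's "routing smoke test" needs: the bounds can be unit-tested on any machine, while the env-var and Triton-version side effects stay in the plugin.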
src/axolotl/integrations/kernels/sonicmoe/__init__.py (new file)

Lines changed: 3 additions & 0 deletions

```python
from .patch import patch_sonicmoe

__all__ = ["patch_sonicmoe"]
```
