Skip to content

Commit 1a2eb2e

Browse files
Afmoe support (#2243)
* add afmoe support * add afmoe.py * mod readme.md
1 parent f819e9a commit 1a2eb2e

File tree

7 files changed

+55
-19
lines changed

7 files changed

+55
-19
lines changed

README.md

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -178,22 +178,21 @@ Native support for some of the most popular multi-modal models:
178178
<img src=https://github.com/user-attachments/assets/c1b89394-f8f6-44e5-9949-bef15a124723 width="51%"> <img src=https://github.com/user-attachments/assets/23901236-10c5-4435-ac2f-06cf2e097f1e width="47%">
179179

180180
## Model Support
181-
| Model | | | | | | | | | |
182-
|-------------------|---|-------------|---|---------------|--|-----------|--|-----------------|--|
183-
| Apertus || EXAONE 3.0 || InternLM 1/2.5 || Mixtral || Qwen 2/3 (Next/MoE) ||
184-
| Baichuan || Falcon (H1) || Kimi K2 || MobileLLM || Qwen 2/2.5/3 VL ||
185-
| Bloom || FastVLM || Klear || MOSS || Qwen 2.5/3 Omni ||
186-
| ChatGLM || Gemma 1/2/3 || LING/RING || MPT || RefinedWeb ||
187-
| CodeGen || GPTBigCod || Llama 1-3.3 || Nemotron H || StableLM ||
188-
| Cohere 1-2 || GPTQ-Neo(X) || Llama 3.2 VL || Nemotron Ultra || StarCoder2 ||
189-
| DBRX Converted || GPT-2 || Llama 4 || OPT || TeleChat2 ||
190-
| Deci || GPT-J || LongCatFlash || OLMo2 || Yi ||
191-
| DeepSeek-V2/V3/R1 || GPT-OSS || LongLLaMA || Ovis 1.6/2 || Seed-OSS ||
192-
| DeepSeek-V2-Lite || Granite || Instella || Phi 1-4 || XVERSE ||
193-
| Dream || GRIN-MoE || MiniCPM3 || PanGu-α || Minimax M2 ||
194-
| ERNIE 4.5 || Hymba || Mistral || Qwen 1/2/3 || GLM 4.X ||
195-
| Brumby || Dots1 || Mistral3 || | | | |
196-
181+
| Model | | | | | | | | | |
182+
|-------------------|---|-------------|---|----------------|---|---------------------|---|-----------------|---|
183+
| Apertus || EXAONE 3.0 || InternLM 1/2.5 || Mixtral || Qwen 2/2.5/3 VL ||
184+
| Baichuan || Falcon (H1) || Kimi K2 || MobileLLM || Qwen 2.5/3 Omni ||
185+
| Bloom || FastVLM || Klear || MOSS || RefinedWeb ||
186+
| ChatGLM || Gemma 1/2/3 || LING/RING || MPT || StableLM ||
187+
| CodeGen || GPTBigCode || Llama 1-3.3 || Nemotron H || StarCoder2 ||
188+
| Cohere 1-2 || GPT-Neo(X) || Llama 3.2 VL || Nemotron Ultra || TeleChat2 ||
189+
| DBRX Converted || GPT-2 || Llama 4 || OPT || Trinity ||
190+
| Deci || GPT-J || LongCatFlash || OLMo2 || Yi ||
191+
| DeepSeek-V2/V3/R1 || GPT-OSS || LongLLaMA || Ovis 1.6/2 || Seed-OSS ||
192+
| DeepSeek-V2-Lite || Granite || Instella || Phi 1-4 || XVERSE ||
193+
| Dream || GRIN-MoE || MiniCPM3 || PanGu-α || Minimax M2 ||
194+
| ERNIE 4.5 || Hymba || Mistral || Qwen 1/2/3 || GLM 4.X ||
195+
| Brumby || Dots1 || Mistral3 || Qwen 2/3 (Next/MoE) || | |
197196

198197
## Platform and HW Support
199198

gptqmodel/models/auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
from ..utils.model import find_modules # noqa: E402
7171
from ..utils.torch import CPU, torch_empty_cache # noqa: E402
7272
from .base import BaseQModel, QuantizeConfig # noqa: E402
73+
from .definitions.afmoe import AfMoeQModel # noqa: E402
7374
from .definitions.apertus import ApertusQModel # noqa: E402
7475
from .definitions.baichuan import BaiChuanQModel # noqa: E402
7576
from .definitions.bailing_moe import BailingMoeQModel # noqa: E402
@@ -246,6 +247,7 @@
246247
"bailing_moe": BailingMoeQModel,
247248
"lfm2_moe": LFM2MoeQModel,
248249
"mistral3": Mistral3GPTQ,
250+
"afmoe": AfMoeQModel,
249251
}
250252

251253
SUPPORTED_MODELS = list(MODEL_MAP.keys())

gptqmodel/models/base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@
5151
from ..utils.importer import select_quant_linear
5252
from ..utils.logger import QuantizationRegionTimer, setup_logger
5353
from ..utils.model import MODALITY, find_modules, get_module_by_name_prefix, move_to
54-
from ..utils.offload import offload_to_disk
5554
from ..utils.structure import alias_from_turtle_for_submodule
5655
from ..utils.torch import TORCH_HAS_COMPILE, torch_compile
5756
from ._const import (

gptqmodel/models/definitions/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,4 @@
7070
from .nemotron_h import NemotronHQModel
7171
from .qwen3_omni_moe import Qwen3OmniMoeGPTQ
7272
from .mistral3 import Mistral3GPTQ
73+
from .afmoe import AfMoeQModel
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2+
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
3+
# SPDX-License-Identifier: Apache-2.0
4+
# Contact: qubitium@modelcloud.ai, x.com/qubitium
5+
6+
from ..base import BaseQModel
7+
8+
9+
class AfMoeQModel(BaseQModel):
10+
# allow dynamic expert index for layer_modules so we don't need to write out 64 layers here
11+
# config.num_experts contains the actual expert count used for index
12+
dynamic_expert_index = "num_experts"
13+
14+
require_trust_remote_code = True
15+
layer_modules_strict = False
16+
17+
pre_lm_head_norm_module = "model.norm"
18+
19+
module_tree = [
20+
"model",
21+
"layers",
22+
"#",
23+
{
24+
"input_layernorm": ("input_layernorm:!",),
25+
"self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
26+
"post_attention_layernorm": ("post_attention_layernorm:!",),
27+
"mlp:?": {
28+
"gate": ("gate:!",),
29+
"shared_expert:0": ("gate_proj:0", "up_proj:0", "down_proj:1"),
30+
"experts:0": {
31+
"#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
32+
},
33+
},
34+
}
35+
]

gptqmodel/utils/importer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from ..nn_modules.qlinear.exllamav2_awq import AwqExllamaV2QuantLinear
2222
from ..nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear
2323
from ..nn_modules.qlinear.gemm_awq_triton import AwqGEMMTritonQuantLinear
24+
from ..nn_modules.qlinear.gemm_hf_kernel import HFKernelLinear
2425
from ..nn_modules.qlinear.gemv_awq import AwqGEMVQuantLinear
2526
from ..nn_modules.qlinear.gemv_fast_awq import AwqGEMVFastQuantLinear
2627
from ..nn_modules.qlinear.machete import MacheteQuantLinear
@@ -32,7 +33,6 @@
3233
from ..nn_modules.qlinear.torch_awq import AwqTorchQuantLinear
3334
from ..nn_modules.qlinear.torch_fused import TorchFusedQuantLinear
3435
from ..nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear
35-
from ..nn_modules.qlinear.gemm_hf_kernel import HFKernelLinear
3636
from ..nn_modules.qlinear.tritonv2 import TRITON_AVAILABLE, TRITON_INSTALL_HINT, TritonV2QuantLinear
3737
from ..quantization import FORMAT, METHOD
3838
from ..utils.logger import setup_logger

tests/test_kernel_output_intel_cpu_xpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
from torch import Tensor
1313

1414
from gptqmodel import BACKEND, GPTQModel
15+
from gptqmodel.nn_modules.qlinear.gemm_hf_kernel import HFKernelLinear
1516
from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear
1617
from gptqmodel.nn_modules.qlinear.torch_fused import TorchFusedQuantLinear
17-
from gptqmodel.nn_modules.qlinear.gemm_hf_kernel import HFKernelLinear
1818
from gptqmodel.utils.model import find_modules
1919

2020

0 commit comments

Comments
 (0)