Skip to content

Commit 1a2eb2e

Browse files
Afmoe support (#2243)
* add afmoe support * add afmoe.py * mod readme.md
1 parent f819e9a commit 1a2eb2e

File tree

7 files changed

+55
-19
lines changed

7 files changed

+55
-19
lines changed

README.md

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -178,22 +178,21 @@ Native support for some of the most popular multi-modal models:
178178
<img src=https://github.com/user-attachments/assets/c1b89394-f8f6-44e5-9949-bef15a124723 width="51%"> <img src=https://github.com/user-attachments/assets/23901236-10c5-4435-ac2f-06cf2e097f1e width="47%">
179179

180180
## Model Support
181-
| Model | | | | | | | | | |
182-
|-------------------|---|-------------|---|---------------|--|-----------|--|-----------------|--|
183-
| Apertus || EXAONE 3.0 || InternLM 1/2.5 || Mixtral || Qwen 2/3 (Next/MoE) ||
184-
| Baichuan || Falcon (H1) || Kimi K2 || MobileLLM || Qwen 2/2.5/3 VL ||
185-
| Bloom || FastVLM || Klear || MOSS || Qwen 2.5/3 Omni ||
186-
| ChatGLM || Gemma 1/2/3 || LING/RING || MPT || RefinedWeb ||
187-
| CodeGen || GPTBigCod || Llama 1-3.3 || Nemotron H || StableLM ||
188-
| Cohere 1-2 || GPTQ-Neo(X) || Llama 3.2 VL || Nemotron Ultra || StarCoder2 ||
189-
| DBRX Converted || GPT-2 || Llama 4 || OPT || TeleChat2 ||
190-
| Deci || GPT-J || LongCatFlash || OLMo2 || Yi ||
191-
| DeepSeek-V2/V3/R1 || GPT-OSS || LongLLaMA || Ovis 1.6/2 || Seed-OSS ||
192-
| DeepSeek-V2-Lite || Granite || Instella || Phi 1-4 || XVERSE ||
193-
| Dream || GRIN-MoE || MiniCPM3 || PanGu-α || Minimax M2 ||
194-
| ERNIE 4.5 || Hymba || Mistral || Qwen 1/2/3 || GLM 4.X ||
195-
| Brumby || Dots1 || Mistral3 || | | | |
196-
181+
| Model | | | | | | | | | |
182+
|-------------------|---|-------------|---|----------------|---|---------------------|---|-----------------|---|
183+
| Apertus || EXAONE 3.0 || InternLM 1/2.5 || Mixtral || Qwen 2/2.5/3 VL ||
184+
| Baichuan || Falcon (H1) || Kimi K2 || MobileLLM || Qwen 2.5/3 Omni ||
185+
| Bloom || FastVLM || Klear || MOSS || RefinedWeb ||
186+
| ChatGLM || Gemma 1/2/3 || LING/RING || MPT || StableLM ||
187+
| CodeGen || GPTBigCode || Llama 1-3.3 || Nemotron H || StarCoder2 ||
188+
| Cohere 1-2 || GPT-Neo(X) || Llama 3.2 VL || Nemotron Ultra || TeleChat2 ||
189+
| DBRX Converted || GPT-2 || Llama 4 || OPT || Trinity ||
190+
| Deci || GPT-J || LongCatFlash || OLMo2 || Yi ||
191+
| DeepSeek-V2/V3/R1 || GPT-OSS || LongLLaMA || Ovis 1.6/2 || Seed-OSS ||
192+
| DeepSeek-V2-Lite || Granite || Instella || Phi 1-4 || XVERSE ||
193+
| Dream || GRIN-MoE || MiniCPM3 || PanGu-α || Minimax M2 ||
194+
| ERNIE 4.5 || Hymba || Mistral || Qwen 1/2/3 || GLM 4.X ||
195+
| Brumby || Dots1 || Mistral3 || Qwen 2/3 (Next/MoE) || | |
197196

198197
## Platform and HW Support
199198

gptqmodel/models/auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
from ..utils.model import find_modules # noqa: E402
7171
from ..utils.torch import CPU, torch_empty_cache # noqa: E402
7272
from .base import BaseQModel, QuantizeConfig # noqa: E402
73+
from .definitions.afmoe import AfMoeQModel # noqa: E402
7374
from .definitions.apertus import ApertusQModel # noqa: E402
7475
from .definitions.baichuan import BaiChuanQModel # noqa: E402
7576
from .definitions.bailing_moe import BailingMoeQModel # noqa: E402
@@ -246,6 +247,7 @@
246247
"bailing_moe": BailingMoeQModel,
247248
"lfm2_moe": LFM2MoeQModel,
248249
"mistral3": Mistral3GPTQ,
250+
"afmoe": AfMoeQModel,
249251
}
250252

251253
SUPPORTED_MODELS = list(MODEL_MAP.keys())

gptqmodel/models/base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@
5151
from ..utils.importer import select_quant_linear
5252
from ..utils.logger import QuantizationRegionTimer, setup_logger
5353
from ..utils.model import MODALITY, find_modules, get_module_by_name_prefix, move_to
54-
from ..utils.offload import offload_to_disk
5554
from ..utils.structure import alias_from_turtle_for_submodule
5655
from ..utils.torch import TORCH_HAS_COMPILE, torch_compile
5756
from ._const import (

gptqmodel/models/definitions/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,4 @@
7070
from .nemotron_h import NemotronHQModel
7171
from .qwen3_omni_moe import Qwen3OmniMoeGPTQ
7272
from .mistral3 import Mistral3GPTQ
73+
from .afmoe import AfMoeQModel
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2+
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
3+
# SPDX-License-Identifier: Apache-2.0
4+
# Contact: qubitium@modelcloud.ai, x.com/qubitium
5+
6+
from ..base import BaseQModel
7+
8+
9+
class AfMoeQModel(BaseQModel):
10+
# allow dynamic expert index for layer_modules so we don't need to write out 64 layers here
11+
# config.num_experts contains the actual expert count used for index
12+
dynamic_expert_index = "num_experts"
13+
14+
require_trust_remote_code = True
15+
layer_modules_strict = False
16+
17+
pre_lm_head_norm_module = "model.norm"
18+
19+
module_tree = [
20+
"model",
21+
"layers",
22+
"#",
23+
{
24+
"input_layernorm": ("input_layernorm:!",),
25+
"self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
26+
"post_attention_layernorm": ("post_attention_layernorm:!",),
27+
"mlp:?": {
28+
"gate": ("gate:!",),
29+
"shared_expert:0": ("gate_proj:0", "up_proj:0", "down_proj:1"),
30+
"experts:0": {
31+
"#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
32+
},
33+
},
34+
}
35+
]

gptqmodel/utils/importer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from ..nn_modules.qlinear.exllamav2_awq import AwqExllamaV2QuantLinear
2222
from ..nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear
2323
from ..nn_modules.qlinear.gemm_awq_triton import AwqGEMMTritonQuantLinear
24+
from ..nn_modules.qlinear.gemm_hf_kernel import HFKernelLinear
2425
from ..nn_modules.qlinear.gemv_awq import AwqGEMVQuantLinear
2526
from ..nn_modules.qlinear.gemv_fast_awq import AwqGEMVFastQuantLinear
2627
from ..nn_modules.qlinear.machete import MacheteQuantLinear
@@ -32,7 +33,6 @@
3233
from ..nn_modules.qlinear.torch_awq import AwqTorchQuantLinear
3334
from ..nn_modules.qlinear.torch_fused import TorchFusedQuantLinear
3435
from ..nn_modules.qlinear.torch_fused_awq import TorchFusedAwqQuantLinear
35-
from ..nn_modules.qlinear.gemm_hf_kernel import HFKernelLinear
3636
from ..nn_modules.qlinear.tritonv2 import TRITON_AVAILABLE, TRITON_INSTALL_HINT, TritonV2QuantLinear
3737
from ..quantization import FORMAT, METHOD
3838
from ..utils.logger import setup_logger

tests/test_kernel_output_intel_cpu_xpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
from torch import Tensor
1313

1414
from gptqmodel import BACKEND, GPTQModel
15+
from gptqmodel.nn_modules.qlinear.gemm_hf_kernel import HFKernelLinear
1516
from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear
1617
from gptqmodel.nn_modules.qlinear.torch_fused import TorchFusedQuantLinear
17-
from gptqmodel.nn_modules.qlinear.gemm_hf_kernel import HFKernelLinear
1818
from gptqmodel.utils.model import find_modules
1919

2020

0 commit comments

Comments
 (0)