Skip to content

Commit 50720e0

Browse files
authored
Merge branch 'main' into move-gptq-to-modifiers
2 parents 5c4ed3c + 778abe8 commit 50720e0

File tree

7 files changed

+42
-103
lines changed

7 files changed

+42
-103
lines changed

.MAINTAINERS

Lines changed: 0 additions & 17 deletions
This file was deleted.

examples/finetuning/configure_fsdp.md

Lines changed: 0 additions & 15 deletions
This file was deleted.

examples/finetuning/example_alternating_recipe.yaml

Lines changed: 0 additions & 32 deletions
This file was deleted.

examples/finetuning/example_fsdp_config.yaml

Lines changed: 0 additions & 24 deletions
This file was deleted.

examples/finetuning/example_single_gpu_config.yaml

Lines changed: 0 additions & 15 deletions
This file was deleted.

src/llmcompressor/modifiers/awq/mappings.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,30 @@ class AWQMapping:
     ),
 ]
 
+# AFMOE uses dual normalization: pre_mlp_layernorm feeds the MLP
+# (not post_attention_layernorm) and attention has its own gate_proj
+# for gating mechanism
+_afmoe_mappings = [
+    AWQMapping(
+        "re:.*input_layernorm$",
+        [
+            "re:.*self_attn.q_proj$",
+            "re:.*self_attn.k_proj$",
+            "re:.*self_attn.v_proj$",
+            "re:.*self_attn.gate_proj$",
+        ],
+    ),
+    AWQMapping("re:.*v_proj$", ["re:.*o_proj$"]),
+    AWQMapping(
+        "re:.*pre_mlp_layernorm$",
+        ["re:.*mlp.*gate_proj$", "re:.*mlp.*up_proj$"],
+    ),
+    AWQMapping(
+        "re:.*up_proj$",
+        ["re:.*down_proj$"],
+    ),
+]
+
 # Example mapping for MoE models with parallel transformer blocks, where
 # attention and MoE share the same input. This is the only case where
 # activation_hook_target is needed. Without it, the hook lands on
@@ -217,6 +241,7 @@ class AWQMapping:
 ]
 
 AWQ_MAPPING_REGISTRY: dict[str, list[AWQMapping]] = {
+    "AfmoeForCausalLM": _afmoe_mappings,
     "BloomForCausalLM": _bloom_mappings,
     "CohereForCausalLM": _cohere_mappings,
     "Cohere2ForCausalLM": _cohere_mappings,

src/llmcompressor/modifiers/transform/smoothquant/utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,22 @@
     ),
 ]
 
+AFMOE_SMOOTHQUANT_MAPPINGS: list[LayerMap] = [
+    LayerMap(
+        balance_layers=[
+            "re:.*self_attn\\.q_proj",
+            "re:.*self_attn\\.k_proj",
+            "re:.*self_attn\\.v_proj",
+            "re:.*self_attn\\.gate_proj",
+        ],
+        smooth_layers="re:.*input_layernorm",
+    ),
+    LayerMap(
+        balance_layers=["re:.*mlp.*gate_proj", "re:.*mlp.*up_proj"],
+        smooth_layers="re:.*pre_mlp_layernorm",
+    ),
+]
+
 # Registry of layer mappings for different architectures
 # Add more mappings here
@@ -85,6 +101,7 @@
     "Qwen2ForCausalLM": DEFAULT_SMOOTHQUANT_MAPPINGS,
     "Qwen3ForCausalLM": DEFAULT_SMOOTHQUANT_MAPPINGS,
     "WhisperForConditionalGeneration": WHISPER_V2_SMOOTHQUANT_MAPPINGS,
+    "AfmoeForCausalLM": AFMOE_SMOOTHQUANT_MAPPINGS,
 }

0 commit comments

Comments (0)