
Commit 61bcfdb

Add MXFP8 MOE/Linear and MXFP4 Linear (#1034)
Signed-off-by: yiliu30 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 7345fe5 commit 61bcfdb

17 files changed: +915 / -126 lines
Lines changed: 12 additions & 0 deletions (new file; filename not shown in this view)

- Build and Install vLLM

```
git clone --branch fused-moe-ar https://github.com/yiliu30/vllm-fork.git
VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
```

- Enable vLLM-Ext at Runtime

```bash
VLLM_ENABLE_AR_EXT=1 vllm serve ...
```

auto_round_extension/vllm_ext/__init__.py

Lines changed: 5 additions & 5 deletions

@@ -18,9 +18,9 @@
 
 
 def apply():
-    import vllm.model_executor.layers.quantization.auto_round as auto_round_module
+    import auto_round_extension.vllm_ext.auto_round_ext
+    import auto_round_extension.vllm_ext.envs_ext
 
-    from auto_round_extension.vllm_ext.auto_round_ext import AutoRoundExtensionConfig
-
-    auto_round_module.AutoRoundConfig = AutoRoundExtensionConfig
-    from auto_round_extension.vllm_ext.envs_ext import extra_environment_variables
+    print("*****************************************************************************")
+    print("* !!! VLLM_ENABLE_AR_EXT is set to 1, applying auto_round_vllm_extension *")
+    print("*****************************************************************************")

auto_round_extension/vllm_ext/apply_ext.sh

Lines changed: 0 additions & 46 deletions
This file was deleted.

auto_round_extension/vllm_ext/auto_round_ext.py

Lines changed: 13 additions & 6 deletions

@@ -18,25 +18,26 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
-from vllm.model_executor.layers.quantization.auto_round import AutoRoundConfig
+from vllm.model_executor.layers.quantization.auto_round import AutoRoundConfig as _BaseAutoRoundConfig
 
 from auto_round.schemes import QuantizationScheme
+from auto_round_extension.vllm_ext.quant_method_linear import AutoRoundQuantLinearMethod
 from auto_round_extension.vllm_ext.quant_method_moe import AutoRoundMoEMethod
 
 logger = init_logger(__name__)
 
 
-class AutoRoundExtensionConfig(AutoRoundConfig):
-    SUPPORTED_DTYPES = AutoRoundConfig.SUPPORTED_DTYPES.union({"mx_fp"})
-    SUPPORTED_FORMATS = AutoRoundConfig.SUPPORTED_FORMATS.union({"auto_round:llm_compressor"})
+class AutoRoundExtensionConfig(_BaseAutoRoundConfig):
+    SUPPORTED_DTYPES = _BaseAutoRoundConfig.SUPPORTED_DTYPES.union({"mx_fp"})
+    SUPPORTED_FORMATS = _BaseAutoRoundConfig.SUPPORTED_FORMATS.union({"auto_round:llm_compressor"})
 
     def get_quant_method(self, layer: torch.nn.Module, prefix: str):
         # FIXME: (yi) make it compatible with `AutoRoundConfig`
         if isinstance(layer, FusedMoE):
             quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix)
             return quant_method
         elif isinstance(layer, LinearBase):
-            return UnquantizedLinearMethod()
+            return AutoRoundQuantLinearMethod.get_method(self, layer, prefix)
         else:
             return None
 
@@ -48,7 +49,7 @@ def _parse_quant_scheme(config: dict):
         return quant_scheme
 
     @classmethod
-    def from_config(cls, config: dict[str, Any]) -> AutoRoundConfig:
+    def from_config(cls, config: dict[str, Any]) -> _BaseAutoRoundConfig:
         ar_config = super().from_config(config)
         # TODO: (yi) refine below implementation
         quant_scheme = AutoRoundExtensionConfig._parse_quant_scheme(config)
@@ -61,3 +62,9 @@ def from_config(cls, config: dict[str, Any]) -> AutoRoundConfig:
         ar_config.quant_scheme = quant_scheme
         ar_config.layer_schemes = layer_schemes
         return ar_config
+
+
+# Patch vLLM's AutoRoundConfig at import time
+import vllm.model_executor.layers.quantization.auto_round as _auto_round_module
+
+_auto_round_module.AutoRoundConfig = AutoRoundExtensionConfig
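
Note: the module-level assignment above replaces the `AutoRoundConfig` attribute on vLLM's `auto_round` module, so only lookups made after this module is imported see the extension class; names bound earlier keep pointing at the stock config. A small sketch of that binding behaviour, assuming the extension package is importable and nothing has cached the old class:

```python
# Hedged sketch of the monkey-patch semantics; not part of the commit.
import vllm.model_executor.layers.quantization.auto_round as ar_mod

before_patch = ar_mod.AutoRoundConfig  # stock vLLM class

# Importing the extension module runs the assignment shown above.
import auto_round_extension.vllm_ext.auto_round_ext  # noqa: F401

after_patch = ar_mod.AutoRoundConfig  # now AutoRoundExtensionConfig
assert after_patch is not before_patch
```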

auto_round_extension/vllm_ext/envs_ext.py

Lines changed: 5 additions & 3 deletions

@@ -21,9 +21,11 @@
 
 # Define extra environment variables
 extra_environment_variables: dict[str, Callable[[], Any]] = {
-    "VLLM_MXFP4_PRE_UNPACK_WEIGHTS": lambda: os.getenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "1") in ("1", "true", "True"),
-    "VLLM_ENABLE_STATIC_MOE": lambda: os.getenv("VLLM_ENABLE_STATIC_MOE", "1") in ("1", "true", "True"),
-    "VLLM_AR_MXFP4_MODULAR_MOE": lambda: os.getenv("VLLM_AR_MXFP4_MODULAR_MOE", "0") in ("1", "true", "True"),
+    "VLLM_MXFP4_PRE_UNPACK_WEIGHTS": lambda: os.getenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") in ("1", "true", "True"),
+    "VLLM_MXFP4_PRE_UNPACK_TO_FP8": lambda: os.getenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") in ("1", "true", "True"),
+    "VLLM_ENABLE_STATIC_MOE": lambda: os.getenv("VLLM_ENABLE_STATIC_MOE", "0") in ("1", "true", "True"),
+    "VLLM_AR_MXFP4_MODULAR_MOE": lambda: os.getenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") in ("1", "true", "True"),
+    "VLLM_AR_POST_PROCESS_GPTOSS": lambda: os.getenv("VLLM_AR_POST_PROCESS_GPTOSS", "0") in ("1", "true", "True"),
 }
 # Add the extra environment variables to vllm.envs
 import vllm.envs as envs
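
Note: the hunk is truncated right after `import vllm.envs as envs`, so the registration code itself is not visible here. A plausible sketch of how the extra variables could be merged into vLLM's environment-variable registry follows; the use of `envs.environment_variables` is an assumption about vLLM's internals, not something confirmed by this commit:

```python
# Hedged sketch: merge the extension's env vars into vllm.envs so that
# envs.VLLM_MXFP4_PRE_UNPACK_WEIGHTS and friends resolve like built-in flags.
import vllm.envs as envs

for _name, _getter in extra_environment_variables.items():
    if _name not in envs.environment_variables:
        envs.environment_variables[_name] = _getter
```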

auto_round_extension/vllm_ext/fp4_utils.py

Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
     indices = indices.reshape(-1)
 
     # Handle odd length by padding if necessary
-    assert indices.numel() % 2 != 0, f"Expected even number of elements, got {indices.numel()}"
+    # assert indices.numel() % 2 != 0, f"Expected even number of elements, got {indices.numel()}"
 
     # Reshape to pair consecutive elements
     indices = indices.reshape(-1, 2)
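
Note: `pack_fp4_to_uint8` pairs consecutive 4-bit codes into single bytes, which is why the element count must be even. The disabled assert had its condition inverted (it fired on even counts while the message demanded an even count), and the commit comments it out rather than correcting it. A minimal sketch of the nibble-packing idea; the low/high nibble order is an assumption, not necessarily the layout fp4_utils.py uses:

```python
import torch


def pack_nibbles_sketch(indices: torch.Tensor) -> torch.Tensor:
    """Pack pairs of 4-bit codes (values 0..15) into uint8, low nibble first."""
    flat = indices.reshape(-1).to(torch.uint8)
    assert flat.numel() % 2 == 0, "need an even number of 4-bit codes"
    pairs = flat.reshape(-1, 2)
    # Byte = second code in the high nibble, first code in the low nibble.
    return (pairs[:, 1] << 4) | pairs[:, 0]
```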
Lines changed: 129 additions & 0 deletions (new file; filename not shown in this view)

# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# SPDX-License-Identifier: Apache-2.0
from typing import Callable, Optional

import torch
import vllm.envs as envs
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
from vllm.model_executor.parameter import GroupQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter
from vllm.platforms import current_platform

from auto_round_extension.vllm_ext.mxfp4_qdq_utils import (
    dequant_mxfp4_to_fp8,
    mxfp4_gemm_with_unpacked_weight,
    run_mxfp4_emulations,
)

logger = init_logger(__name__)

__all__ = ["AutoRoundMXFP4LinearImpl"]

from auto_round_extension.vllm_ext.quant_impl import AutoRoundQuantImpl


class AutoRoundMXFP4LinearImpl(AutoRoundQuantImpl):
    def __init__(self, quant_scheme):
        self.quant_scheme = quant_scheme
        self.group_size = 32

    @classmethod
    def get_min_capability(cls) -> int:
        if envs.VLLM_USE_MXFP4_CT_EMULATIONS:
            return 80
        return 100

    def create_weights(
        self,
        layer: torch.nn.Module,
        output_partition_sizes: list[int],
        input_size_per_partition: int,
        params_dtype: torch.dtype,
        weight_loader: Callable,
        **kwargs,
    ):
        output_size_per_partition = sum(output_partition_sizes)
        layer.logical_widths = output_partition_sizes
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition

        # Weight
        weight = ModelWeightParameter(
            data=torch.empty(sum(output_partition_sizes), input_size_per_partition // 2, dtype=torch.uint8),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight_packed", weight)

        # Per Group Weight Scale
        weight_scale = GroupQuantScaleParameter(
            data=torch.empty(
                sum(output_partition_sizes),
                input_size_per_partition // self.group_size,
                dtype=torch.uint8,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )

        layer.register_parameter("weight_scale", weight_scale)

    def process_weights_after_loading(self, layer) -> None:
        # FIXME: may dequant to bf16
        if envs.VLLM_MXFP4_PRE_UNPACK_WEIGHTS:
            weight_fp8, scale_bf16 = dequant_mxfp4_to_fp8(
                data_lp=layer.weight_packed,
                scale_e8m0=layer.weight_scale,
            )
            del layer.weight_packed
            del layer.weight_scale
            layer.weight_packed = None
            layer.weight_scale = None
            layer.register_parameter(
                "weight_unpacked_fp8",
                torch.nn.Parameter(
                    weight_fp8,
                    requires_grad=False,
                ),
            )
            layer.register_parameter(
                "weight_scale_bf16",
                torch.nn.Parameter(
                    scale_bf16,
                    requires_grad=False,
                ),
            )

    def apply_weights(
        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        if not envs.VLLM_MXFP4_PRE_UNPACK_WEIGHTS:
            out = run_mxfp4_emulations(x=x, weight=layer.weight_packed, weight_scale=layer.weight_scale)
            if bias is not None:
                out = out + bias
            return out
        else:
            out = mxfp4_gemm_with_unpacked_weight(
                x=x,
                weight_fp8=layer.weight_unpacked_fp8,
                weight_scale_bf16=layer.weight_scale_bf16,
                bias=bias,
            )
            return out
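
Note: this implementation keeps MXFP4 weights as packed 4-bit E2M1 codes (`weight_packed`, two codes per byte) plus one E8M0 exponent per group of 32 values (`weight_scale`), and either emulates the GEMM directly or pre-unpacks to FP8 depending on `VLLM_MXFP4_PRE_UNPACK_WEIGHTS`. For readers unfamiliar with the format, the sketch below shows the basic group dequantization math; the value table and the 127 bias follow the OCP Microscaling (MX) spec, and the helper name is illustrative rather than the one exported by `mxfp4_qdq_utils`:

```python
import torch

# The 16 representable E2M1 (FP4) values per the OCP Microscaling spec.
_FP4_VALUES = torch.tensor(
    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
     -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0]
)


def dequant_mxfp4_sketch(codes: torch.Tensor, scale_e8m0: torch.Tensor,
                         group_size: int = 32) -> torch.Tensor:
    """Hedged sketch of MXFP4 group dequantization (not the project's kernel).

    codes:      unpacked 4-bit indices, shape [rows, cols]
    scale_e8m0: biased power-of-two exponents (uint8), shape [rows, cols // group_size]
    """
    values = _FP4_VALUES[codes.long()]
    # E8M0 stores a biased exponent; 127 is the bias, so the scale is 2**(e - 127).
    scales = torch.pow(2.0, scale_e8m0.to(torch.float32) - 127.0)
    return values * scales.repeat_interleave(group_size, dim=-1)
```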
Lines changed: 112 additions & 0 deletions (new file; filename not shown in this view)

# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Callable, Optional

import torch
import vllm.envs as envs
from vllm.model_executor.parameter import (
    GroupQuantScaleParameter,
    ModelWeightParameter,
    PerTensorScaleParameter,
)

from auto_round_extension.vllm_ext.mxfp8_qdq_utils import dequant_mx_fp8, quant_mx_fp8
from auto_round_extension.vllm_ext.quant_impl import AutoRoundQuantImpl


class AutoRoundMXFP8LinearImpl(AutoRoundQuantImpl):
    def __init__(self, quant_scheme):
        self.quant_scheme = quant_scheme
        self.strategy = "TENSOR_GROUP"
        self.out_dtype = torch.get_default_dtype()
        self.group_size = 32

    @classmethod
    def get_min_capability(cls) -> int:
        return 80

    def process_weights_after_loading(self, layer) -> None:
        return

    def create_weights(
        self,
        layer: torch.nn.Module,
        output_partition_sizes: list[int],
        input_size_per_partition: int,
        params_dtype: torch.dtype,
        weight_loader: Callable,
        **kwargs,
    ):
        output_size_per_partition = sum(output_partition_sizes)
        layer.logical_widths = output_partition_sizes

        # WEIGHT
        weight = ModelWeightParameter(
            data=torch.empty(
                output_size_per_partition,
                input_size_per_partition,
                dtype=torch.float8_e4m3fn,
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight", weight)

        # WEIGHT SCALE
        # Per Group Weight Scale
        weight_scale = GroupQuantScaleParameter(
            data=torch.empty(
                sum(output_partition_sizes),
                input_size_per_partition // self.group_size,
                dtype=torch.uint8,  # E8M0 for MXFP8 scale
            ),
            input_dim=1,
            output_dim=0,
            weight_loader=weight_loader,
        )
        layer.register_parameter("weight_scale", weight_scale)

    def apply_weights(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Dequantize the weight
        weight = layer.weight
        weight_scale = layer.weight_scale
        dequant_weight = dequant_mx_fp8(
            weight_fp8=weight.data,
            scale_e8m0=weight_scale.data,
            block_size=self.group_size,
            target_dtype=x.dtype,
        )
        dequant_weight = dequant_weight.to(x.dtype)
        # Quantize-dequantize (q-dq) the input to emulate MXFP8 activations
        x_scale, x_quant = quant_mx_fp8(x)
        dequant_x = dequant_mx_fp8(
            weight_fp8=x_quant,
            scale_e8m0=x_scale,
            block_size=self.group_size,
            target_dtype=x.dtype,
        )
        x = dequant_x.to(x.dtype)

        out = x @ dequant_weight.t()
        return out.to(x.dtype) + (bias if bias is not None else 0)
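
Note: `apply_weights` here is a pure emulation path: the FP8 weight is dequantized, the activation is quantized and immediately dequantized, and the matmul runs in the activation dtype, so results reflect MXFP8 rounding without a native FP8 GEMM. The sketch below illustrates the per-group quantization step that `quant_mx_fp8` is expected to perform; the exponent rule follows the OCP MX convention, and the helper name and exact rounding are assumptions, not the project's implementation:

```python
import torch


def quant_mx_fp8_sketch(x: torch.Tensor, group_size: int = 32):
    """Hedged sketch of MX FP8 (E4M3) group quantization; not the project's code.

    Returns (scale_e8m0, x_fp8): each group of `group_size` values along the last
    dim shares one power-of-two scale stored as a biased E8M0 exponent.
    """
    groups = x.reshape(*x.shape[:-1], -1, group_size).to(torch.float32)
    amax = groups.abs().amax(dim=-1, keepdim=True).clamp(min=torch.finfo(torch.float32).tiny)
    # OCP MX rule: shared exponent = floor(log2(amax)) - emax_elem, with emax_elem = 8
    # for E4M3 (largest representable magnitude 448).
    exp = torch.floor(torch.log2(amax)) - 8.0
    scale = torch.pow(2.0, exp)
    x_fp8 = (groups / scale).clamp(-448.0, 448.0).to(torch.float8_e4m3fn)
    scale_e8m0 = (exp + 127.0).clamp(0, 255).squeeze(-1).to(torch.uint8)
    return scale_e8m0, x_fp8.reshape(x.shape)
```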
