From 493c384526c9ac353251c833bd002455011cbaa6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 02:44:45 -0500 Subject: [PATCH 01/15] add fp8 kv Signed-off-by: yiliu30 --- .../vllm_ext/auto_round_ext.py | 7 ++ auto_round_extension/vllm_ext/kv_cache.py | 65 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 auto_round_extension/vllm_ext/kv_cache.py diff --git a/auto_round_extension/vllm_ext/auto_round_ext.py b/auto_round_extension/vllm_ext/auto_round_ext.py index d665fd568..5d5d323cf 100644 --- a/auto_round_extension/vllm_ext/auto_round_ext.py +++ b/auto_round_extension/vllm_ext/auto_round_ext.py @@ -33,6 +33,13 @@ class AutoRoundExtensionConfig(_BaseAutoRoundConfig): def get_quant_method(self, layer: torch.nn.Module, prefix: str): # FIXME: (yi) make it compatible with `AutoRoundConfig` + from vllm.attention.layer import Attention + + if isinstance(layer, Attention): + from auto_round_extension.vllm_ext.kv_cache import AutoRoundKVCacheMethod + + return AutoRoundKVCacheMethod(self) + if isinstance(layer, FusedMoE): quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix) return quant_method diff --git a/auto_round_extension/vllm_ext/kv_cache.py b/auto_round_extension/vllm_ext/kv_cache.py new file mode 100644 index 000000000..a1630e61d --- /dev/null +++ b/auto_round_extension/vllm_ext/kv_cache.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Any, Literal, Optional, cast + +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod + + +class AutoRoundKVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from compressed-tensors + checkpoints. + """ + + def __init__(self, quant_config): + self.validate_kv_cache_scheme(quant_config) + super().__init__(quant_config) + + @staticmethod + def validate_kv_cache_scheme(kv_cache_scheme: dict[str, Any] | None): + """ + Validator for the kv cache scheme. Useful for controlling the + kv cache quantization schemes, that are being supported in vLLM + :param kv_cache_scheme: the compressed-tensors kv cache scheme + """ + return True + if kv_cache_scheme is None: + return + + type_ = kv_cache_scheme.get("type") + num_bits = kv_cache_scheme.get("num_bits") + + if type_ != "float" and num_bits != 8: + raise NotImplementedError( + "Currently supported kv cache quantization is " + "num_bits=8, type=float, however " + f"received num_bits={num_bits}, type={type_}" + ) + + strategy = kv_cache_scheme.get("strategy") + if strategy != "tensor": + raise NotImplementedError( + "Only support per-tensor scaling factor " + "for compressed-tensors KV cache. " + f"Expected strategy: tensor, found strategy: {strategy}" + ) + + is_symmetric = kv_cache_scheme.get("symmetric") + if not is_symmetric: + raise NotImplementedError( + "Only support symmetric scaling factor " + "for compressed-tensors KV cache. 
" + f"However found symmetric: {is_symmetric}" + ) From c0aff3548190458c280cacf574d1c2a9c29348d0 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 03:07:09 -0800 Subject: [PATCH 02/15] calib for kv/attn Signed-off-by: yiliu30 --- auto_round/compressors/base.py | 7 +++++-- auto_round/compressors/utils.py | 8 +++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 17470e2ae..6f74c2dc7 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1408,11 +1408,14 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: enable_imatrix = True elif self.data_type == "int" and self.sym: enable_imatrix = True - if enable_imatrix: self._quant_rtn_with_imatrix(all_to_quantized_module_names) elif self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, self.act_data_type, self.act_bits + self.act_dynamic, + self.act_data_type, + self.act_bits, + self.static_kv_dtype, + self.static_attention_dtype, ): # TODO, mixed datatype has bug hook_handles = self._register_act_max_hook(self.model) try: diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 9c6c02877..1e5683623 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -146,8 +146,14 @@ def check_skippable_keywords(key): def check_need_act_calibration( - is_act_dynamic: Union[bool, None], act_data_type: Union[str, None] = None, act_bits: Union[int, None] = 16 + is_act_dynamic: Union[bool, None], + act_data_type: Union[str, None] = None, + act_bits: Union[int, None] = 16, + static_kv_dtype: Union[str, None] = None, + static_attention_dtype: Union[str, None] = None, ) -> bool: + if static_kv_dtype is not None or static_attention_dtype is not None: + return True if act_bits is None or act_bits > 8: return False # None is dynamic From aea1c1c01904726c4987167e71a27e43337c0527 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 19:37:13 -0800 Subject: [PATCH 03/15] add ut Signed-off-by: yiliu30 --- test/test_cpu/test_mxfp_nvfp.py | 51 +++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index ce61a9435..37cde58db 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -371,5 +371,56 @@ def test_qwen_moe_quant_infer(self): shutil.rmtree(quantized_model_path, ignore_errors=True) + @parameterized.expand( + [ + # scheme, static_kv_dtype, static_attention_dtype + ("MXFP4", None, "fp8"), + ("MXFP4", "fp8", None), + ("MXFP8", None, "fp8"), + ("MXFP8", "fp8", None), + ("NVFP4", None, "fp8"), + ("NVFP4", "fp8", None), + ] + ) + def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): + model_name = self.model_name + + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + dataset=self.llm_dataloader, + static_kv_dtype=static_kv_dtype, + static_attention_dtype=static_attention_dtype, + ) + + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save( + output_dir=quantized_model_path, + format="auto_round", + ) + + attn = compressed_model.model.decoder.layers[3].self_attn + q_proj = attn.q_proj + + # weight_scale should exist for all quantized schemes + assert hasattr(q_proj, "weight_scale"), f"Missing weight_scale in q_proj for scheme={scheme}" + + # Only when static_kv_dtype / static_attention_dtype are fp8 do we expect FP8 KV scales + if static_kv_dtype == 
"fp8" or static_attention_dtype == "fp8": + assert attn.k_scale is not None and attn.v_scale is not None, ( + f"Missing k_scale/v_scale in attention for scheme={scheme}, " + f"static_kv_dtype={static_kv_dtype}, static_attention_dtype={static_attention_dtype}" + ) + + if static_attention_dtype == "fp8": + assert ( + getattr(attn, "q_scale", None) is not None + ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" + + shutil.rmtree(quantized_model_path, ignore_errors=True) + + if __name__ == "__main__": unittest.main() From e5473c34f65915eed33ca61c555b2c9a36331e65 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 03:39:12 +0000 Subject: [PATCH 04/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/test_cpu/test_mxfp_nvfp.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 37cde58db..6dc61e3f7 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -370,16 +370,15 @@ def test_qwen_moe_quant_infer(self): self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) shutil.rmtree(quantized_model_path, ignore_errors=True) - @parameterized.expand( [ # scheme, static_kv_dtype, static_attention_dtype - ("MXFP4", None, "fp8"), - ("MXFP4", "fp8", None), - ("MXFP8", None, "fp8"), - ("MXFP8", "fp8", None), - ("NVFP4", None, "fp8"), - ("NVFP4", "fp8", None), + ("MXFP4", None, "fp8"), + ("MXFP4", "fp8", None), + ("MXFP8", None, "fp8"), + ("MXFP8", "fp8", None), + ("NVFP4", None, "fp8"), + ("NVFP4", "fp8", None), ] ) def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): From 831e770b200ae139147f837198b9aad2bf63f5d5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 19:40:26 -0800 Subject: [PATCH 05/15] revert Signed-off-by: yiliu30 --- .../vllm_ext/auto_round_ext.py | 7 -- auto_round_extension/vllm_ext/kv_cache.py | 65 ------------------- 2 files changed, 72 deletions(-) delete mode 100644 auto_round_extension/vllm_ext/kv_cache.py diff --git a/auto_round_extension/vllm_ext/auto_round_ext.py b/auto_round_extension/vllm_ext/auto_round_ext.py index 5d5d323cf..d665fd568 100644 --- a/auto_round_extension/vllm_ext/auto_round_ext.py +++ b/auto_round_extension/vllm_ext/auto_round_ext.py @@ -33,13 +33,6 @@ class AutoRoundExtensionConfig(_BaseAutoRoundConfig): def get_quant_method(self, layer: torch.nn.Module, prefix: str): # FIXME: (yi) make it compatible with `AutoRoundConfig` - from vllm.attention.layer import Attention - - if isinstance(layer, Attention): - from auto_round_extension.vllm_ext.kv_cache import AutoRoundKVCacheMethod - - return AutoRoundKVCacheMethod(self) - if isinstance(layer, FusedMoE): quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix) return quant_method diff --git a/auto_round_extension/vllm_ext/kv_cache.py b/auto_round_extension/vllm_ext/kv_cache.py deleted file mode 100644 index a1630e61d..000000000 --- a/auto_round_extension/vllm_ext/kv_cache.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import TYPE_CHECKING, Any, Literal, Optional, cast - -from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod - - -class AutoRoundKVCacheMethod(BaseKVCacheMethod): - """ - Supports loading kv-cache scaling factors from compressed-tensors - checkpoints. - """ - - def __init__(self, quant_config): - self.validate_kv_cache_scheme(quant_config) - super().__init__(quant_config) - - @staticmethod - def validate_kv_cache_scheme(kv_cache_scheme: dict[str, Any] | None): - """ - Validator for the kv cache scheme. Useful for controlling the - kv cache quantization schemes, that are being supported in vLLM - :param kv_cache_scheme: the compressed-tensors kv cache scheme - """ - return True - if kv_cache_scheme is None: - return - - type_ = kv_cache_scheme.get("type") - num_bits = kv_cache_scheme.get("num_bits") - - if type_ != "float" and num_bits != 8: - raise NotImplementedError( - "Currently supported kv cache quantization is " - "num_bits=8, type=float, however " - f"received num_bits={num_bits}, type={type_}" - ) - - strategy = kv_cache_scheme.get("strategy") - if strategy != "tensor": - raise NotImplementedError( - "Only support per-tensor scaling factor " - "for compressed-tensors KV cache. " - f"Expected strategy: tensor, found strategy: {strategy}" - ) - - is_symmetric = kv_cache_scheme.get("symmetric") - if not is_symmetric: - raise NotImplementedError( - "Only support symmetric scaling factor " - "for compressed-tensors KV cache. 
" - f"However found symmetric: {is_symmetric}" - ) From 97e61b225ec3ee7fedf51aff327682729549bce8 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 19:49:52 -0800 Subject: [PATCH 06/15] update config Signed-off-by: yiliu30 --- auto_round/compressors/base.py | 2 ++ test/test_cpu/test_mxfp_nvfp.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6f74c2dc7..aa867e659 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -3139,6 +3139,8 @@ def save_quantized( "super_bits", "super_group_size", "regex_config", + "static_kv_dtype", + "static_attention_dtype", ] if isinstance(self.dataset, str): serialization_keys.append("dataset") diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 6dc61e3f7..d73f1ad53 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -405,6 +405,10 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): # weight_scale should exist for all quantized schemes assert hasattr(q_proj, "weight_scale"), f"Missing weight_scale in q_proj for scheme={scheme}" + if static_kv_dtype == "fp8": + assert ( + compressed_model.config.quantization_config["static_kv_dtype"] == "fp8" + ), f"Invalid static_kv_dtype in config for scheme={scheme}, static_kv_dtype={static_kv_dtype}" # Only when static_kv_dtype / static_attention_dtype are fp8 do we expect FP8 KV scales if static_kv_dtype == "fp8" or static_attention_dtype == "fp8": @@ -414,10 +418,12 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): ) if static_attention_dtype == "fp8": + assert ( + compressed_model.config.quantization_config["static_attention_dtype"] == "fp8" + ), f"Invalid static_attention_dtype in config for scheme={scheme}, static_attention_dtype={static_attention_dtype}" assert ( getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" - shutil.rmtree(quantized_model_path, ignore_errors=True) From 4b850bf9bc7b6fbffbb35b6a079f6c6f062e2280 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 21:36:15 -0800 Subject: [PATCH 07/15] update Signed-off-by: yiliu30 --- auto_round_extension/vllm_ext/auto_round_ext.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/auto_round_extension/vllm_ext/auto_round_ext.py b/auto_round_extension/vllm_ext/auto_round_ext.py index d665fd568..8ef85069a 100644 --- a/auto_round_extension/vllm_ext/auto_round_ext.py +++ b/auto_round_extension/vllm_ext/auto_round_ext.py @@ -33,6 +33,12 @@ class AutoRoundExtensionConfig(_BaseAutoRoundConfig): def get_quant_method(self, layer: torch.nn.Module, prefix: str): # FIXME: (yi) make it compatible with `AutoRoundConfig` + from vllm.attention.layer import Attention + + if isinstance(layer, Attention): + from auto_round_extension.vllm_ext.kv_cache import AutoRoundKVCacheMethod + + return AutoRoundKVCacheMethod(self) if isinstance(layer, FusedMoE): quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix) return quant_method From 66ffe1d86e726487052ec529f49ff3a7706c6bad Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 22:25:34 -0800 Subject: [PATCH 08/15] add ut Signed-off-by: yiliu30 --- .../vllm_ext/tests/test_fp8kv.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 auto_round_extension/vllm_ext/tests/test_fp8kv.py diff --git 
a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py new file mode 100644 index 000000000..f01cca8f0 --- /dev/null +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -0,0 +1,56 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import pytest +from vllm.platforms import current_platform + + +def cuda_capability_at_least(major, minor): + device_capability = torch.cuda.get_device_capability() + return device_capability[0] >= major or (device_capability[0] == major and device_capability[1] >= minor) + + +MODELS = ["/home/yiliu7/workspace/auto-round/examples/Qwen2.5-0.5B-Instruct-ar-MXFP4-fp8"] + + +@pytest.fixture(autouse=True) +def set_vllm_ar_env(monkeypatch): + monkeypatch.setenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") + monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") + monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") + monkeypatch.setenv("VLLM_ENABLE_STATIC_MOE", "0") + monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0") + monkeypatch.setenv("VLLM_ENABLE_AR_EXT", "1") + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + monkeypatch.setenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "1") + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + + +@pytest.mark.skipif( + not current_platform.is_cuda(), + reason="only supports CUDA backend.", +) +@pytest.mark.skipif( + not cuda_capability_at_least(10, 0), reason="FP8 KV cache only supported on CUDA with compute capability >= 10.0" +) +@pytest.mark.parametrize("model", MODELS) +def test_auto_fp8_kv(vllm_runner, model): + with vllm_runner(model, enforce_eager=True, kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: + output = llm.generate_greedy(["The capital of France is"], max_tokens=8) + assert ( + llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype + == torch.uint8 + ), f"Expected kv_cache_dtype to be torch.uint8, but got {llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype}" + assert output + print(f"output is: {output[0][1]}") \ No newline at end of file From a4b94ab06c86c22f9b02f4bec4a9a8eef59f50e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 06:26:35 +0000 Subject: [PATCH 09/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round_extension/vllm_ext/tests/test_fp8kv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index f01cca8f0..ab9471d92 100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -import torch import pytest +import torch from vllm.platforms import current_platform @@ -53,4 +53,4 @@ def test_auto_fp8_kv(vllm_runner, model): == torch.uint8 ), f"Expected kv_cache_dtype to be torch.uint8, but got {llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype}" assert output - print(f"output is: {output[0][1]}") \ No newline at end of file + print(f"output is: {output[0][1]}") From 25b12ab544bea8c63bb5e94424a332e0a84b2343 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Dec 2025 00:58:31 -0800 Subject: [PATCH 10/15] fix mxfp4 linear Signed-off-by: yiliu30 --- .../vllm_ext/linear_impl_mxfp4.py | 25 ++++++++----------- .../vllm_ext/tests/test_fp8kv.py | 4 ++- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/auto_round_extension/vllm_ext/linear_impl_mxfp4.py b/auto_round_extension/vllm_ext/linear_impl_mxfp4.py index 04d5e20f8..e544fdbbf 100644 --- a/auto_round_extension/vllm_ext/linear_impl_mxfp4.py +++ b/auto_round_extension/vllm_ext/linear_impl_mxfp4.py @@ -86,8 +86,7 @@ def create_weights( def process_weights_after_loading(self, layer) -> None: # FIXME: may dequant to bf16 - if envs.VLLM_MXFP4_PRE_UNPACK_WEIGHTS: - + if envs.VLLM_MXFP4_PRE_UNPACK_TO_FP8: weight_fp8, scale_bf16 = dequant_mxfp4_to_fp8( data_lp=layer.weight_packed, scale_e8m0=layer.weight_scale, @@ -110,20 +109,16 @@ def process_weights_after_loading(self, layer) -> None: requires_grad=False, ), ) + else: + raise NotImplementedError("Only VLLM_MXFP4_PRE_UNPACK_TO_FP8 is supported now.") def apply_weights( self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None ) -> torch.Tensor: - if not envs.VLLM_MXFP4_PRE_UNPACK_WEIGHTS: - out = run_mxfp4_emulations(x=x, weight=layer.weight_packed, weight_scale=layer.weight_scale) - if bias is not None: - out = out + bias - return out - else: - out = mxfp4_gemm_with_unpacked_weight( - x=x, - weight_fp8=layer.weight_unpacked_fp8, - weight_scale_bf16=layer.weight_scale_bf16, - bias=bias, - ) - return out + out = mxfp4_gemm_with_unpacked_weight( + x=x, + weight_fp8=layer.weight_unpacked_fp8, + weight_scale_bf16=layer.weight_scale_bf16, + bias=bias, + ) + return out diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index ab9471d92..cc3a171ad 100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -46,7 +46,9 @@ def set_vllm_ar_env(monkeypatch): ) @pytest.mark.parametrize("model", MODELS) def test_auto_fp8_kv(vllm_runner, model): - with vllm_runner(model, enforce_eager=True, kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: + with vllm_runner(model, + enforce_eager=True, + kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert ( llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype From c3784f3dcf77debb137d656e240cc67b509bc295 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Dec 2025 00:59:28 -0800 Subject: [PATCH 11/15] enable compile for fp8 kv Signed-off-by: yiliu30 --- auto_round_extension/vllm_ext/tests/test_fp8kv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index cc3a171ad..1f7b515fc 
100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -47,7 +47,7 @@ def set_vllm_ar_env(monkeypatch): @pytest.mark.parametrize("model", MODELS) def test_auto_fp8_kv(vllm_runner, model): with vllm_runner(model, - enforce_eager=True, + # enforce_eager=True, kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert ( From 955cacdf9ae424294be057660f87935c03a741f8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Dec 2025 09:01:01 +0000 Subject: [PATCH 12/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round_extension/vllm_ext/tests/test_fp8kv.py | 9 ++++++--- test/test_cpu/test_mxfp_nvfp.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index 1f7b515fc..709308f9a 100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -46,9 +46,12 @@ def set_vllm_ar_env(monkeypatch): ) @pytest.mark.parametrize("model", MODELS) def test_auto_fp8_kv(vllm_runner, model): - with vllm_runner(model, - # enforce_eager=True, - kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: + with vllm_runner( + model, + # enforce_eager=True, + kv_cache_dtype="fp8", + gpu_memory_utilization=0.1, + ) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert ( llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 5b3449bac..ba9d3a1a8 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -410,7 +410,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): attn = compressed_model.model.decoder.layers[0].self_attn q_proj = attn.q_proj - + # weight_scale should exist for all quantized schemes assert hasattr(q_proj, "weight_scale"), f"Missing weight_scale in q_proj for scheme={scheme}" if static_kv_dtype == "fp8": From d0e048f49a22f96226ee678326fee7d01436c64c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Dec 2025 01:02:29 -0800 Subject: [PATCH 13/15] add kv Signed-off-by: yiliu30 --- auto_round_extension/vllm_ext/kv_cache.py | 37 +++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 auto_round_extension/vllm_ext/kv_cache.py diff --git a/auto_round_extension/vllm_ext/kv_cache.py b/auto_round_extension/vllm_ext/kv_cache.py new file mode 100644 index 000000000..d5b2cc4c9 --- /dev/null +++ b/auto_round_extension/vllm_ext/kv_cache.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + +from typing import TYPE_CHECKING, Any, Literal, Optional, cast +import torch +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.logger import init_logger + +logger = init_logger(__name__) + +class AutoRoundKVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from compressed-tensors + checkpoints. + """ + + def __init__(self, quant_config): + self.validate_kv_cache_scheme(quant_config) + super().__init__(quant_config) + + @staticmethod + def validate_kv_cache_scheme(quant_config): + # FIXME: parse from quant_config + return True \ No newline at end of file From e641fc182cf21a19c55ea0715bdbfdf9a9a01039 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Dec 2025 01:03:13 -0800 Subject: [PATCH 14/15] fix Signed-off-by: yiliu30 --- auto_round_extension/vllm_ext/tests/test_fp8kv.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index 1f7b515fc..709308f9a 100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -46,9 +46,12 @@ def set_vllm_ar_env(monkeypatch): ) @pytest.mark.parametrize("model", MODELS) def test_auto_fp8_kv(vllm_runner, model): - with vllm_runner(model, - # enforce_eager=True, - kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: + with vllm_runner( + model, + # enforce_eager=True, + kv_cache_dtype="fp8", + gpu_memory_utilization=0.1, + ) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert ( llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype From 1fe26d98f0195428cad9d594c9b55cdd6e5760d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Dec 2025 09:03:51 +0000 Subject: [PATCH 15/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round_extension/vllm_ext/kv_cache.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/auto_round_extension/vllm_ext/kv_cache.py b/auto_round_extension/vllm_ext/kv_cache.py index d5b2cc4c9..ec2b7e179 100644 --- a/auto_round_extension/vllm_ext/kv_cache.py +++ b/auto_round_extension/vllm_ext/kv_cache.py @@ -13,14 +13,15 @@ # limitations under the License. - from typing import TYPE_CHECKING, Any, Literal, Optional, cast + import torch -from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod logger = init_logger(__name__) + class AutoRoundKVCacheMethod(BaseKVCacheMethod): """ Supports loading kv-cache scaling factors from compressed-tensors @@ -30,8 +31,8 @@ class AutoRoundKVCacheMethod(BaseKVCacheMethod): def __init__(self, quant_config): self.validate_kv_cache_scheme(quant_config) super().__init__(quant_config) - + @staticmethod def validate_kv_cache_scheme(quant_config): # FIXME: parse from quant_config - return True \ No newline at end of file + return True
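Taken together, these patches thread the new static_kv_dtype / static_attention_dtype options through AutoRound calibration (PATCH 02), checkpoint serialization (PATCH 06), and vLLM loading via AutoRoundKVCacheMethod (PATCH 07 and PATCH 13). The sketch below shows one way the resulting workflow could be exercised end to end; it is only a sketch and not part of the series: the model id, output directory, and prompt are illustrative placeholders, while the AutoRound keyword arguments, the kv_cache_dtype="fp8" engine option, and the VLLM_ENABLE_AR_EXT / VLLM_ATTENTION_BACKEND environment variables are taken from the tests added above.

import os

from auto_round import AutoRound

# 1. Quantize weights (MXFP4 here) and calibrate static FP8 scales for the KV cache.
#    "facebook/opt-125m" and the output directory are placeholders for illustration.
autoround = AutoRound(
    "facebook/opt-125m",
    scheme="MXFP4",
    iters=0,                # RTN path; static_kv_dtype still triggers activation calibration
    static_kv_dtype="fp8",  # makes check_need_act_calibration() return True (PATCH 02)
)
autoround.quantize_and_save(output_dir="opt-125m-mxfp4-fp8kv", format="auto_round")

# 2. Serve the checkpoint with vLLM and an FP8 KV cache. The environment variables
#    mirror the fixture in tests/test_fp8kv.py; note that the GPU test above skips
#    unless running on CUDA with compute capability >= 10.0.
os.environ["VLLM_ENABLE_AR_EXT"] = "1"
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM, SamplingParams

llm = LLM(model="opt-125m-mxfp4-fp8kv", kv_cache_dtype="fp8", enforce_eager=True)
print(llm.generate(["The capital of France is"], SamplingParams(max_tokens=8)))

As in test_auto_fp8_kv, the model runner's kv_cache_dtype is expected to resolve to torch.uint8 once the FP8 KV-cache path is active, and the per-layer k_scale / v_scale serialized by AutoRound are picked up through AutoRoundKVCacheMethod.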