From 493c384526c9ac353251c833bd002455011cbaa6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 02:44:45 -0500 Subject: [PATCH 01/15] add fp8 kv Signed-off-by: yiliu30 --- .../vllm_ext/auto_round_ext.py | 7 ++ auto_round_extension/vllm_ext/kv_cache.py | 65 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 auto_round_extension/vllm_ext/kv_cache.py diff --git a/auto_round_extension/vllm_ext/auto_round_ext.py b/auto_round_extension/vllm_ext/auto_round_ext.py index d665fd568..5d5d323cf 100644 --- a/auto_round_extension/vllm_ext/auto_round_ext.py +++ b/auto_round_extension/vllm_ext/auto_round_ext.py @@ -33,6 +33,13 @@ class AutoRoundExtensionConfig(_BaseAutoRoundConfig): def get_quant_method(self, layer: torch.nn.Module, prefix: str): # FIXME: (yi) make it compatible with `AutoRoundConfig` + from vllm.attention.layer import Attention + + if isinstance(layer, Attention): + from auto_round_extension.vllm_ext.kv_cache import AutoRoundKVCacheMethod + + return AutoRoundKVCacheMethod(self) + if isinstance(layer, FusedMoE): quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix) return quant_method diff --git a/auto_round_extension/vllm_ext/kv_cache.py b/auto_round_extension/vllm_ext/kv_cache.py new file mode 100644 index 000000000..a1630e61d --- /dev/null +++ b/auto_round_extension/vllm_ext/kv_cache.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Any, Literal, Optional, cast + +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod + + +class AutoRoundKVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from compressed-tensors + checkpoints. + """ + + def __init__(self, quant_config): + self.validate_kv_cache_scheme(quant_config) + super().__init__(quant_config) + + @staticmethod + def validate_kv_cache_scheme(kv_cache_scheme: dict[str, Any] | None): + """ + Validator for the kv cache scheme. Useful for controlling the + kv cache quantization schemes, that are being supported in vLLM + :param kv_cache_scheme: the compressed-tensors kv cache scheme + """ + return True + if kv_cache_scheme is None: + return + + type_ = kv_cache_scheme.get("type") + num_bits = kv_cache_scheme.get("num_bits") + + if type_ != "float" and num_bits != 8: + raise NotImplementedError( + "Currently supported kv cache quantization is " + "num_bits=8, type=float, however " + f"received num_bits={num_bits}, type={type_}" + ) + + strategy = kv_cache_scheme.get("strategy") + if strategy != "tensor": + raise NotImplementedError( + "Only support per-tensor scaling factor " + "for compressed-tensors KV cache. " + f"Expected strategy: tensor, found strategy: {strategy}" + ) + + is_symmetric = kv_cache_scheme.get("symmetric") + if not is_symmetric: + raise NotImplementedError( + "Only support symmetric scaling factor " + "for compressed-tensors KV cache. 
" + f"However found symmetric: {is_symmetric}" + ) From c0aff3548190458c280cacf574d1c2a9c29348d0 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 03:07:09 -0800 Subject: [PATCH 02/15] calib for kv/attn Signed-off-by: yiliu30 --- auto_round/compressors/base.py | 7 +++++-- auto_round/compressors/utils.py | 8 +++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 17470e2ae..6f74c2dc7 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1408,11 +1408,14 @@ def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]: enable_imatrix = True elif self.data_type == "int" and self.sym: enable_imatrix = True - if enable_imatrix: self._quant_rtn_with_imatrix(all_to_quantized_module_names) elif self.act_bits <= 8 and check_need_act_calibration( - self.act_dynamic, self.act_data_type, self.act_bits + self.act_dynamic, + self.act_data_type, + self.act_bits, + self.static_kv_dtype, + self.static_attention_dtype, ): # TODO, mixed datatype has bug hook_handles = self._register_act_max_hook(self.model) try: diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 9c6c02877..1e5683623 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -146,8 +146,14 @@ def check_skippable_keywords(key): def check_need_act_calibration( - is_act_dynamic: Union[bool, None], act_data_type: Union[str, None] = None, act_bits: Union[int, None] = 16 + is_act_dynamic: Union[bool, None], + act_data_type: Union[str, None] = None, + act_bits: Union[int, None] = 16, + static_kv_dtype: Union[str, None] = None, + static_attention_dtype: Union[str, None] = None, ) -> bool: + if static_kv_dtype is not None or static_attention_dtype is not None: + return True if act_bits is None or act_bits > 8: return False # None is dynamic From aea1c1c01904726c4987167e71a27e43337c0527 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 19:37:13 -0800 Subject: [PATCH 03/15] add ut Signed-off-by: yiliu30 --- test/test_cpu/test_mxfp_nvfp.py | 51 +++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index ce61a9435..37cde58db 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -371,5 +371,56 @@ def test_qwen_moe_quant_infer(self): shutil.rmtree(quantized_model_path, ignore_errors=True) + @parameterized.expand( + [ + # scheme, static_kv_dtype, static_attention_dtype + ("MXFP4", None, "fp8"), + ("MXFP4", "fp8", None), + ("MXFP8", None, "fp8"), + ("MXFP8", "fp8", None), + ("NVFP4", None, "fp8"), + ("NVFP4", "fp8", None), + ] + ) + def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): + model_name = self.model_name + + autoround = AutoRound( + model_name, + scheme=scheme, + iters=0, + seqlen=2, + dataset=self.llm_dataloader, + static_kv_dtype=static_kv_dtype, + static_attention_dtype=static_attention_dtype, + ) + + quantized_model_path = self.save_dir + compressed_model, _ = autoround.quantize_and_save( + output_dir=quantized_model_path, + format="auto_round", + ) + + attn = compressed_model.model.decoder.layers[3].self_attn + q_proj = attn.q_proj + + # weight_scale should exist for all quantized schemes + assert hasattr(q_proj, "weight_scale"), f"Missing weight_scale in q_proj for scheme={scheme}" + + # Only when static_kv_dtype / static_attention_dtype are fp8 do we expect FP8 KV scales + if static_kv_dtype == 
"fp8" or static_attention_dtype == "fp8": + assert attn.k_scale is not None and attn.v_scale is not None, ( + f"Missing k_scale/v_scale in attention for scheme={scheme}, " + f"static_kv_dtype={static_kv_dtype}, static_attention_dtype={static_attention_dtype}" + ) + + if static_attention_dtype == "fp8": + assert ( + getattr(attn, "q_scale", None) is not None + ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" + + shutil.rmtree(quantized_model_path, ignore_errors=True) + + if __name__ == "__main__": unittest.main() From e5473c34f65915eed33ca61c555b2c9a36331e65 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 03:39:12 +0000 Subject: [PATCH 04/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/test_cpu/test_mxfp_nvfp.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 37cde58db..6dc61e3f7 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -370,16 +370,15 @@ def test_qwen_moe_quant_infer(self): self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) shutil.rmtree(quantized_model_path, ignore_errors=True) - @parameterized.expand( [ # scheme, static_kv_dtype, static_attention_dtype - ("MXFP4", None, "fp8"), - ("MXFP4", "fp8", None), - ("MXFP8", None, "fp8"), - ("MXFP8", "fp8", None), - ("NVFP4", None, "fp8"), - ("NVFP4", "fp8", None), + ("MXFP4", None, "fp8"), + ("MXFP4", "fp8", None), + ("MXFP8", None, "fp8"), + ("MXFP8", "fp8", None), + ("NVFP4", None, "fp8"), + ("NVFP4", "fp8", None), ] ) def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): From 831e770b200ae139147f837198b9aad2bf63f5d5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 19:40:26 -0800 Subject: [PATCH 05/15] revert Signed-off-by: yiliu30 --- .../vllm_ext/auto_round_ext.py | 7 -- auto_round_extension/vllm_ext/kv_cache.py | 65 ------------------- 2 files changed, 72 deletions(-) delete mode 100644 auto_round_extension/vllm_ext/kv_cache.py diff --git a/auto_round_extension/vllm_ext/auto_round_ext.py b/auto_round_extension/vllm_ext/auto_round_ext.py index 5d5d323cf..d665fd568 100644 --- a/auto_round_extension/vllm_ext/auto_round_ext.py +++ b/auto_round_extension/vllm_ext/auto_round_ext.py @@ -33,13 +33,6 @@ class AutoRoundExtensionConfig(_BaseAutoRoundConfig): def get_quant_method(self, layer: torch.nn.Module, prefix: str): # FIXME: (yi) make it compatible with `AutoRoundConfig` - from vllm.attention.layer import Attention - - if isinstance(layer, Attention): - from auto_round_extension.vllm_ext.kv_cache import AutoRoundKVCacheMethod - - return AutoRoundKVCacheMethod(self) - if isinstance(layer, FusedMoE): quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix) return quant_method diff --git a/auto_round_extension/vllm_ext/kv_cache.py b/auto_round_extension/vllm_ext/kv_cache.py deleted file mode 100644 index a1630e61d..000000000 --- a/auto_round_extension/vllm_ext/kv_cache.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import TYPE_CHECKING, Any, Literal, Optional, cast - -from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod - - -class AutoRoundKVCacheMethod(BaseKVCacheMethod): - """ - Supports loading kv-cache scaling factors from compressed-tensors - checkpoints. - """ - - def __init__(self, quant_config): - self.validate_kv_cache_scheme(quant_config) - super().__init__(quant_config) - - @staticmethod - def validate_kv_cache_scheme(kv_cache_scheme: dict[str, Any] | None): - """ - Validator for the kv cache scheme. Useful for controlling the - kv cache quantization schemes, that are being supported in vLLM - :param kv_cache_scheme: the compressed-tensors kv cache scheme - """ - return True - if kv_cache_scheme is None: - return - - type_ = kv_cache_scheme.get("type") - num_bits = kv_cache_scheme.get("num_bits") - - if type_ != "float" and num_bits != 8: - raise NotImplementedError( - "Currently supported kv cache quantization is " - "num_bits=8, type=float, however " - f"received num_bits={num_bits}, type={type_}" - ) - - strategy = kv_cache_scheme.get("strategy") - if strategy != "tensor": - raise NotImplementedError( - "Only support per-tensor scaling factor " - "for compressed-tensors KV cache. " - f"Expected strategy: tensor, found strategy: {strategy}" - ) - - is_symmetric = kv_cache_scheme.get("symmetric") - if not is_symmetric: - raise NotImplementedError( - "Only support symmetric scaling factor " - "for compressed-tensors KV cache. 
" - f"However found symmetric: {is_symmetric}" - ) From 97e61b225ec3ee7fedf51aff327682729549bce8 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 19:49:52 -0800 Subject: [PATCH 06/15] update config Signed-off-by: yiliu30 --- auto_round/compressors/base.py | 2 ++ test/test_cpu/test_mxfp_nvfp.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 6f74c2dc7..aa867e659 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -3139,6 +3139,8 @@ def save_quantized( "super_bits", "super_group_size", "regex_config", + "static_kv_dtype", + "static_attention_dtype", ] if isinstance(self.dataset, str): serialization_keys.append("dataset") diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 6dc61e3f7..d73f1ad53 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -405,6 +405,10 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): # weight_scale should exist for all quantized schemes assert hasattr(q_proj, "weight_scale"), f"Missing weight_scale in q_proj for scheme={scheme}" + if static_kv_dtype == "fp8": + assert ( + compressed_model.config.quantization_config["static_kv_dtype"] == "fp8" + ), f"Invalid static_kv_dtype in config for scheme={scheme}, static_kv_dtype={static_kv_dtype}" # Only when static_kv_dtype / static_attention_dtype are fp8 do we expect FP8 KV scales if static_kv_dtype == "fp8" or static_attention_dtype == "fp8": @@ -414,10 +418,12 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): ) if static_attention_dtype == "fp8": + assert ( + compressed_model.config.quantization_config["static_attention_dtype"] == "fp8" + ), f"Invalid static_attention_dtype in config for scheme={scheme}, static_attention_dtype={static_attention_dtype}" assert ( getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" - shutil.rmtree(quantized_model_path, ignore_errors=True) From 4b850bf9bc7b6fbffbb35b6a079f6c6f062e2280 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 21:36:15 -0800 Subject: [PATCH 07/15] update Signed-off-by: yiliu30 --- auto_round_extension/vllm_ext/auto_round_ext.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/auto_round_extension/vllm_ext/auto_round_ext.py b/auto_round_extension/vllm_ext/auto_round_ext.py index d665fd568..8ef85069a 100644 --- a/auto_round_extension/vllm_ext/auto_round_ext.py +++ b/auto_round_extension/vllm_ext/auto_round_ext.py @@ -33,6 +33,12 @@ class AutoRoundExtensionConfig(_BaseAutoRoundConfig): def get_quant_method(self, layer: torch.nn.Module, prefix: str): # FIXME: (yi) make it compatible with `AutoRoundConfig` + from vllm.attention.layer import Attention + + if isinstance(layer, Attention): + from auto_round_extension.vllm_ext.kv_cache import AutoRoundKVCacheMethod + + return AutoRoundKVCacheMethod(self) if isinstance(layer, FusedMoE): quant_method = AutoRoundMoEMethod.get_moe_method(self, layer, prefix) return quant_method From 66ffe1d86e726487052ec529f49ff3a7706c6bad Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 2 Dec 2025 22:25:34 -0800 Subject: [PATCH 08/15] add ut Signed-off-by: yiliu30 --- .../vllm_ext/tests/test_fp8kv.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 auto_round_extension/vllm_ext/tests/test_fp8kv.py diff --git 
a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py new file mode 100644 index 000000000..f01cca8f0 --- /dev/null +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -0,0 +1,56 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +import pytest +from vllm.platforms import current_platform + + +def cuda_capability_at_least(major, minor): + device_capability = torch.cuda.get_device_capability() + return device_capability[0] >= major or (device_capability[0] == major and device_capability[1] >= minor) + + +MODELS = ["/home/yiliu7/workspace/auto-round/examples/Qwen2.5-0.5B-Instruct-ar-MXFP4-fp8"] + + +@pytest.fixture(autouse=True) +def set_vllm_ar_env(monkeypatch): + monkeypatch.setenv("VLLM_AR_MXFP4_MODULAR_MOE", "1") + monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_TO_FP8", "1") + monkeypatch.setenv("VLLM_MXFP4_PRE_UNPACK_WEIGHTS", "0") + monkeypatch.setenv("VLLM_ENABLE_STATIC_MOE", "0") + monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0") + monkeypatch.setenv("VLLM_ENABLE_AR_EXT", "1") + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + monkeypatch.setenv("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", "1") + monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") + + +@pytest.mark.skipif( + not current_platform.is_cuda(), + reason="only supports CUDA backend.", +) +@pytest.mark.skipif( + not cuda_capability_at_least(10, 0), reason="FP8 KV cache only supported on CUDA with compute capability >= 10.0" +) +@pytest.mark.parametrize("model", MODELS) +def test_auto_fp8_kv(vllm_runner, model): + with vllm_runner(model, enforce_eager=True, kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: + output = llm.generate_greedy(["The capital of France is"], max_tokens=8) + assert ( + llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype + == torch.uint8 + ), f"Expected kv_cache_dtype to be torch.uint8, but got {llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype}" + assert output + print(f"output is: {output[0][1]}") \ No newline at end of file From a4b94ab06c86c22f9b02f4bec4a9a8eef59f50e1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 06:26:35 +0000 Subject: [PATCH 09/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round_extension/vllm_ext/tests/test_fp8kv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index f01cca8f0..ab9471d92 100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -import torch import pytest +import torch from vllm.platforms import current_platform @@ -53,4 +53,4 @@ def test_auto_fp8_kv(vllm_runner, model): == torch.uint8 ), f"Expected kv_cache_dtype to be torch.uint8, but got {llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype}" assert output - print(f"output is: {output[0][1]}") \ No newline at end of file + print(f"output is: {output[0][1]}") From 25b12ab544bea8c63bb5e94424a332e0a84b2343 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Dec 2025 00:58:31 -0800 Subject: [PATCH 10/15] fix mxfp4 linear Signed-off-by: yiliu30 --- .../vllm_ext/linear_impl_mxfp4.py | 25 ++++++++----------- .../vllm_ext/tests/test_fp8kv.py | 4 ++- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/auto_round_extension/vllm_ext/linear_impl_mxfp4.py b/auto_round_extension/vllm_ext/linear_impl_mxfp4.py index 04d5e20f8..e544fdbbf 100644 --- a/auto_round_extension/vllm_ext/linear_impl_mxfp4.py +++ b/auto_round_extension/vllm_ext/linear_impl_mxfp4.py @@ -86,8 +86,7 @@ def create_weights( def process_weights_after_loading(self, layer) -> None: # FIXME: may dequant to bf16 - if envs.VLLM_MXFP4_PRE_UNPACK_WEIGHTS: - + if envs.VLLM_MXFP4_PRE_UNPACK_TO_FP8: weight_fp8, scale_bf16 = dequant_mxfp4_to_fp8( data_lp=layer.weight_packed, scale_e8m0=layer.weight_scale, @@ -110,20 +109,16 @@ def process_weights_after_loading(self, layer) -> None: requires_grad=False, ), ) + else: + raise NotImplementedError("Only VLLM_MXFP4_PRE_UNPACK_TO_FP8 is supported now.") def apply_weights( self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None ) -> torch.Tensor: - if not envs.VLLM_MXFP4_PRE_UNPACK_WEIGHTS: - out = run_mxfp4_emulations(x=x, weight=layer.weight_packed, weight_scale=layer.weight_scale) - if bias is not None: - out = out + bias - return out - else: - out = mxfp4_gemm_with_unpacked_weight( - x=x, - weight_fp8=layer.weight_unpacked_fp8, - weight_scale_bf16=layer.weight_scale_bf16, - bias=bias, - ) - return out + out = mxfp4_gemm_with_unpacked_weight( + x=x, + weight_fp8=layer.weight_unpacked_fp8, + weight_scale_bf16=layer.weight_scale_bf16, + bias=bias, + ) + return out diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index ab9471d92..cc3a171ad 100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -46,7 +46,9 @@ def set_vllm_ar_env(monkeypatch): ) @pytest.mark.parametrize("model", MODELS) def test_auto_fp8_kv(vllm_runner, model): - with vllm_runner(model, enforce_eager=True, kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: + with vllm_runner(model, + enforce_eager=True, + kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert ( llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype From c3784f3dcf77debb137d656e240cc67b509bc295 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Dec 2025 00:59:28 -0800 Subject: [PATCH 11/15] enable compile for fp8 kv Signed-off-by: yiliu30 --- auto_round_extension/vllm_ext/tests/test_fp8kv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index cc3a171ad..1f7b515fc 
100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -47,7 +47,7 @@ def set_vllm_ar_env(monkeypatch): @pytest.mark.parametrize("model", MODELS) def test_auto_fp8_kv(vllm_runner, model): with vllm_runner(model, - enforce_eager=True, + # enforce_eager=True, kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert ( From 955cacdf9ae424294be057660f87935c03a741f8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Dec 2025 09:01:01 +0000 Subject: [PATCH 12/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round_extension/vllm_ext/tests/test_fp8kv.py | 9 ++++++--- test/test_cpu/test_mxfp_nvfp.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index 1f7b515fc..709308f9a 100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -46,9 +46,12 @@ def set_vllm_ar_env(monkeypatch): ) @pytest.mark.parametrize("model", MODELS) def test_auto_fp8_kv(vllm_runner, model): - with vllm_runner(model, - # enforce_eager=True, - kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: + with vllm_runner( + model, + # enforce_eager=True, + kv_cache_dtype="fp8", + gpu_memory_utilization=0.1, + ) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert ( llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 5b3449bac..ba9d3a1a8 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -410,7 +410,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): attn = compressed_model.model.decoder.layers[0].self_attn q_proj = attn.q_proj - + # weight_scale should exist for all quantized schemes assert hasattr(q_proj, "weight_scale"), f"Missing weight_scale in q_proj for scheme={scheme}" if static_kv_dtype == "fp8": From d0e048f49a22f96226ee678326fee7d01436c64c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Dec 2025 01:02:29 -0800 Subject: [PATCH 13/15] add kv Signed-off-by: yiliu30 --- auto_round_extension/vllm_ext/kv_cache.py | 37 +++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 auto_round_extension/vllm_ext/kv_cache.py diff --git a/auto_round_extension/vllm_ext/kv_cache.py b/auto_round_extension/vllm_ext/kv_cache.py new file mode 100644 index 000000000..d5b2cc4c9 --- /dev/null +++ b/auto_round_extension/vllm_ext/kv_cache.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + +from typing import TYPE_CHECKING, Any, Literal, Optional, cast +import torch +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.logger import init_logger + +logger = init_logger(__name__) + +class AutoRoundKVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from compressed-tensors + checkpoints. + """ + + def __init__(self, quant_config): + self.validate_kv_cache_scheme(quant_config) + super().__init__(quant_config) + + @staticmethod + def validate_kv_cache_scheme(quant_config): + # FIXME: parse from quant_config + return True \ No newline at end of file From e641fc182cf21a19c55ea0715bdbfdf9a9a01039 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Dec 2025 01:03:13 -0800 Subject: [PATCH 14/15] fix Signed-off-by: yiliu30 --- auto_round_extension/vllm_ext/tests/test_fp8kv.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/auto_round_extension/vllm_ext/tests/test_fp8kv.py b/auto_round_extension/vllm_ext/tests/test_fp8kv.py index 1f7b515fc..709308f9a 100644 --- a/auto_round_extension/vllm_ext/tests/test_fp8kv.py +++ b/auto_round_extension/vllm_ext/tests/test_fp8kv.py @@ -46,9 +46,12 @@ def set_vllm_ar_env(monkeypatch): ) @pytest.mark.parametrize("model", MODELS) def test_auto_fp8_kv(vllm_runner, model): - with vllm_runner(model, - # enforce_eager=True, - kv_cache_dtype="fp8", gpu_memory_utilization=0.1) as llm: + with vllm_runner( + model, + # enforce_eager=True, + kv_cache_dtype="fp8", + gpu_memory_utilization=0.1, + ) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert ( llm.llm.llm_engine.engine_core.engine_core.model_executor.driver_worker.worker.model_runner.kv_cache_dtype From 1fe26d98f0195428cad9d594c9b55cdd6e5760d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Dec 2025 09:03:51 +0000 Subject: [PATCH 15/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round_extension/vllm_ext/kv_cache.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/auto_round_extension/vllm_ext/kv_cache.py b/auto_round_extension/vllm_ext/kv_cache.py index d5b2cc4c9..ec2b7e179 100644 --- a/auto_round_extension/vllm_ext/kv_cache.py +++ b/auto_round_extension/vllm_ext/kv_cache.py @@ -13,14 +13,15 @@ # limitations under the License. - from typing import TYPE_CHECKING, Any, Literal, Optional, cast + import torch -from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod logger = init_logger(__name__) + class AutoRoundKVCacheMethod(BaseKVCacheMethod): """ Supports loading kv-cache scaling factors from compressed-tensors @@ -30,8 +31,8 @@ class AutoRoundKVCacheMethod(BaseKVCacheMethod): def __init__(self, quant_config): self.validate_kv_cache_scheme(quant_config) super().__init__(quant_config) - + @staticmethod def validate_kv_cache_scheme(quant_config): # FIXME: parse from quant_config - return True \ No newline at end of file + return True
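Taken together, these patches thread the new static_kv_dtype / static_attention_dtype options through AutoRound calibration (PATCH 02), checkpoint serialization (PATCH 06), and vLLM loading via AutoRoundKVCacheMethod (PATCH 07 and PATCH 13). The sketch below shows one way the resulting workflow could be exercised end to end; it is only a sketch and not part of the series: the model id, output directory, and prompt are illustrative placeholders, while the AutoRound keyword arguments, the kv_cache_dtype="fp8" engine option, and the VLLM_ENABLE_AR_EXT / VLLM_ATTENTION_BACKEND environment variables are taken from the tests added above.

import os

from auto_round import AutoRound

# 1. Quantize weights (MXFP4 here) and calibrate static FP8 scales for the KV cache.
#    "facebook/opt-125m" and the output directory are placeholders for illustration.
autoround = AutoRound(
    "facebook/opt-125m",
    scheme="MXFP4",
    iters=0,                # RTN path; static_kv_dtype still triggers activation calibration
    static_kv_dtype="fp8",  # makes check_need_act_calibration() return True (PATCH 02)
)
autoround.quantize_and_save(output_dir="opt-125m-mxfp4-fp8kv", format="auto_round")

# 2. Serve the checkpoint with vLLM and an FP8 KV cache. The environment variables
#    mirror the fixture in tests/test_fp8kv.py; note that the GPU test above skips
#    unless running on CUDA with compute capability >= 10.0.
os.environ["VLLM_ENABLE_AR_EXT"] = "1"
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM, SamplingParams

llm = LLM(model="opt-125m-mxfp4-fp8kv", kv_cache_dtype="fp8", enforce_eager=True)
print(llm.generate(["The capital of France is"], SamplingParams(max_tokens=8)))

As in test_auto_fp8_kv, the model runner's kv_cache_dtype is expected to resolve to torch.uint8 once the FP8 KV-cache path is active, and the per-layer k_scale / v_scale serialized by AutoRound are picked up through AutoRoundKVCacheMethod.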