Revert "[Fix]Load kv-cache dtype from hf_quant_config.json automatically" (vllm-project#30653)

robertgshaw2-redhat · web-flow · commit e2ed238885be · 2025-12-14T19:33:41.000-05:00
diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
@@ -194,33 +194,12 @@ def get_kv_cache_torch_dtype(
     return torch_dtype
 
 
-def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None:
-    quant_method = quant_cfg.get("quant_method", "")
-    if quant_method.startswith("modelopt"):
-        quantization_inner = quant_cfg.get("quantization", quant_cfg)
-        # Check if quant config is specified and use kv cache quant algo
-        kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get(
-            "kv_cache_quant_algo"
-        )
-        if isinstance(kv_algo, str):
-            return STR_DTYPE_TO_TORCH_DTYPE[kv_algo.lower()]
-    return None
-
-
 def kv_cache_dtype_str_to_dtype(
     kv_cache_dtype: str, model_config: ModelConfig
 ) -> torch.dtype:
-    # Model config may not be specified for unit tests, default to float16
-    dtype = model_config.dtype if model_config else torch.half
     if kv_cache_dtype == "auto":
-        hf_cfg = getattr(model_config, "hf_config", None)
-        if hf_cfg is not None:
-            quant_cfg = getattr(hf_cfg, "quantization_config", None)
-            if quant_cfg is not None:
-                kv_algo_dtype = get_kv_cache_quant_algo_dtype(quant_cfg)
-                return kv_algo_dtype if kv_algo_dtype is not None else dtype
-        return dtype
-
+        # Model config may not be specified for unit tests, default to float16
+        return model_config.dtype if model_config else torch.half
     return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]