Skip to content

Commit de4495b

Browse files
committed
Add MPS GGUF dequantization support
Add Metal kernel path for GGUF quantized models on MPS (Apple Metal). Implements dequant+matmul for Q4_0, Q8_0, and Q4_K types via the dequant_gguf kernel package, with a numpy-based fallback using the gguf Python library. Changes: - gguf.py: Add MPS branch in _fused_mul_mat_gguf and _apply_gguf_embedding to route through gguf_dequant_on_mps instead of CUDA ops - gguf.py: Fix get_supported_act_dtypes and get_min_capability for MPS - mps_dequant.py: Add GGUF section with Metal kernel import, numpy fallback, and gguf_dequant_on_mps entry point Co-developed-by: Claude Code v2.1.58 (claude-opus-4-6) Signed-off-by: Rob Taylor <rob.taylor@chipflow.io>
1 parent a0457ed commit de4495b

File tree

2 files changed

+107
-10
lines changed

2 files changed

+107
-10
lines changed

vllm/model_executor/layers/quantization/gguf.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,17 @@ def get_name(self) -> QuantizationMethods:
6262
def get_supported_act_dtypes(self) -> list[torch.dtype]:
    """Return the activation dtypes usable with the GGUF kernels.

    bfloat16 is excluded on MPS (Metal kernels work in fp16/fp32) and on
    Blackwell-class devices, where it has known precision issues.
    """
    # GGUF dequantization kernels use half precision (fp16) internally.
    without_bf16 = [torch.half, torch.float32]
    if current_platform.is_mps():
        return without_bf16
    if current_platform.has_device_capability(100):
        logger.warning_once("GGUF has precision issues with bfloat16 on Blackwell.")
        return without_bf16
    return [torch.half, torch.bfloat16, torch.float32]
6971

7072
@classmethod
def get_min_capability(cls) -> int:
    """Minimum CUDA compute capability required (sm_60).

    MPS has no CUDA compute capability, so -1 signals "not applicable".
    """
    return -1 if current_platform.is_mps() else 60
7377

7478
@classmethod
@@ -188,17 +192,34 @@ def is_layer_skipped_gguf(
188192
def _fused_mul_mat_gguf(
189193
x: torch.Tensor, qweight: torch.Tensor, qweight_type: int
190194
) -> torch.Tensor:
191-
if qweight_type in IMATRIX_QUANT_TYPES:
192-
mmvq_safe = 8 if qweight.shape[0] > 5120 else 16
193-
else:
194-
mmvq_safe = 2 if qweight.shape[0] > 5120 else 6
195195
# HACK: when doing chunked prefill we don't generate output tokens
196196
# so input to logits generator is empty which causes invalid parameter
197197
if x.shape[0] == 0:
198198
return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device)
199199
# there is no need to call any kernel for fp16/bf16
200200
if qweight_type in UNQUANTIZED_TYPES:
201201
return x @ qweight.T
202+
203+
# MPS path: dequantize then matmul (no fused CUDA kernels available)
204+
if current_platform.is_mps():
205+
if qweight_type in DEQUANT_TYPES:
206+
from vllm.model_executor.layers.quantization.utils.mps_dequant import (
207+
gguf_dequant_on_mps,
208+
)
209+
210+
block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
211+
shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
212+
weight = gguf_dequant_on_mps(qweight, qweight_type, *shape, x.dtype)
213+
return x @ weight.T
214+
qweight_type = WeightType(qweight_type)
215+
raise NotImplementedError(
216+
f"Unsupported GGUF quantization type on MPS: {qweight_type}"
217+
)
218+
219+
if qweight_type in IMATRIX_QUANT_TYPES:
220+
mmvq_safe = 8 if qweight.shape[0] > 5120 else 16
221+
else:
222+
mmvq_safe = 2 if qweight.shape[0] > 5120 else 6
202223
# enable MMVQ in contiguous batching with batch_size=1
203224
if x.shape[0] <= mmvq_safe and qweight_type in MMVQ_QUANT_TYPES:
204225
y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
@@ -385,9 +406,18 @@ def _apply_gguf_embedding(
385406
x_flat = x.flatten()
386407
assert hidden_size == qweight.shape[1] // type_size * block_size
387408
quant = torch.index_select(qweight, dim=0, index=x_flat)
388-
dequant = ops.ggml_dequantize(
389-
quant, qweight_type, hidden_size, x_flat.shape[0], dtype
390-
)
409+
if current_platform.is_mps():
410+
from vllm.model_executor.layers.quantization.utils.mps_dequant import (
411+
gguf_dequant_on_mps,
412+
)
413+
414+
dequant = gguf_dequant_on_mps(
415+
quant, qweight_type, x_flat.shape[0], hidden_size, dtype
416+
)
417+
else:
418+
dequant = ops.ggml_dequantize(
419+
quant, qweight_type, hidden_size, x_flat.shape[0], dtype
420+
)
391421
return dequant.view(*x.shape, hidden_size)
392422
else:
393423
qweight_type = WeightType(qweight_type)

vllm/model_executor/layers/quantization/utils/mps_dequant.py

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3-
"""MPS (Metal) dequantization utilities for AWQ and GPTQ int4 models.
3+
"""MPS (Metal) dequantization utilities for AWQ, GPTQ, and GGUF models.
44
5-
Uses the dequant_int4 Metal kernel package when available, with a pure
6-
PyTorch fallback for environments where the kernel isn't installed.
5+
Uses Metal kernel packages when available, with pure PyTorch/numpy
6+
fallbacks for environments where the kernels aren't installed.
77
"""
88

99
from typing import Any
@@ -223,3 +223,70 @@ def gptq_dequant_matmul(
223223
if bias is not None:
224224
out.add_(bias)
225225
return out.reshape(out_shape)
226+
227+
228+
# ── GGUF ──
229+
230+
# Cached handle to the optional Metal dequant_gguf kernel module; stays
# None when the package is not installed (numpy fallback is used instead).
_metal_dequant_gguf = None
# Set after the first import attempt so we never retry the import (and
# never re-log) on subsequent calls.
_metal_gguf_import_attempted = False
232+
233+
234+
def _get_metal_dequant_gguf():
    """Return the Metal dequant_gguf kernel module, or None if unavailable.

    The import is attempted at most once; the outcome (module or None) is
    cached in module-level state for all later calls.
    """
    global _metal_dequant_gguf, _metal_gguf_import_attempted
    if _metal_gguf_import_attempted:
        return _metal_dequant_gguf
    _metal_gguf_import_attempted = True
    try:
        import dequant_gguf
    except ImportError:
        # Not installed: callers will fall back to the numpy path.
        logger.info(
            "dequant_gguf Metal kernel not found, "
            "falling back to numpy-based GGUF dequantization"
        )
    else:
        _metal_dequant_gguf = dequant_gguf
        logger.info("Using Metal dequant_gguf kernel for GGUF dequantization")
    return _metal_dequant_gguf
250+
251+
252+
def _pytorch_dequant_gguf(
    W: torch.Tensor,
    quant_type: int,
    m: int,
    n: int,
    dtype: torch.dtype | None = None,
) -> torch.Tensor:
    """Fallback GGUF dequantization using the gguf Python library.

    This does a GPU→CPU→GPU round-trip via numpy, so it's slow but correct.

    Args:
        W: Quantized weight blocks (raw bytes, any integer dtype).
        quant_type: GGML quantization type id.
        m, n: Output matrix shape (rows, cols).
        dtype: Output dtype; defaults to float16 when None.
    """
    import numpy as np
    from gguf import GGMLQuantizationType, dequantize

    # dequantize() expects a flat uint8 view of the quantized blocks.
    raw_bytes = W.cpu().numpy().view(np.uint8)
    dequantized = dequantize(raw_bytes, GGMLQuantizationType(quant_type))
    target_dtype = torch.float16 if dtype is None else dtype
    # Copy the result back onto W's device in the requested dtype.
    out = torch.tensor(dequantized, dtype=target_dtype, device=W.device)
    return out.reshape(m, n)
271+
272+
273+
def gguf_dequant_on_mps(
    W: torch.Tensor,
    quant_type: int,
    m: int,
    n: int,
    dtype: torch.dtype | None = None,
) -> torch.Tensor:
    """Dequantize GGUF weights on MPS.

    Uses Metal kernel if available for all standard GGUF types,
    falls back to gguf library (numpy) for unsupported types (IQ*).

    Args:
        W: Quantized weight tensor.
        quant_type: GGML quantization type id.
        m, n: Output matrix shape (rows, cols).
        dtype: Output dtype; None lets the backend pick its default.
    """
    # Types the Metal kernel handles: Q4_0=2, Q4_1=3, Q5_0=6, Q5_1=7,
    # Q8_0=8, Q2_K=10, Q3_K=11, Q4_K=12, Q5_K=13, Q6_K=14
    metal_supported = {2, 3, 6, 7, 8, 10, 11, 12, 13, 14}

    kernel = _get_metal_dequant_gguf()
    if kernel is None or quant_type not in metal_supported:
        return _pytorch_dequant_gguf(W, quant_type, m, n, dtype)
    return kernel.dequantize_gguf(W, quant_type, m, n, dtype)

0 commit comments

Comments
 (0)