@@ -1073,7 +1073,6 @@ def quantized_matmul_int8(
     batch_block_size: int | None = None,
     out_block_size: int | None = None,
     in_block_size: int | None = None,
-    vmem_limit_bytes: int | None = 64 * 1024 * 1024,
 ) -> torch.Tensor:
   from torch_xla.experimental.pallas_kernels.quantized_matmul_kernel import (
       quantized_matmul_int8,
@@ -1084,6 +1083,7 @@ def quantized_matmul_int8(
   n_out_features, _ = w.shape
   jax_dtype = convert_torch_dtype_to_jax(x.dtype)
   import jax.numpy as jnp
+  # We fetch the tuned block sizes here instead of in the kernel because if we cannot find the block sizes (meaning we haven't tuned the kernel for that case), we fall back to the XLA quantized matmul kernel, which performs better than running the kernel with a default but untuned block size.
   batch_block_size, out_block_size, in_block_size = get_tuned_block_sizes(
       TUNED_BLOCK_SIZES, bs, n_out_features, n_in_features,
       jnp.dtype(jax_dtype).name, quantize_activation)
@@ -1096,7 +1096,6 @@ def quantized_matmul_int8(
       "batch_block_size": batch_block_size,
       "out_block_size": out_block_size,
       "in_block_size": in_block_size,
-      "vmem_limit_bytes": vmem_limit_bytes
   })
   from torch_xla.experimental.xla_quantized_matmul import quantized_matmul_xla
   return quantized_matmul_xla(
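The comment introduced above spells out a lookup-then-fallback policy: block sizes come from a table of tuned configurations, and when the lookup fails the op routes to the XLA quantized matmul instead of launching the Pallas kernel with untuned defaults. Below is a minimal, self-contained sketch of that pattern; the table layout, key format, and helper names (_TUNED, lookup_block_sizes, matmul_with_fallback) are illustrative assumptions, not the actual TUNED_BLOCK_SIZES / get_tuned_block_sizes definitions in torch_xla.

# Sketch only: the real TUNED_BLOCK_SIZES table and get_tuned_block_sizes()
# in torch_xla may use a different key format and return convention.
from typing import Optional, Tuple

BlockSizes = Tuple[int, int, int]  # (batch_block, out_block, in_block)

# Hypothetical tuning table keyed by problem shape, dtype name, and
# whether the activation is quantized.
_TUNED: dict[tuple[int, int, int, str, bool], BlockSizes] = {
    (1024, 8192, 8192, "bfloat16", True): (256, 1024, 1024),
}

def lookup_block_sizes(bs: int, n_out: int, n_in: int, dtype_name: str,
                       quantize_activation: bool) -> Optional[BlockSizes]:
  """Returns tuned block sizes, or None if this case was never tuned."""
  return _TUNED.get((bs, n_out, n_in, dtype_name, quantize_activation))

def matmul_with_fallback(bs, n_out, n_in, dtype_name, quantize_activation,
                         run_pallas, run_xla):
  sizes = lookup_block_sizes(bs, n_out, n_in, dtype_name, quantize_activation)
  if sizes is None:
    # No tuned entry: the XLA kernel beats an untuned Pallas launch.
    return run_xla()
  batch_block, out_block, in_block = sizes
  return run_pallas(batch_block, out_block, in_block)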
@@ -1737,7 +1736,7 @@ def gmm_non_xla(lhs: torch.Tensor,


 XLA_LIB.define(
-    "quantized_matmul_int8(Tensor x, Tensor w, Tensor scalar, Tensor? zero_point=None, Tensor? quant_block_size=None, bool quantize_activation=False, int? batch_block_size=None, int? out_block_size=None, int? in_block_size=None, int? vmem_limit_bytes=None) -> Tensor",
+    "quantized_matmul_int8(Tensor x, Tensor w, Tensor scalar, Tensor? zero_point=None, Tensor? quant_block_size=None, bool quantize_activation=False, int? batch_block_size=None, int? out_block_size=None, int? in_block_size=None) -> Tensor",
 )


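For reference, a hedged call-site sketch against the updated schema, with block sizes now resolved internally and no vmem_limit_bytes argument. The torch.ops.xla namespace, the per-output-channel layout of scalar, and the example shapes are assumptions for illustration, not taken from this diff.

# Assumes XLA_LIB registers into the "xla" namespace so the op is reachable
# as torch.ops.xla.quantized_matmul_int8; adjust if the namespace differs.
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()
x = torch.randn(16, 4096, dtype=torch.bfloat16, device=device)    # activations (bs, n_in)
w = torch.randint(-128, 127, (11008, 4096), dtype=torch.int8,
                  device=device)                                   # int8 weights (n_out, n_in)
scalar = torch.randn(11008, dtype=torch.bfloat16, device=device)   # assumed per-channel scales

# zero_point / quant_block_size keep their None defaults; the tuned block
# sizes are looked up inside the op.
out = torch.ops.xla.quantized_matmul_int8(x, w, scalar, quantize_activation=True)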
@@ -1752,11 +1751,10 @@ def quantized_matmul_int8_xla(
     batch_block_size: int | None = None,
     out_block_size: int | None = None,
     in_block_size: int | None = None,
-    vmem_limit_bytes: int | None = 64 * 1024 * 1024,
 ) -> torch.Tensor:
   return quantized_matmul_int8(x, w, scalar, zero_point, quant_block_size,
                                quantize_activation, batch_block_size,
-                               out_block_size, in_block_size, vmem_limit_bytes)
+                               out_block_size, in_block_size)


 @impl(XLA_LIB, "quantized_matmul_int8", "CompositeExplicitAutograd")
@@ -1770,7 +1768,6 @@ def quantized_matmul_int8_non_xla(
     batch_block_size: int | None = None,
     out_block_size: int | None = None,
     in_block_size: int | None = None,
-    vmem_limit_bytes: int | None = 64 * 1024 * 1024,
 ) -> torch.Tensor:
   # This will be called when dynamo uses fake tensors to construct the fake output.
   # We need to make sure the output tensor's shape is correct.
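The CompositeExplicitAutograd registration above only has to hand dynamo a tensor with the right shape and dtype during fake-tensor tracing. A minimal sketch of that kind of shape-only implementation, assuming the output dtype simply follows x, is shown below; the helper name is hypothetical and this is not the body of quantized_matmul_int8_non_xla.

import torch

def _fake_quantized_matmul_int8(x: torch.Tensor, w: torch.Tensor,
                                scalar: torch.Tensor) -> torch.Tensor:
  # Shape-only stand-in: x is (bs, n_in), w is (n_out, n_in), so the
  # result is (bs, n_out). No arithmetic is performed; dynamo only
  # inspects the shape and dtype of the returned tensor.
  bs = x.shape[0]
  n_out_features = w.shape[0]
  return torch.empty(bs, n_out_features, dtype=x.dtype, device=x.device)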