Skip to content

Commit 9995e97

Browse files
authored
Optimize KV cache dequantization performance (#9528)
1 parent d3d91a8 commit 9995e97

File tree

1 file changed

+10
-14
lines changed

1 file changed

+10
-14
lines changed

torch_xla/experimental/pallas_kernels/ragged_paged_attention_v2.py

Lines changed: 10 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1954,12 +1954,14 @@ def masked_store(ref, val, start, end, group=1):
19541954
# kv lens will be contracting dim, we should mask out the NaNs.
19551955
kv_mask = (
19561956
lax.broadcasted_iota(jnp.int32, k.shape, 0) < kv_len - kv_len_start)
1957-
k = jnp.where(kv_mask, k.astype(jnp.float32), 0).astype(k.dtype)
1958-
v = jnp.where(kv_mask, v.astype(jnp.float32), 0).astype(v.dtype)
1959-
1960-
qk = (
1961-
jnp.einsum("nd,md->nm", q, k, preferred_element_type=jnp.float32) *
1962-
sm_scale)
1957+
k = jnp.where(kv_mask, k, 0)
1958+
v = jnp.where(kv_mask, v, 0)
1959+
1960+
qk = jnp.einsum("nd,md->nm", q, k, preferred_element_type=jnp.float32)
1961+
if k_scale is not None:
1962+
qk *= sm_scale * k_scale
1963+
else:
1964+
qk *= sm_scale
19631965
store_start = jnp.maximum(q_start - q_len_start, 0)
19641966
store_end = jnp.minimum(q_end - q_len_start, num_q_per_blk)
19651967

@@ -2007,6 +2009,8 @@ def init_scratch_ref():
20072009
m_curr = jnp.max(qk, axis=1, keepdims=True)
20082010
s_curr = jnp.exp(qk - m_curr)
20092011
qkv = jnp.dot(s_curr, v, preferred_element_type=jnp.float32)
2012+
if v_scale is not None:
2013+
qkv *= v_scale
20102014
lm_store_shape = head_m_ref.shape
20112015
m_curr = jnp.broadcast_to(m_curr, lm_store_shape)
20122016
l_curr = jnp.broadcast_to(
@@ -2088,14 +2092,6 @@ def prefetch_next_kv_blk():
20882092
for step_idx in range(kv_load_step):
20892093
k = k_list[step_idx]
20902094
v = v_list[step_idx]
2091-
if k_scale is not None:
2092-
# NOTE: Conversion between arbitrary data types is not supported.
2093-
# That's why it is converted to float32 first.
2094-
k = k.astype(jnp.float32) * k_scale
2095-
k = k.astype(q_ref.dtype)
2096-
if v_scale is not None:
2097-
v = v.astype(jnp.float32) * v_scale
2098-
v = v.astype(q_ref.dtype)
20992095
kv_head_idx = kv_head_chunk_idx + step_idx
21002096
q_head_idx = kv_head_idx * num_q_heads_per_kv_head
21012097
# TODO(jevinjiang): extra handling for packed type that can start at

0 commit comments

Comments (0)