Why does num_warps affect numerical precision? #7751
Unanswered · yaozhenghangma asked this question in Q&A · 0 comments
I'm testing a Flash Attention kernel, and I've placed the test code at the end. In short, when performing an attention operation on `q`, `k`, and `v` matrices with `num_batchs = 8`, `seq_len = 128`, `num_heads = 128`, and `head_dim = 128`, I observed that `num_warps = 4` and `num_warps = 8` produce different output results. The maximum difference between the output matrices is `0.00048828125`, and this difference remains consistent across repeated runs. Moreover, when keeping all other parameters the same and setting `seq_len` to 32 or smaller, the discrepancy disappears.

I'm using an Nvidia T4 GPU, with CUDA version 12.4 and Triton version 3.1.0.
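The test code referenced above is not reproduced in this extract. Below is a hypothetical sketch of a comparison harness for the two launch configurations; the `attention` wrapper, its `num_warps` keyword argument, and the fp16 dtype are assumptions, not the original code. For reference, 0.00048828125 equals 2^-11, which is one fp16 ULP for magnitudes in [0.5, 1), assuming fp16 outputs.

```python
# Hypothetical comparison harness (a sketch, not the original test code from
# this post). `attention` is assumed to be a wrapper around the Triton Flash
# Attention kernel that forwards `num_warps` to the kernel launch.
import torch

def compare_num_warps(attention,
                      num_batchs=8, num_heads=128, seq_len=128, head_dim=128):
    torch.manual_seed(0)
    shape = (num_batchs, num_heads, seq_len, head_dim)
    q = torch.randn(shape, device="cuda", dtype=torch.float16)
    k = torch.randn(shape, device="cuda", dtype=torch.float16)
    v = torch.randn(shape, device="cuda", dtype=torch.float16)

    out4 = attention(q, k, v, num_warps=4)
    out8 = attention(q, k, v, num_warps=8)

    # 0.00048828125 == 2**-11, i.e. one fp16 ULP for magnitudes in [0.5, 1),
    # so a max difference of that size corresponds to a single rounding step.
    diff = (out4.float() - out8.float()).abs().max().item()
    print(f"max abs diff between num_warps=4 and num_warps=8: {diff}")
```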