Documents head size padding and adds commented-out sequence length padding

algo-home · algo-home · commit e23b08f8f563 · 2025-09-17T11:46:12.000+08:00
Adds a comment documenting that head dimensions are padded to multiples of 8 for 16-bit memory allocations; the padding logic itself is unchanged.

Adds a commented-out sequence length padding implementation for future consideration: key/value, mask, and bias padding in the forward pass, and the corresponding trimming of dk, dv, and dbias in the backward pass.
diff --git a/flash_dmattn/flash_dmattn_interface.py b/flash_dmattn/flash_dmattn_interface.py
@@ -241,11 +241,20 @@ def forward(
         if return_softmax is None:
             return_softmax = False
 
+        # Padding to multiple of 8 for 16-bit memory allocations
         head_size_og = q.size(3)
         if head_size_og % 8 != 0:
             q = torch.nn.functional.pad(q, [0, 8 - head_size_og % 8])
             k = torch.nn.functional.pad(k, [0, 8 - head_size_og % 8])
             v = torch.nn.functional.pad(v, [0, 8 - head_size_og % 8])
+        # seqlen_k_og = k.shape[1]
+        # if seqlen_k_og % 8 != 0:
+        #     k = torch.nn.functional.pad(k, [0, 0, 0, 0, 0, 8 - seqlen_k_og % 8])
+        #     v = torch.nn.functional.pad(v, [0, 0, 0, 0, 0, 8 - seqlen_k_og % 8])
+        #     if mask is not None:
+        #         mask = torch.nn.functional.pad(mask, [0, 8 - seqlen_k_og % 8], value=False)
+        #     if bias is not None:
+        #         bias = torch.nn.functional.pad(bias, [0, 8 - seqlen_k_og % 8], value=0.0)
 
         out_padded, softmax_lse, S_dmask = _wrapped_flash_dmattn_forward(
             q,
@@ -265,6 +274,7 @@ def forward(
             ctx.is_causal = is_causal
             ctx.softcap = softcap
             ctx.deterministic = deterministic
+            # ctx.seqlen_k_og = seqlen_k_og
 
         out = out_padded[..., :head_size_og]
 
@@ -307,6 +317,11 @@ def backward(
         dk = dk[..., : dout.shape[-1]]
         dv = dv[..., : dout.shape[-1]]
 
+        # if ctx.seqlen_k_og % 8 != 0:
+        #     dk = dk[:, : ctx.seqlen_k_og, :, :]
+        #     dv = dv[:, : ctx.seqlen_k_og, :, :]
+        #     dbias = dbias[..., : ctx.seqlen_k_og]
+
         return dq, dk, dv, None, dbias, None, None, None, None, None, None