Removes accum_dbias; fixes zeros dtype

algo-home · algo-home · commit 7535784acbe3 · 2025-10-22T20:14:13.000+08:00
Simplifies bias-gradient handling by deriving accumulation from the bias sequence-length condition, removing the redundant parameter and related plumbing.

Aligns zero-init of bias buffers with provided tensor options (no forced float), preventing mixed-precision dtype mismatches and improving correctness for MQA/GQA bias shapes.

Streamlines the backward API with no intended behavior changes beyond dtype fix.
diff --git a/csrc/flash_dmattn/flash_api.cpp b/csrc/flash_dmattn/flash_api.cpp
@@ -190,7 +190,6 @@ void set_params_dgrad(
     const float softcap,
     bool has_mask,
     bool has_bias,
-    bool accum_dbias,
     bool deterministic,
     const bool unpadded_lse
 ) {
@@ -246,8 +245,6 @@ void set_params_dgrad(
     // Softmax sum
     params.dsoftmax_sum = dsoftmax_sum_d;
 
-    params.accum_dbias = accum_dbias;
-
     params.deterministic = deterministic;
 }
 
@@ -982,11 +979,10 @@ mha_bwd(
     dbias_expanded = has_bias
         ? (num_heads_bias != num_heads || batch_size_bias == 1 || seqlen_q_bias == 1)     // MQA / GQA or dbias has different batch size or seqlen_q
             ? (seqlen_q_bias == 1)
-                ? torch::zeros({batch_size, num_heads, 1, seqlen_k_rounded}, opts.dtype(at::kFloat))
+                ? torch::zeros({batch_size, num_heads, 1, seqlen_k_rounded}, opts)
                 : torch::zeros({batch_size, num_heads, seqlen_q, seqlen_k_rounded}, opts)
             : dbias
         : torch::empty({0}, opts);
-    bool accum_dbias = has_bias && (seqlen_q_bias == 1 && seqlen_q != 1);
 
     Flash_bwd_params params;
 
@@ -1013,7 +1009,6 @@ mha_bwd(
         softcap,
         has_mask,
         has_bias,
-        accum_dbias,
         deterministic,
         /*unpadded_lse*/false
     );
@@ -1041,7 +1036,7 @@ mha_bwd(
         if (num_heads_bias != num_heads && batch_size_bias == batch_size && seqlen_q_bias == seqlen_q) {
             at::sum_out(dbias, at::reshape(dbias_expanded, {batch_size, num_heads_bias, num_heads / num_heads_bias, seqlen_q, seqlen_k_rounded}), {2});
         } else {
-            if (accum_dbias) {
+            if (seqlen_q_bias == 1) {
                 dbias_expanded = at::sum(at::reshape(dbias_expanded, {batch_size, num_heads_bias, num_heads / num_heads_bias, 1, seqlen_k_rounded}), {2});
             } else {
                 dbias_expanded = at::sum(at::reshape(dbias_expanded, {batch_size, num_heads_bias, num_heads / num_heads_bias, seqlen_q, seqlen_k_rounded}), {2});
@@ -1244,7 +1239,6 @@ mha_varlen_bwd(
         softcap,
         has_mask,
         has_bias,
-        /*accum_dbias*/false,
         deterministic,
         /*unpadded_lse*/true
     );