Commit 8effe3c

Refactors mha_bwd to use torch::zeros for bias initialization and removes unnecessary zeroing of dbias_expanded
Parent: 08392c8

1 file changed (+2, -5)

csrc/flash_dmattn/flash_api.cpp

@@ -979,13 +979,10 @@ mha_bwd(
     dbias_expanded = has_bias
         ? (
             (num_heads_bias != num_heads || batch_size_bias != batch_size || seqlen_q_bias != seqlen_q)  // MQA / GQA or dbias has different batch size or seqlen_q
-            ? torch::empty({batch_size, num_heads, seqlen_q, seqlen_k_rounded}, opts)
+            ? torch::zeros({batch_size, num_heads, seqlen_q, seqlen_k_rounded}, opts)
             : dbias
         )
         : torch::empty({0}, opts);
-    if (has_bias) {
-        dbias_expanded.zero_();
-    }
 
     Flash_bwd_params params;
 
@@ -1050,7 +1047,7 @@ mha_bwd(
         }
     }
 
-    return { dq, dk, dv, dbias, softmax_d };
+    return {dq, dk, dv, dbias, softmax_d};
 }
 
 std::vector<at::Tensor>
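For context, the refactor collapses an allocate-then-zero sequence into a single zero-initialized allocation, and drops the unconditional zeroing that previously ran even when dbias_expanded aliased dbias directly. A minimal standalone sketch of the two patterns using the libtorch C++ API (shapes and dtype are illustrative stand-ins, not the repository's values):

    #include <torch/torch.h>

    int main() {
        // Illustrative stand-ins for batch_size, num_heads, seqlen_q, seqlen_k_rounded.
        auto opts = torch::TensorOptions().dtype(torch::kFloat32);

        // Before: allocate uninitialized memory, then run a separate fill kernel.
        auto before = torch::empty({2, 4, 128, 128}, opts);
        before.zero_();

        // After: one call that allocates and zero-fills in a single step.
        auto after = torch::zeros({2, 4, 128, 128}, opts);

        // Both paths produce an all-zero tensor of the same shape.
        TORCH_CHECK(before.equal(after), "patterns should be equivalent");
        return 0;
    }

Note that after this change, when dbias_expanded reuses dbias directly (the non-expanded case), the buffer is no longer zeroed before the backward pass; the commit message treats that zeroing as unnecessary, presumably because the kernel fully overwrites the buffer.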
