
Commit 0a399cf

multihead_attention_optimization: move heads separation out of sdpa backward
1 parent a076246

1 file changed: 4 additions, 3 deletions

src/nf/nf_multihead_attention_submodule.f90

Lines changed: 4 additions & 3 deletions
@@ -21,6 +21,10 @@ pure module subroutine common_backward(self, input, gradient, attention_mask)
 
     integer :: head, seq, i, j
 
+    self % v_heads = self % split_heads(self % value_layer % output)
+    self % k_heads = self % split_heads(self % key_layer % output)
+    self % q_heads = self % split_heads(self % query_layer % output)
+
     ! bakward through attention mechanism
     call self % sdpa_backward(gradient, attention_mask)
 
@@ -80,9 +84,6 @@ pure module subroutine sdpa_backward(self, gradient, attention_mask)
 
     ! split heads from output gradient
     self % d_output = self % split_heads(self % output_layer % gradient)
-    self % v_heads = self % split_heads(self % value_layer % output)
-    self % k_heads = self % split_heads(self % key_layer % output)
-    self % q_heads = self % split_heads(self % query_layer % output)
 
     ! iterate over heads to calculate deltas for each of them
    do concurrent(head = 1: self % n_heads)
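In effect, the v/k/q head splitting now happens once in common_backward, before sdpa_backward is called, and sdpa_backward is left to split only the output gradient. Below is a minimal, self-contained sketch of the kind of reshape a split_heads-style routine performs; the (sequence_length, head_size, n_heads) layout and every name in it are assumptions made for illustration, not the nf_multihead_attention API.

! Hedged sketch: a standalone approximation of a split_heads-style reshape,
! assuming activations are stored as (sequence_length, model_dimension) and
! split into (sequence_length, head_size, n_heads). Names and shapes are
! illustrative only; this is not the library routine.
program split_heads_sketch
  implicit none
  integer, parameter :: sequence_length = 4
  integer, parameter :: model_dimension = 6
  integer, parameter :: n_heads = 2
  integer, parameter :: head_size = model_dimension / n_heads
  real :: flat(sequence_length, model_dimension)
  real :: heads(sequence_length, head_size, n_heads)

  call random_number(flat)

  ! One reshape yields all per-head slices at once; doing it a single time per
  ! backward pass (as in common_backward above) means sdpa_backward does not
  ! have to repeat it.
  heads = reshape(flat, [sequence_length, head_size, n_heads])

  print '(a,3(1x,i0))', 'heads shape:', shape(heads)
end program split_heads_sketch

Compiling and running the sketch (e.g. gfortran split_heads_sketch.f90 && ./a.out) prints "heads shape: 4 3 2". Judging from the commit title alone, the point of hoisting these calls is that sdpa_backward no longer recomputes the q/k/v head splits on each invocation; that reading comes from the word "optimization" in the title, not from anything else in the diff.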
