Commit c82a5a1

Merge pull request #71 from mayank31398/mqa
Fixed MQA outputs not matching the HF model in the non-flash case
2 parents: 2223891 + 1809fc1 · commit: c82a5a1

File tree

1 file changed (+1, −1)


megatron/model/transformer.py

Lines changed: 1 addition & 1 deletion
@@ -414,7 +414,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask, alibi):
         # alibi: (batch_size * num_attention_heads, 1, max_seq_len)
         # TODO: ideally, alibi would have the shape: (1, num_heads * sq, sk)
         matmul_input_buffer = alibi[:bs * np, :, :sk].view(bs, np, sk)
-        matmul_input_buffer = matmul_input_buffer.repeat(1, sq, 1) # [b, np * sq, sk]
+        matmul_input_buffer = matmul_input_buffer.unsqueeze(2).expand(bs, np, sq, sk).reshape(bs, np * sq, sk) # [b, np * sq, sk]
 
         if alibi is None:
             # Raw attention scores. [b, np * sq, sk]
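
For context, a minimal standalone sketch (not part of the repository) of why the two expansions disagree: both calls produce the shape [b, np * sq, sk], but `repeat(1, sq, 1)` interleaves the heads along the row dimension, whereas `unsqueeze(2).expand(...).reshape(...)` keeps each head's sq rows contiguous, which presumably matches the head-major layout the rest of the non-flash attention path expects. The tensor sizes and the toy alibi values below are made up for illustration.

```python
import torch

# Tiny, made-up sizes: bs = batch, np = attention heads (as in the diff, not numpy),
# sq = query length, sk = key length.
bs, np, sq, sk = 1, 2, 3, 4

# Stand-in for the per-head alibi slice of shape [bs, np, sk]; each head gets a
# distinct constant value so the row ordering is visible below.
alibi = torch.arange(np, dtype=torch.float32).view(1, np, 1).expand(bs, np, sk)

# Old code: tiles the head dimension sq times, so rows come out interleaved as
# head0, head1, head0, head1, ...
old = alibi.repeat(1, sq, 1)

# New code: repeats each head sq times before flattening, so rows come out as
# head0 x sq, then head1 x sq (each head's rows contiguous).
new = alibi.unsqueeze(2).expand(bs, np, sq, sk).reshape(bs, np * sq, sk)

print(old[0, :, 0])  # tensor([0., 1., 0., 1., 0., 1.])
print(new[0, :, 0])  # tensor([0., 0., 0., 1., 1., 1.])
```

With ALiBi biases that differ across heads, the interleaved ordering pairs the wrong bias with most query rows, which is consistent with the mismatch against the HF implementation that this commit reports fixing.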
