@@ -3,21 +3,28 @@ const AA4{T} = AbstractArray{T,4}
const AA{N,T} = AbstractArray{T,N}

"""
-    dot_product_attention(query, key, value; [bias, fdrop, mask, nheads])
+    dot_product_attention(query, key, value, [bias]; [fdrop, mask, nheads])

Multihead dot product attention used in transformer architectures.

The input arrays must have the first two dimensions given by the number of features
-and the sequece length, then an arbitrary number of batch dimensions or none.
+and the sequence length, then an arbitrary number of batch dimensions or none.
+
+Returns the attention output array of size `(v_dim, q_len, batch_size...)` and the attention scores
+of size `(kv_len, q_len, nheads, batch_size...)`.
+
+See also [`dot_product_attention_scores`](@ref) if you only need the attention scores.

# Arguments

- `query`: Query array of size `(qk_dim, q_len, batch_size...)`.
- `key`: Key array of size `(qk_dim, kv_len, batch_size...)`.
- `value`: Value array of size `(v_dim, kv_len, batch_size...)`.
- - `bias`: Either `nothing` or an input array broadcastable to size `(kv_len, q_len, nheads, batch_size)`.
+ - `bias`: Either `nothing` or an array broadcastable to size `(kv_len, q_len, nheads, batch_size)`.
+   It will be added to the attention scores before applying the softmax. Default `nothing`.
- `fdrop`: A dropout function or layer to apply on the attention scores. Default `identity` (no dropout).
- - `mask`: Either `nothing` or an input array broadcastable to size `(kv_len, q_len, nheads, batch_size)`.
+ - `mask`: Either `nothing` or a boolean array broadcastable to size `(kv_len, q_len, nheads, batch_size)`.
+   The mask will be applied to the attention scores before the softmax.
  Can also be set to `mask=:causal` to apply a causal mask. Default `nothing`.
- `nheads`: Number of heads to split the input arrays into. Default `1`.
@@ -28,36 +35,37 @@ q, k, v = rand(10, 20, 2), rand(10, 30, 2), rand(20, 30, 2)
y, α = dot_product_attention(q, k, v)
```
"""
-function dot_product_attention(q::AA{N}, k::AA{N}, v::AA{N}; nheads=1, kws...) where N
+function dot_product_attention(q::AA{N}, k::AA{N}, v::AA{N}, args...; kws...) where N
    batch_size = size(q)[3:end]
-
    batch_size == size(k)[3:end] == size(v)[3:end] || throw(ArgumentError("Batch dimensions have to be the same."))
-    size(q, 1) == size(k, 1) || throw(ArgumentError("First dimension in query and key has to be the same."))
-    size(k, 2) == size(v, 2) || throw(ArgumentError("Second dimension in key and value has to be the same."))
-
    q, k, v = map(x -> reshape(x, size(x, 1), size(x, 2), :), (q, k, v))

-    x, α = dot_product_attention(q, k, v; nheads, kws...)
+    x, α = dot_product_attention(q, k, v, args...; kws...)

    x = reshape(x, size(x, 1), size(x, 2), batch_size...)
    α = reshape(α, size(α)[1:3]..., batch_size...)
    return x, α
end
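For a concrete sense of the API documented in the docstring above, here is a minimal usage sketch. The sizes are made up, and it assumes these methods are exported as in NNlib; the expected output sizes follow directly from the docstring.

```julia
using NNlib  # assumption: dot_product_attention is exported from NNlib

q = rand(Float32, 16, 20, 2)   # (qk_dim, q_len, batch_size...)
k = rand(Float32, 16, 20, 2)   # (qk_dim, kv_len, batch_size...)
v = rand(Float32, 24, 20, 2)   # (v_dim, kv_len, batch_size...)

# 4 heads and a causal mask; qk_dim and v_dim are chosen divisible by nheads,
# and q_len == kv_len so the causal mask broadcasts cleanly.
y, α = dot_product_attention(q, k, v; nheads=4, mask=:causal)

size(y)  # (24, 20, 2)     == (v_dim, q_len, batch_size...)
size(α)  # (20, 20, 4, 2)  == (kv_len, q_len, nheads, batch_size...)
```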
-function dot_product_attention(q::AA3, k::AA3, v::AA3; nheads=1, kws...)
+function dot_product_attention(q::AA3, k::AA3, v::AA3, bias=nothing;
+            fdrop=identity, mask=nothing, nheads=1)
+
+    (size(q, 3) == size(k, 3) == size(v, 3)) || throw(ArgumentError("Batch dimensions have to be the same."))
+    size(q, 1) == size(k, 1) || throw(ArgumentError("First dimension in query and key has to be the same."))
+    size(k, 2) == size(v, 2) || throw(ArgumentError("Second dimension in key and value has to be the same."))
+
    # Multihead attention. TODO create fastpath for singlehead attention.
    q, k, v = split_heads.((q, k, v), nheads)
-    x, α = _dot_product_attention(q, k, v; kws...)
+    x, α = _dot_product_attention(q, k, v, bias, fdrop, mask)
    return join_heads(x), α
end
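`split_heads` and `join_heads` are used above but not shown in these hunks. A plausible sketch, assuming they are plain reshapes over the feature dimension, matching the `qk_dim ÷ nheads` layout described in the comments of `_dot_product_attention` below:

```julia
# Not part of this diff: hypothetical definitions of the head-splitting helpers.
# split_heads maps (dim, len, batch...) to (dim ÷ nheads, nheads, len, batch...);
# join_heads reverses it by merging the first two dimensions.
split_heads(x, nheads) = reshape(x, size(x, 1) ÷ nheads, nheads, size(x)[2:end]...)
join_heads(x) = reshape(x, :, size(x)[3:end]...)
```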
-function _dot_product_attention(q::AA4, k::AA4, v::AA4;
-            fdrop=identity, bias=nothing, mask=nothing)
+function _dot_product_attention(q::AA4, k::AA4, v::AA4, bias, fdrop, mask)
    # [q] = [qk_dim ÷ nheads, nheads, q_len, batch_size]
    # [k] = [qk_dim ÷ nheads, nheads, kv_len, batch_size]
    # [v] = [v_dim ÷ nheads, nheads, kv_len, batch_size]
-
-    α = dot_product_attention_scores(q, k; fdrop, bias, mask)
+
+    α = dot_product_attention_scores(q, k, bias; fdrop, mask)
    # [α] = [kv_len, q_len, nheads, batch_size]

    # The following permutedims and batched_mul are equivalent to
@@ -70,14 +78,16 @@ function _dot_product_attention(q::AA4, k::AA4, v::AA4;
end

"""
-    dot_product_attention_scores(query, key; [bias, droput_fn, mask])
+    dot_product_attention_scores(query, key, [bias]; [fdrop, mask])

Return the attention scores for the [`dot_product_attention`](@ref).
+Input arrays must have dimensions
+`(num_features ÷ nheads, nheads, sequence_length, batch_size)`.

-Input arrays must have dimensions `(num_features ÷ nheads, nheads, sequence_length, batch_size)`.
+See [`dot_product_attention`](@ref) for more details.
"""
-function dot_product_attention_scores(q::AA4{T}, k::AA4{T};
-            fdrop=identity, mask=nothing, bias=nothing) where T
+function dot_product_attention_scores(q::AA4{T}, k::AA4{T}, bias=nothing;
+            fdrop=identity, mask=nothing) where T

    # The following permutedims and batched_mul are equivalent to
    # @tullio logits[j, i, h, b] := q[d, h, i, b] * k[d, h, j, b] / √T(qk_dim)
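Finally, a short sketch of calling `dot_product_attention_scores` directly on head-split inputs, again with made-up sizes. The normalization check assumes the returned scores are softmax-normalized over the key dimension, which the docstring's mention of the softmax and the `[kv_len, q_len, nheads, batch_size]` layout suggest.

```julia
# Hypothetical sizes; q and k are already head-split as the docstring requires:
# (num_features ÷ nheads, nheads, sequence_length, batch_size).
nheads, d, len, batch = 4, 16, 20, 2
qh = rand(Float32, d ÷ nheads, nheads, len, batch)
kh = rand(Float32, d ÷ nheads, nheads, len, batch)

α = dot_product_attention_scores(qh, kh)
size(α)  # (20, 20, 4, 2) == (kv_len, q_len, nheads, batch_size)

# Each query's scores should form a probability distribution over the keys
# (the first dimension), so they sum to one along dims=1.
all(sum(α; dims=1) .≈ 1)
```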