@@ -3,7 +3,7 @@ const AA4{T} = AbstractArray{T,4}
const AA{N,T} = AbstractArray{T,N}

"""
- dot_product_attention(query, key, value; [bias, droput_fn, mask, num_heads])
+ dot_product_attention(query, key, value; [bias, fdrop, mask, nheads])

Multihead dot product attention used in transformer architectures.

@@ -15,12 +15,11 @@ and the sequece length, then an arbitrary number of batch dimensions or none.
- `query`: Query array of size `(qk_dim, q_len, batch_size...)`.
- `key`: Key array of size `(qk_dim, kv_len, batch_size...)`.
- `value`: Value array of size `(v_dim, kv_len, batch_size...)`.
- - `bias`: Either `nothing` or an input array broadcastable to size `(kv_len, q_len, num_heads, batch_size)`.
+ - `bias`: Either `nothing` or an input array broadcastable to size `(kv_len, q_len, nheads, batch_size)`.
+ - `fdrop`: A dropout function or layer to apply on the attention scores. Default `identity` (no dropout).
+ - `mask`: Either `nothing` or an input array broadcastable to size `(kv_len, q_len, nheads, batch_size)`.
Can also be set to `mask=:causal` to apply a causal mask. Default `nothing`.
- - `dropout_fn`: A dropout function to apply on the attention scores. Default `nothing`.
- - `mask`: Either `nothing` or an input array broadcastable to size `(kv_len, q_len, num_heads, batch_size)`.
- Can also be set to `mask=:causal` to apply a causal mask. Default `nothing`.
- - `num_heads`: Number of heads to split the input arrays into. Default `1`.
+ - `nheads`: Number of heads to split the input arrays into. Default `1`.

# Examples

@@ -29,7 +28,7 @@ q, k, v = rand(10, 20, 2), rand(10, 30, 2), rand(20, 30, 2)
y, α = dot_product_attention(q, k, v)
```
"""
- function dot_product_attention(q::AA{N}, k::AA{N}, v::AA{N}; num_heads=1, kws...) where N
+ function dot_product_attention(q::AA{N}, k::AA{N}, v::AA{N}; nheads=1, kws...) where N
batch_size = size(q)[3:end]

batch_size == size(k)[3:end] == size(v)[3:end] || throw(ArgumentError("Batch dimensions have to be the same."))
@@ -39,7 +38,7 @@ function dot_product_attention(q::AA{N}, k::AA{N}, v::AA{N}; num_heads=1, kws...
q, k, v = map(x -> reshape(x, size(x, 1), size(x, 2), :), (q, k, v))

# Multihead attention. TODO create fastpath for singlehead attention.
- q, k, v = split_heads.((q, k, v), num_heads)
+ q, k, v = split_heads.((q, k, v), nheads)
x, α = _dot_product_attention(q, k, v; kws...)
x = join_heads(x)

@@ -49,17 +48,17 @@ function dot_product_attention(q::AA{N}, k::AA{N}, v::AA{N}; num_heads=1, kws...
end

function _dot_product_attention(q::AA4, k::AA4, v::AA4;
- dropout_fn=nothing, bias=nothing, mask=nothing)
+ fdrop=identity, bias=nothing, mask=nothing)

- α = dot_product_attention_scores(q, k; dropout_fn, bias, mask)
- # [α] = [kv_len, q_len, num_heads, batch_size]
+ α = dot_product_attention_scores(q, k; fdrop, bias, mask)
+ # [α] = [kv_len, q_len, nheads, batch_size]

# The following permutedims and batched_mul are equivalent to
# @tullio x[d, h, i, b] := α[j, i, h, b] * v[d, h, j, b]
vt = permutedims(v, (1, 3, 2, 4))
x = batched_mul(vt, α)
x = permutedims(x, (1, 3, 2, 4))
- # [x] = [kv_dim ÷ num_heads, num_heads, q_len, batch_size]
+ # [x] = [kv_dim ÷ nheads, nheads, q_len, batch_size]

return x, α
end

@@ -68,35 +67,33 @@

Return the attention scores for the [`dot_product_attention`](@ref).

- Input arrays must have dimensions `(num_features ÷ num_heads, num_heads, sequence_length, batch_size)`
+ Input arrays must have dimensions `(num_features ÷ nheads, nheads, sequence_length, batch_size)`.

"""
function dot_product_attention_scores(q::AA4{T}, k::AA4{T};
- dropout_fn=nothing, mask=nothing, bias=nothing) where T
+ fdrop=identity, mask=nothing, bias=nothing) where T

- q = q ./ √T(size(q, 1))
-
# The following permutedims and batched_mul are equivalent to
- # @tullio α[j, i, h, b] := q[d, h, i, b] * k[d, h, j, b]
+ # @tullio logits[j, i, h, b] := q[d, h, i, b] * k[d, h, j, b] / √T(qk_dim)
kt = permutedims(k, (3, 1, 2, 4))
- qt = permutedims(q, (1, 3, 2, 4))
- α = batched_mul(kt, qt)
- # [α] = [kv_len, q_len, num_heads, batch_size]
+ qt = permutedims(q, (1, 3, 2, 4)) ./ √T(size(q, 1))
+ logits = batched_mul(kt, qt)
+ # [logits] = [kv_len, q_len, nheads, batch_size]

if bias !== nothing
- α = α .+ bias
+ logits = logits .+ bias
end

if mask !== nothing
if mask === :causal
- mask = make_causal_mask(α)
+ mask = make_causal_mask(logits)
end
- neginf = typemin(eltype(α))
- α = ifelse.(mask, α, neginf)
+ neginf = typemin(eltype(logits))
+ logits = ifelse.(mask, logits, neginf)
end

- α = softmax(α, dims=1)
- return dropout_fn === nothing ? α : dropout_fn(α)
+ α = softmax(logits, dims=1)
+ return fdrop(α)
end

"""
@@ -116,7 +113,7 @@
trues_like(x::AbstractArray, sz=size(x)) = fill!(similar(x, Bool, sz), true)
falses_like(x::AbstractArray, sz=size(x)) = fill!(similar(x, Bool, sz), false)

- split_heads(x, num_heads) = reshape(x, size(x, 1) ÷ num_heads, num_heads, size(x)[2:end]...)
+ split_heads(x, nheads) = reshape(x, size(x, 1) ÷ nheads, nheads, size(x)[2:end]...)
join_heads(x) = reshape(x, :, size(x)[3:end]...)

@non_differentiable make_causal_mask(x)
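
For orientation, here is a minimal usage sketch of the renamed keyword interface documented in the diff above. It assumes this code lives in NNlib, that `dot_product_attention` keeps the `nheads`, `fdrop`, and `mask` keywords shown in the `+` lines, and that the output shapes follow the docstring conventions; the size comments are expectations taken from the docstring, not verified output.

```julia
using NNlib  # assumes dot_product_attention is provided by NNlib, as in this file

# q, k have size (qk_dim, len, batch...) and v has size (v_dim, kv_len, batch...)
q, k, v = rand(Float32, 10, 20, 2), rand(Float32, 10, 30, 2), rand(Float32, 20, 30, 2)

# Two heads: qk_dim and v_dim must both be divisible by nheads.
# fdrop defaults to identity, i.e. no dropout is applied to the attention scores.
y, α = dot_product_attention(q, k, v; nheads = 2)
# expected per the docstring: size(y) == (20, 20, 2) and size(α) == (30, 20, 2, 2)

# Self-attention with a causal mask (query and key lengths must match for :causal).
x = rand(Float32, 10, 20, 2)
y2, α2 = dot_product_attention(x, x, x; nheads = 2, mask = :causal)
```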
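The `permutedims`/`batched_mul` route for the scores can be cross-checked against a plain-loop transcription of the `@tullio` comment. The sketch below is illustrative only: `scores_loop` and `scores_bmm` are hypothetical helper names, and heads are folded into the batch dimension so only the 3-dimensional `batched_mul` method is needed, which differs slightly from the 4-dimensional call in the diff.

```julia
using NNlib: batched_mul

# Loop transcription of: logits[j, i, h, b] := q[d, h, i, b] * k[d, h, j, b] / √(head_dim),
# where head_dim = size(q, 1) is the per-head feature dimension.
function scores_loop(q, k)
    T = eltype(q)
    d, h, qlen, b = size(q)
    klen = size(k, 3)
    logits = zeros(T, klen, qlen, h, b)
    for bi in 1:b, hi in 1:h, i in 1:qlen, j in 1:klen, di in 1:d
        logits[j, i, hi, bi] += q[di, hi, i, bi] * k[di, hi, j, bi] / sqrt(T(d))
    end
    return logits
end

# Same computation via permutedims + batched_mul, folding (heads, batch) into one batch axis.
function scores_bmm(q::AbstractArray{T,4}, k::AbstractArray{T,4}) where T
    kt = permutedims(k, (3, 1, 2, 4))                         # (kv_len, head_dim, nheads, batch)
    qt = permutedims(q, (1, 3, 2, 4)) ./ sqrt(T(size(q, 1)))  # (head_dim, q_len, nheads, batch)
    kt3 = reshape(kt, size(kt, 1), size(kt, 2), :)
    qt3 = reshape(qt, size(qt, 1), size(qt, 2), :)
    logits = batched_mul(kt3, qt3)                            # (kv_len, q_len, nheads * batch)
    return reshape(logits, size(kt, 1), size(qt, 2), size(q, 2), size(q, 4))
end

q = rand(Float32, 4, 2, 5, 3)  # (qk_dim ÷ nheads, nheads, q_len, batch)
k = rand(Float32, 4, 2, 7, 3)  # (qk_dim ÷ nheads, nheads, kv_len, batch)
@assert scores_loop(q, k) ≈ scores_bmm(q, k)
```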