+using Flux, Test, LinearAlgebra, Random, Statistics
+using CUDA, CUDAKernels, LoopVectorization
+using Tullio
+using NeuralAttentionlib
+using BenchmarkTools
+
+const A3{T} = AbstractArray{T, 3}
+
 """
-    MHAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false,
-                attn_dropout_prob = 0., proj_dropout_prob = 0.)
+    MultiHeadAttention(dims, num_heads;
+                       [bias, init, attn_dropout_prob, out_proj_dropout_prob])

-Multi-head self-attention layer.
+Multi-head dot-product attention layer.

 # Arguments

-- `planes`: number of input channels
+- `dims`: the embedding dimensions. Either a single `Int` (used for the inputs, the
+  internal projection and the output), or `in_dim => (qkv_dim => out_dim)`, or
+  `(q_in_dim, k_in_dim, v_in_dim) => (qkv_dim => out_dim)`.
-- `nheads`: number of heads
+- `num_heads`: number of attention heads
-- `qkv_bias`: whether to use bias in the layer to get the query, key and value
+- `init`: weight initializer for the Dense layers.
+- `bias`: whether the pointwise query/key/value/output Dense layers use a bias.
 - `attn_dropout_prob`: dropout probability after the self-attention layer
-- `proj_dropout_prob`: dropout probability after the projection layer
+- `out_proj_dropout_prob`: dropout probability after the output projection layer
+
+# Forward
+
+- `q_in`: query tensor of shape `(q_in_dim, q_len, batch_size)`
+- `k_in`: key tensor of shape `(k_in_dim, kv_len, batch_size)`
+- `v_in`: value tensor of shape `(v_in_dim, kv_len, batch_size)`
+- `mask`: mask tensor of shape `(kv_len, q_len, batch_size)`
+- `with_weights`: whether to also return the attention weights
+- `impl`: which implementation to use, `:tullio` (default) or `:nnalib`
+
+# Examples
+
+```julia
+mha = MultiHeadAttention(64, 8)
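+
+# Inputs use the (features, seq_len, batch_size) layout assumed throughout this file.
+x = rand(Float32, 64, 100, 32)
+y = mha(x)            # self-attention; size(y) == (64, 100, 32)
+y = mha(x, x, x)      # equivalent explicit query/key/value call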
+```
 """
-struct MultiHeadAttention{P, Q, R}
-    nheads::Int
-    qkv_layer::P
-    attn_drop::Q
-    projection::R
-end
-
-@functor MHAttention
-
-function MultiHeadAttention(planes::Integer, nheads::Integer = 8; qkv_bias::Bool = false,
-                            attn_dropout_prob = 0.0, proj_dropout_prob = 0.0)
-    @assert planes % nheads == 0 "planes should be divisible by nheads"
-    qkv_layer = Dense(planes, planes * 3; bias = qkv_bias)
-    attn_drop = Dropout(attn_dropout_prob)
-    proj = Chain(Dense(planes, planes), Dropout(proj_dropout_prob))
-    return MultiHeadAttention(nheads, qkv_layer, attn_drop, proj)
-end
-
-function (m::MultiHeadAttention)(x::AbstractArray{T, 3}) where {T}
-    nfeatures, seq_len, batch_size = size(x)
-    x_reshaped = reshape(x, nfeatures, seq_len * batch_size)
-    qkv = m.qkv_layer(x_reshaped)
-    qkv_reshaped = reshape(qkv, nfeatures ÷ m.nheads, m.nheads, seq_len, 3 * batch_size)
-    query, key, value = chunk(qkv_reshaped, 3; dims = 4)
-    scale = convert(T, sqrt(size(query, 1) / m.nheads))
-    key_reshaped = reshape(permutedims(key, (2, 1, 3, 4)), m.nheads, nfeatures ÷ m.nheads,
-                           seq_len * batch_size)
-    query_reshaped = reshape(permutedims(query, (1, 2, 3, 4)), nfeatures ÷ m.nheads,
-                             m.nheads, seq_len * batch_size)
-
-    attention = softmax(batched_mul(query_reshaped, key_reshaped) .* scale)
-    attention = m.attn_drop(attention)
+struct MultiHeadAttention
+    num_heads::Int
+    qkv_proj
+    attn_drop
+    out_proj
+end
+
+@functor MultiHeadAttention
+
+function MultiHeadAttention(dims, num_heads::Int;
+                            bias::Bool = false,
+                            # init = glorot_uniform, # TODO
+                            attn_dropout_prob = 0.0,
+                            out_proj_dropout_prob = 0.0)
+
+    dims = mha_process_dims(dims)
+    @assert dims.qkv % num_heads == 0 "qkv_dim should be divisible by num_heads"
+    qkv_proj = QKVProj((dims.q_in, dims.k_in, dims.v_in) => dims.qkv; bias)
+    attn_drop = Dropout(attn_dropout_prob)
+    out_proj = Chain(Dense(dims.qkv => dims.out; bias), Dropout(out_proj_dropout_prob))
+    return MultiHeadAttention(num_heads, qkv_proj, attn_drop, out_proj)
+end
+
+mha_process_dims(dims::Int) = (; q_in = dims, k_in = dims, v_in = dims, qkv = dims, out = dims)
+mha_process_dims((in, (qkv, out))::Pair{Int, <:Pair}) = (; q_in = in, k_in = in, v_in = in, qkv, out)
+mha_process_dims((in, (qkv, out))::Pair{<:Tuple, <:Pair}) = (; q_in = in[1], k_in = in[2], v_in = in[3], qkv, out)
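+# Illustrative examples of the accepted `dims` forms (not part of the implementation):
+#   mha_process_dims(64)                        # (; q_in = 64, k_in = 64, v_in = 64, qkv = 64, out = 64)
+#   mha_process_dims(64 => (128 => 32))         # (; q_in = 64, k_in = 64, v_in = 64, qkv = 128, out = 32)
+#   mha_process_dims((3, 4, 5) => (128 => 32))  # (; q_in = 3, k_in = 4, v_in = 5, qkv = 128, out = 32)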
+
+# self-attention
+(m::MultiHeadAttention)(x; kws...) = m(x, x, x; kws...)
+
+function (m::MultiHeadAttention)(q_in::A3, k_in::A3, v_in::A3; with_weights=false, impl=:tullio)
+    ## [q_in] = [q_in_dim, q_len, batch_size]
+    ## [k_in] = [k_in_dim, kv_len, batch_size]
+    ## [v_in] = [v_in_dim, kv_len, batch_size]
+
+    if impl == :tullio
+        q, k, v = m.qkv_proj(q_in, k_in, v_in, m.num_heads)
+        # [q] = [qkv_dim / num_heads, num_heads, q_len, batch_size]
+        # [k] = [v] = [qkv_dim / num_heads, num_heads, kv_len, batch_size]

-    value_reshaped = reshape(permutedims(value, (1, 2, 3, 4)), nfeatures ÷ m.nheads,
-                             m.nheads, seq_len * batch_size)
-    pre_projection = reshape(batched_mul(attention, value_reshaped),
-                             (nfeatures, seq_len, batch_size))
-    y = m.projection(reshape(pre_projection, size(pre_projection, 1), :))
-    return reshape(y, :, seq_len, batch_size)
+        x, α = dot_product_attention(q, k, v; dropout=m.attn_drop)
+        x = reshape(x, :, size(x, 3), size(x, 4))
+    elseif impl == :nnalib
+        q, k, v = m.qkv_proj(q_in, k_in, v_in)
+        x = NeuralAttentionlib.multihead_qkv_attention(m.num_heads, q, k, v)
+    else
+        error("Unknown attention implementation: $impl")
+    end
+
+    x = m.out_proj(x)
+
+    return x
+    # return with_weights ? (x, α) : x
 end

-using Flux, Functors, Test, NNlib, MLUtils
+# Inspired by https://flax.readthedocs.io/en/latest/api_reference/_autosummary/flax.linen.dot_product_attention.html?highlight=dot_product_attention
+function dot_product_attention(q, k, v; dropout=nothing)
+    α = dot_product_attention_weights(q, k; dropout)
+    # [α] = [kv_len, q_len, num_heads, batch_size]
+    @tullio x[d, h, i, b] := α[j, i, h, b] * v[d, h, j, b]
+    # [x] = [qkv_dim ÷ num_heads, num_heads, q_len, batch_size]
+
+    return x, α
+end
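+# In matrix form, per head h and batch element b, the two Tullio contractions compute
+#   x = V * softmax(Kᵀ Q / √d_head),
+# i.e. standard scaled dot-product attention (the 1/√d_head factor is applied in
+# dot_product_attention_weights below).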

-mha = MultiHeadAttention(64, 8)
-sz = (64, 100, 32)
-x = rand(Float32, sz)
-y = mha(x)
-@test y isa Array{Float32, 3}
-@test size(y) == sz
+function dot_product_attention_weights(q, k; dropout=nothing)
+    @tullio α[j, i, h, b] := q[d, h, i, b] * k[d, h, j, b]
+    # [α] = [kv_len, q_len, num_heads, batch_size]
+    # Scale by 1/√d_head before the softmax, as in standard scaled dot-product attention.
+    α = softmax(α ./ sqrt(eltype(α)(size(q, 1))), dims=1)
+    return dropout === nothing ? α : dropout(α)
+end
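+# Shape sanity check (illustrative, assuming qkv_dim = 64, num_heads = 8, q_len = kv_len = 100,
+# batch_size = 32): for q and k of size (8, 8, 100, 32), the weights α have size
+# (100, 100, 8, 32) and sum to 1 along dims = 1.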
+
+
+struct QKVProj
+    k_proj::Dense
+    v_proj::Dense
+    q_proj::Dense
+end
+
+@functor QKVProj
+
+function QKVProj((in_dim, qkv_dim)::Pair; bias = false)
+    q_in_dim, k_in_dim, v_in_dim = in_dim
+    return QKVProj(
+        Dense(k_in_dim => qkv_dim; bias),
+        Dense(v_in_dim => qkv_dim; bias),
+        Dense(q_in_dim => qkv_dim; bias)
+    )
+end
+
+function (proj::QKVProj)(q_in, k_in, v_in, num_heads)
+    # Split the projected features into (head_dim, num_heads, seq_len, batch_size).
+    # k and v may have a different sequence length than q, so each array is reshaped
+    # using its own size rather than reusing q's.
+    split_heads(x) = reshape(x, size(x, 1) ÷ num_heads, num_heads, size(x)[2:end]...)
+    q = split_heads(proj.q_proj(q_in))
+    k = split_heads(proj.k_proj(k_in))
+    v = split_heads(proj.v_proj(v_in))
+    return q, k, v
+end
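+# E.g. (illustrative): with qkv_dim = 64 and num_heads = 8, inputs of size (64, 100, 32)
+# are projected and reshaped to q, k, v of size (8, 8, 100, 32) each.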
+
+function (proj::QKVProj)(q_in, k_in, v_in)
+    return (proj.q_proj(q_in), proj.k_proj(k_in), proj.v_proj(v_in))
+end
+
+
+function perf(dim, len, batch_size, num_heads)
+    mha = MultiHeadAttention(dim, num_heads)
+    x = rand(Float32, (dim, len, batch_size))
+
+    y = mha(x, x, x)
+    @test y isa Array{Float32, 3}
+    @test size(y) == (dim, len, batch_size)
+
+
+    println("tullio")
+    @btime $mha($x, impl=:tullio);
+    @btime gradient(m -> sum(m($x, impl=:tullio)), $mha);
+
+    println("nnalib")
+    @btime $mha($x, $x, $x, impl=:nnalib);
+    @btime gradient(m -> sum(m($x, impl=:nnalib)), $mha);
+
+    if CUDA.functional()
+        mha_gpu = mha |> gpu
+        x_gpu = x |> gpu
+
+        println("tullio - gpu")
+        @btime CUDA.@sync $mha_gpu($x_gpu, impl=:tullio);
+        @btime CUDA.@sync gradient(m -> sum(m($x_gpu, impl=:tullio)), $mha_gpu);
+
+        println("nnalib - gpu")
+        @btime CUDA.@sync $mha_gpu($x_gpu, impl=:nnalib);
+        @btime CUDA.@sync gradient(m -> sum(m($x_gpu, impl=:nnalib)), $mha_gpu);
+    end
+    return nothing
+end
+
+perf(64, 100, 32, 8)