using Flux, Functors, Test, LinearAlgebra, Random, Statistics
- using CUDA, CUDAKernels, KernelAbstractions, LoopVectorization
- using Tullio
+ using CUDA
+ using CUDAKernels, KernelAbstractions, LoopVectorization, Tullio
using NeuralAttentionlib
using BenchmarkTools
CUDA.allowscalar(false)  # error on scalar indexing of GPU arrays instead of silently falling back to slow CPU loops
+
const A3{T} = AbstractArray{T, 3}
+ const A4{T} = AbstractArray{T, 4}
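+ # A3 holds [features, len, batch] inputs; A4 holds per-head tensors [features ÷ num_heads, num_heads, len, batch]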

"""
    MultiHeadAttention(dims, num_heads;
@@ -48,7 +50,8 @@ function MultiHeadAttention(dims, num_heads::Int;
        bias::Bool = false,
        # init = glorot_uniform, # TODO
        attn_dropout_prob = 0.0,
-       out_proj_dropout_prob = 0.0)
+       out_proj_dropout_prob = 0.0,
+       self = false)

    dims = mha_process_dims(dims)
    @assert dims.qkv % num_heads == 0 "qkv_dim should be divisible by num_heads"
@@ -58,48 +61,59 @@ function MultiHeadAttention(dims, num_heads::Int;
    return MultiHeadAttention(num_heads, qkv_proj, attn_drop, out_proj)
end

- mha_process_dims(dims::Int) = (; q_in = dims, k_in = dims, v_in = dims, qkv = dims, out = dims)
- mha_process_dims((in, (qkv, out))::Pair{Int, <:Pair}) = (; q_in = in, k_in = in, v_in = in, qkv, out)
- mha_process_dims((in, (qkv, out))::Pair{<:Tuple, <:Pair}) = (; q_in = in[1], k_in = in[2], v_in = in[3], qkv, out)
+ mha_process_dims(dims::Int) =
+     (; q_in = dims, k_in = dims, v_in = dims, qkv = dims, out = dims)
+
+ mha_process_dims((in, (qkv, out))::Pair{Int, <:Pair{Int, Int}}) =
+     (; q_in = in, k_in = in, v_in = in, qkv, out)
+
+ mha_process_dims((in, (qkv, out))::Pair{<:Tuple, <:Pair{Int, Int}}) =
+     (; q_in = in[1], k_in = in[2], v_in = in[3], qkv, out)
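+ # e.g. mha_process_dims(64)             == (; q_in = 64, k_in = 64, v_in = 64, qkv = 64, out = 64)
+ #      mha_process_dims(64 => 32 => 16) == (; q_in = 64, k_in = 64, v_in = 64, qkv = 32, out = 16)
+ #      mha_process_dims((4, 5, 6) => 32 => 16) sets separate q/k/v input dims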

# self-attention
(m::MultiHeadAttention)(x; kws...) = m(x, x, x; kws...)

- function (m::MultiHeadAttention)(q_in::A3, k_in::A3, v_in::A3; with_weights = false, v = :tullio)
+ function (m::MultiHeadAttention)(q_in::A3, k_in::A3, v_in::A3; with_weights = false, impl = :tullio)
    ## [q_in] = [q_in_dim, q_len, batch_size]
    ## [k_in] = [k_in_dim, kv_len, batch_size]
    ## [v_in] = [v_in_dim, kv_len, batch_size]

-   if v == :tullio
-       q, k, v = m.qkv_proj(q_in, k_in, v_in, m.num_heads)
-       # [q] = [qkv_dim / num_heads, num_heads, q_len, batch_size]
-       # [k] = [v] = [qkv_dim / num_heads, num_heads, kv_len, batch_size]
-
-       x, α = dot_product_attention(q, k, v; dropout = m.attn_drop)
-       x = reshape(x, :, size(x, 3), size(x, 4))
-   elseif v == :nnalib
-       q, k, v = m.qkv_proj(q_in, k_in, v_in)
-       x = NeuralAttentionlib.multihead_qkv_attention(m.num_heads, q, k, v)
+   q, k, v = m.qkv_proj(q_in, k_in, v_in)
+   # [q] = [qkv_dim, q_len, batch_size]
+   # [k] = [v] = [qkv_dim, kv_len, batch_size]
+   if impl == :tullio
+       x, α = dot_product_attention(m.num_heads, q, k, v; dropout = m.attn_drop)
+   elseif impl == :nnalib
+       x, α = NeuralAttentionlib.multihead_qkv_attention(
+           NeuralAttentionlib.score_returning,
+           m.num_heads, q, k, v)
    else
        error("Unknown attention implementation")
    end
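+   # [x] = [qkv_dim, q_len, batch_size] from either branch
+   # (for impl == :tullio, [α] = [kv_len, q_len, num_heads, batch_size])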

    x = m.out_proj(x)

-   return x
-   # return with_weights ? (x, α) : x
+   return with_weights ? (x, α) : x
end

- # Inspired by https://flax.readthedocs.io/en/latest/api_reference/_autosummary/flax.linen.dot_product_attention.html?highlight=dot_product_attention
- function dot_product_attention(q, k, v; dropout = nothing)
+ # Inspired by https://flax.readthedocs.io/en/latest/api_reference/_autosummary/flax.linen.dot_product_attention.html
+ function dot_product_attention(q::A4, k::A4, v::A4; dropout = nothing)
    α = dot_product_attention_weights(q, k; dropout)
    # [α] = [kv_len, q_len, num_heads, batch_size]
    @tullio x[d, h, i, b] := α[j, i, h, b] * v[d, h, j, b]
    # [x] = [qkv_dim ÷ num_heads, num_heads, q_len, batch_size]
-
    return x, α
end
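+ # the @tullio contraction above is a per-head weighted sum of values:
+ # x[:, h, i, b] = Σ_j α[j, i, h, b] * v[:, h, j, b]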

+ function dot_product_attention(num_heads::Int, q::A3, k::A3, v::A3; kws...)
+     q, k, v = reshape_heads.((q, k, v), num_heads)
+     x, α = dot_product_attention(q, k, v; kws...)
+     return flatten_heads(x), α
+ end
+
+ reshape_heads(x, num_heads) = reshape(x, size(x, 1) ÷ num_heads, num_heads, size(x)[2:end]...)
+ flatten_heads(x) = reshape(x, :, size(x)[3:end]...)
+
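+ # reshape_heads: [dim, len, batch] -> [dim ÷ num_heads, num_heads, len, batch]
+ # flatten_heads: [dim ÷ num_heads, num_heads, len, batch] -> [dim, len, batch]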

function dot_product_attention_weights(q, k; dropout = nothing)
    @tullio α[j, i, h, b] := q[d, h, i, b] * k[d, h, j, b]
    # [α] = [kv_len, q_len, num_heads, batch_size]
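    # i.e. α[j, i, h, b] = dot(q[:, h, i, b], k[:, h, j, b]), the query-key score for each head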
@@ -125,16 +139,6 @@ function QKVProj((in_dim, qkv_dim)::Pair; bias = false)
    )
end

- function (proj::QKVProj)(q_in, k_in, v_in, num_heads)
-     q = proj.q_proj(q_in)
-     sz = size(q)
-     newsz = (sz[1] ÷ num_heads, num_heads, sz[2:end]...)
-     q = reshape(q, newsz)
-     k = reshape(proj.k_proj(k_in), newsz)
-     v = reshape(proj.v_proj(v_in), newsz)
-     return q, k, v
- end
-
function (proj::QKVProj)(q_in, k_in, v_in)
    return (proj.q_proj(q_in), proj.k_proj(k_in), proj.v_proj(v_in))
end
@@ -145,51 +149,73 @@ function perf(dim, len, batch_size, num_heads)
    x = rand(Float32, (dim, len, batch_size))

    println("tullio")
-   @btime $mha($x, v = :tullio);
-   @btime gradient(m -> sum(m($x, v = :tullio)), $mha);
+   @btime $mha($x, impl = :tullio);
+   @btime gradient(m -> sum(m($x, impl = :tullio)), $mha);

    println("nnalib")
-   @btime $mha($x, $x, $x, v = :nnalib);
-   @btime gradient(m -> sum(m($x, v = :nnalib)), $mha);
+   @btime $mha($x, $x, $x, impl = :nnalib);
+   @btime gradient(m -> sum(m($x, impl = :nnalib)), $mha);

    if CUDA.functional()
        mha_gpu = mha |> gpu
        x_gpu = x |> gpu

        println("tullio - gpu")
-       @btime $mha_gpu($x_gpu, v = :tullio);
-       @btime gradient(m -> sum(m($x_gpu, v = :tullio)), $mha_gpu);
+       @btime $mha_gpu($x_gpu, impl = :tullio);
+       @btime gradient(m -> sum(m($x_gpu, impl = :tullio)), $mha_gpu);

        println("nnalib - gpu")
-       @btime CUDA.@sync $mha_gpu($x_gpu, v = :nnalib);
-       @btime CUDA.@sync gradient(m -> sum(m($x_gpu, v = :nnalib)), $mha_gpu);
+       @btime CUDA.@sync $mha_gpu($x_gpu, impl = :nnalib);
+       @btime CUDA.@sync gradient(m -> sum(m($x_gpu, impl = :nnalib)), $mha_gpu);
    end
    return nothing
end

- function test(dim, len, batch_size, num_heads)
+ function test(dim, num_heads, len, batch_size)
    mha = MultiHeadAttention(dim, num_heads)
    x = rand(Float32, (dim, len, batch_size))
-   y = mha(x, v = :tullio)
+   y, α = mha(x, impl = :tullio, with_weights = true)
    @test y isa Array{Float32, 3}
    @test size(y) == (dim, len, batch_size)
-   y2 = mha(x, v = :nnalib)
+   @test α isa Array{Float32, 4}
+   @test size(α) == (len, len, num_heads, batch_size)
+
+   y2, α2 = mha(x, impl = :nnalib, with_weights = true)
    @test size(y) == size(y2)
-   @test y2 ≈ y
+   @test y2 ≈ y atol = 1e-1
+   @test size(α) == size(α2)
+   @test α2 ≈ α atol = 1e-1

    if CUDA.functional()
        mha_gpu = mha |> gpu
        x_gpu = x |> gpu

-       y_gpu = mha_gpu(x_gpu, v = :tullio)
-       y_gpu2 = mha_gpu(x_gpu, v = :nnalib)
-       @test Array(y_gpu) ≈ Array(y_gpu2)
+       y_gpu = mha_gpu(x_gpu, impl = :tullio)
+       y_gpu2 = mha_gpu(x_gpu, impl = :nnalib)
+       @test Array(y_gpu) ≈ Array(y_gpu2) atol = 1e-1
        @test Array(y_gpu) ≈ y
    end
    return nothing
end

- test(12, 3, 2, 4)
-
- perf(64, 100, 32, 4)
+ test(4, 2, 2, 1)
+
+ perf(128, 8, 128, 32)
+ # timings below: first line of each pair is the forward pass, second the gradient
+ # tullio
+ # 5.862 ms (85 allocations: 6.75 MiB)
+ # 14.291 ms (1046 allocations: 17.17 MiB)
+ # nnalib
+ # 6.331 ms (90 allocations: 7.75 MiB)
+ # 16.186 ms (690 allocations: 16.17 MiB)
+ # tullio - gpu
+ # 141.365 μs (499 allocations: 22.81 KiB)
+ # 804.018 μs (2228 allocations: 113.45 KiB)
+ # nnalib - gpu
+ # 163.487 μs (410 allocations: 18.02 KiB)
+ # 673.463 μs (1521 allocations: 84.64 KiB)
+
+ dim = 4; num_heads = 2; len = 2; batch_size = 1
+ mha = MultiHeadAttention(dim, num_heads)
+ x = rand(Float32, (dim, len, batch_size))
+ y, α = mha(x, impl = :tullio, with_weights = true)
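+ # expected: size(y) == (4, 2, 1) and size(α) == (2, 2, 2, 1), matching the assertions in test()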