@@ -111,9 +111,11 @@ function (m::MultiHeadAttention)(q_in::A3, k_in::A3, v_in::A3;
     # [v] = [v_dim, kv_len, batch_size]
 
     if impl == :tullio
-        x, α = dot_product_attention(m.num_heads, q, k, v; mask, dropout=m.attn_drop)
+        x, α = dot_product_attention_tullio(m.num_heads, q, k, v; mask, dropout=m.attn_drop)
     elseif impl == :nalib
         x, α = NeuralAttentionlib.multihead_qkv_attention(score_returning, m.num_heads, q, k, v, mask)
+    elseif impl == :native
+        x, α = dot_product_attention_native(m.num_heads, q, k, v; mask, dropout=m.attn_drop)
     else
         error("Unknown attention implementation")
     end
@@ -126,24 +128,30 @@ end
 reshape_heads(x, num_heads) = reshape(x, size(x, 1) ÷ num_heads, num_heads, size(x)[2:end]...)
 flatten_heads(x) = reshape(x, :, size(x)[3:end]...)
 
-function dot_product_attention(num_heads::Int, q::A3, k::A3, v::A3; kws...)
+function dot_product_attention_tullio(num_heads::Int, q::A3, k::A3, v::A3; kws...)
     q, k, v = reshape_heads.((q, k, v), num_heads)
-    x, α = dot_product_attention(q, k, v; kws...)
+    x, α = dot_product_attention_tullio(q, k, v; kws...)
+    return flatten_heads(x), α
+end
+
+function dot_product_attention_native(num_heads::Int, q::A3, k::A3, v::A3; kws...)
+    q, k, v = reshape_heads.((q, k, v), num_heads)
+    x, α = dot_product_attention_native(q, k, v; kws...)
     return flatten_heads(x), α
 end
 
 # Inspired by https://flax.readthedocs.io/en/latest/api_reference/_autosummary/flax.linen.dot_product_attention.html
-function dot_product_attention(q::A4, k::A4, v::A4;
+function dot_product_attention_tullio(q::A4, k::A4, v::A4;
         dropout=nothing, bias=nothing, mask=nothing)
 
-    α = dot_product_attention_weights(q, k; dropout, bias, mask)
+    α = dot_product_attention_weights_tullio(q, k; dropout, bias, mask)
     # [α] = [kv_len, q_len, num_heads, batch_size]
     @tullio x[d, h, i, b] := α[j, i, h, b] * v[d, h, j, b]
     # [x] = [kv_dim ÷ num_heads, num_heads, q_len, batch_size]
     return x, α
 end
 
-function dot_product_attention_weights(q::A4{T}, k::A4{T};
+function dot_product_attention_weights_tullio(q::A4{T}, k::A4{T};
         dropout=nothing, mask=nothing, bias=nothing) where T
 
     q = q ./ √T(size(q, 1))
@@ -162,6 +170,49 @@ function dot_product_attention_weights(q::A4{T}, k::A4{T};
     return dropout === nothing ? α : dropout(α)
 end
 
+function NNlib.batched_mul(x::AbstractArray{T1,N}, y::AbstractArray{T2,N}) where {T1,T2,N}
+    sz = size(x)[3:end]
+    @assert sz == size(y)[3:end]
+    x2 = reshape(x, size(x, 1), size(x, 2), :)
+    y2 = reshape(y, size(y, 1), size(y, 2), :)
+    z = NNlib.batched_mul(x2, y2)
+    return reshape(z, size(z, 1), size(z, 2), sz...)
+end
+
+function dot_product_attention_native(q::A4, k::A4, v::A4;
+        dropout=nothing, bias=nothing, mask=nothing)
+
+    α = dot_product_attention_weights_native(q, k; dropout, bias, mask)
+    # [α] = [kv_len, q_len, num_heads, batch_size]
+
+    vt = permutedims(v, (1, 3, 2, 4))
+    x = NNlib.batched_mul(vt, α)
+    x = permutedims(x, (1, 3, 2, 4))
+    # [x] = [kv_dim ÷ num_heads, num_heads, q_len, batch_size]
+    return x, α
+end
+
+function dot_product_attention_weights_native(q::A4{T}, k::A4{T};
+        dropout=nothing, mask=nothing, bias=nothing) where T
+
+    q = q ./ √T(size(q, 1))
+    kt = permutedims(k, (3, 1, 2, 4))
+    qt = permutedims(q, (1, 3, 2, 4))
+    α = NNlib.batched_mul(kt, qt)
+    # [α] = [kv_len, q_len, num_heads, batch_size]
+
+    if bias !== nothing
+        α = α .+ bias
+    end
+    if mask !== nothing
+        neginf = typemin(eltype(α))
+        α = ifelse.(mask, α, neginf)
+    end
+
+    α = softmax(α, dims=1)
+    return dropout === nothing ? α : dropout(α)
+end
+
 
 struct QKVProj
     q_proj::Dense
@@ -206,6 +257,10 @@ function perf(dim, len, batch_size, num_heads)
     println("nalib")
     @btime $mha($x, $x, $x, impl=:nalib);
     @btime gradient(m -> sum(m($x, impl=:nalib)), $mha);
+
+    println("native")
+    @btime $mha($x, $x, $x, impl=:native);
+    @btime gradient(m -> sum(m($x, impl=:native)), $mha);
 
     if CUDA.functional()
         mha_gpu = mha |> gpu
@@ -218,6 +273,10 @@ function perf(dim, len, batch_size, num_heads)
         println("nalib - gpu")
         @btime CUDA.@sync $mha_gpu($x_gpu, impl=:nalib);
         @btime CUDA.@sync gradient(m -> sum(m($x_gpu, impl=:nalib)), $mha_gpu);
+
+        println("native - gpu")
+        @btime CUDA.@sync $mha_gpu($x_gpu, impl=:native);
+        @btime CUDA.@sync gradient(m -> sum(m($x_gpu, impl=:native)), $mha_gpu);
     end
     return nothing
 end
@@ -240,6 +299,12 @@ function test(dim, num_heads, len, batch_size)
     @test size(α) == size(α2)
     @test α2 ≈ α
 
+    y2b, α2b = mha(q, k, v, impl=:native, with_weights=true)
+    @test size(y) == size(y2b)
+    @test y2b ≈ y
+    @test size(α) == size(α2b)
+    @test α2b ≈ α
+
     mask = make_causal_mask(q)
     y3, α3 = mha(q, k, v; impl=:tullio, with_weights=true, mask)
     y4, α4 = mha(q, k, v, impl=:nalib, with_weights=true, mask=NeuralAttentionlib.CausalMask())
@@ -273,16 +338,22 @@ perf(128, 8, 128, 32)
 # nalib - 6 threads
 # 7.832 ms (187 allocations: 7.76 MiB)
 # 29.823 ms (988 allocations: 16.19 MiB)
+# native
+# 6.269 ms (90 allocations: 9.25 MiB)
+# 15.492 ms (1250 allocations: 22.19 MiB)
 # tullio - gpu
 # 147.746 μs (523 allocations: 24.59 KiB)
 # 957.111 μs (2413 allocations: 127.88 KiB)
 # nalib - gpu
 # 165.109 μs (411 allocations: 18.05 KiB)
 # 659.685 μs (1527 allocations: 86.09 KiB)
-
-dim = 2; len = 3; batch_size = 1; num_heads = 1
-mha = MultiHeadAttention(dim, num_heads)
-x = rand(Float32, (dim, len, batch_size))
-mask = make_causal_mask(x)
-y, α = mha(x; impl=:tullio, with_weights=true, mask)
-y2, α2 = mha(x; impl=:nalib, with_weights=true, mask=NeuralAttentionlib.CausalMask())
+# native - gpu
+# 158.396 μs (443 allocations: 20.06 KiB)
+# 920.633 μs (2308 allocations: 118.78 KiB)
+
+# dim = 2; len = 3; batch_size = 1; num_heads = 1
+# mha = MultiHeadAttention(dim, num_heads)
+# x = rand(Float32, (dim, len, batch_size))
+# mask = make_causal_mask(x)
+# y, α = mha(x; impl=:tullio, with_weights=true, mask)
+# y2, α2 = mha(x; impl=:nalib, with_weights=true, mask=NeuralAttentionlib.CausalMask())
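
For reference, a minimal usage sketch of the new :native path, mirroring the checks in test above. It assumes the MultiHeadAttention constructor and keyword interface shown in these hunks and that the file's using preamble (outside this diff) is loaded; the sizes are illustrative, not taken from the patch.

    dim, num_heads, len, batch_size = 64, 4, 10, 32        # illustrative sizes
    mha = MultiHeadAttention(dim, num_heads)
    x = rand(Float32, (dim, len, batch_size))
    y1, α1 = mha(x, x, x; impl=:tullio, with_weights=true)  # Tullio einsum implementation
    y2, α2 = mha(x, x, x; impl=:native, with_weights=true)  # permutedims + batched_mul implementation
    y1 ≈ y2 && α1 ≈ α2                                      # both paths should agree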