@@ -1,9 +1,9 @@
- using Flux, Test, LinearAlgebra, Random, Statistics
- using CUDA, CUDAKernels, LoopVectorization
+ using Flux, Functors, Test, LinearAlgebra, Random, Statistics
+ using CUDA, CUDAKernels, KernelAbstractions, LoopVectorization
using Tullio
using NeuralAttentionlib
using BenchmarkTools
-
+ CUDA.allowscalar(false)
const A3{T} = AbstractArray{T,3}

"""
@@ -144,11 +144,6 @@ function perf(dim, len, batch_size, num_heads)
mha = MultiHeadAttention(dim, num_heads)
x = rand(Float32, (dim, len, batch_size))

- y = mha(x, x, x)
- @test y isa Array{Float32, 3}
- @test size(y) == (dim, len, batch_size)
-
-
println("tullio")
@btime $mha($x, v=:tullio);
@btime gradient(m -> sum(m($x, v=:tullio)), $mha);
@@ -172,4 +167,29 @@ function perf(dim, len, batch_size, num_heads)
return nothing
end

- perf(64, 100, 32, 8)
+ function test(dim, len, batch_size, num_heads)
+     mha = MultiHeadAttention(dim, num_heads)
+     x = rand(Float32, (dim, len, batch_size))
+     y = mha(x, v=:tullio)
+     @test y isa Array{Float32, 3}
+     @test size(y) == (dim, len, batch_size)
+     y2 = mha(x, v=:nnalib)
+     @test size(y) == size(y2)
+     @test y2 ≈ y
+
+     if CUDA.functional()
+         mha_gpu = mha |> gpu
+         x_gpu = x |> gpu
+
+         y_gpu = mha_gpu(x_gpu, v=:tullio)
+         y_gpu2 = mha_gpu(x_gpu, v=:nnalib)
+         @test Array(y_gpu) ≈ Array(y_gpu2)
+         @test Array(y_gpu) ≈ y
+     end
+     return nothing
+ end
+
+
+ test(12, 3, 2, 4)
+
+ perf(64, 100, 32, 4)
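
For context, a minimal sketch of the dispatch pattern this diff exercises: a callable layer whose forward pass picks a backend via the `v` keyword (`:tullio` vs `:nnalib`) and whose two paths can be checked against each other, the way the new `test` function compares the real backends. `TinyAttention` and its two mock paths are hypothetical stand-ins, not code from this commit.

using Test

# Hypothetical sketch (not part of the commit): a layer that selects an
# implementation via a `v` keyword, mocked with two equivalent products.
struct TinyAttention
    proj::Matrix{Float32}
end

TinyAttention(dim::Int) = TinyAttention(randn(Float32, dim, dim))

function (m::TinyAttention)(x::AbstractArray{Float32,3}; v::Symbol = :tullio)
    dim, len, batch = size(x)
    xr = reshape(x, dim, len * batch)   # flatten length and batch dims
    y = if v === :tullio
        m.proj * xr                     # stand-in for the Tullio kernel path
    elseif v === :nnalib
        # stand-in for the NeuralAttentionlib path: same product, other order
        permutedims(permutedims(xr) * permutedims(m.proj))
    else
        error("unknown backend: $v")
    end
    return reshape(y, dim, len, batch)
end

x = rand(Float32, 4, 3, 2)
m = TinyAttention(4)
@test m(x, v = :tullio) ≈ m(x, v = :nnalib)   # both paths agree, as in test()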