From 2234d4ffb8009d9ed2369350e8433afde091fff0 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Tue, 12 May 2020 19:13:41 +0200
Subject: [PATCH 01/34] Solved tests for Matern

---
 src/KernelFunctions.jl    |  4 ++--
 src/basekernels/matern.jl | 11 +++++------
 src/utils.jl              |  3 ++-
 src/zygote_adjoints.jl    | 14 ++++++++++++++
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/KernelFunctions.jl b/src/KernelFunctions.jl
index f89db37ae..5ffa58654 100644
--- a/src/KernelFunctions.jl
+++ b/src/KernelFunctions.jl
@@ -34,8 +34,8 @@ export spectral_mixture_kernel, spectral_mixture_product_kernel
 using Compat
 using Requires
 using Distances, LinearAlgebra
-using SpecialFunctions: logabsgamma, besselk
-using ZygoteRules: @adjoint
+using SpecialFunctions: logabsgamma, besselk, polygamma
+using ZygoteRules: @adjoint, pullback
 using StatsFuns: logtwo
 using InteractiveUtils: subtypes
 using StatsBase
diff --git a/src/basekernels/matern.jl b/src/basekernels/matern.jl
index 2adda86ae..44b5eb989 100644
--- a/src/basekernels/matern.jl
+++ b/src/basekernels/matern.jl
@@ -17,12 +17,11 @@ end
 
 @inline function kappa(κ::MaternKernel, d::Real)
     ν = first(κ.ν)
-    iszero(d) ? one(d) :
-    exp(
-        (one(d) - ν) * logtwo - logabsgamma(ν)[1] +
-        ν * log(sqrt(2ν) * d) +
-        log(besselk(ν, sqrt(2ν) * d))
-    )
+    iszero(d) ? one(d) : _matern(ν, d)
+end
+
+function _matern(ν::Real, d::Real)
+    exp((one(d) - ν) * logtwo - loggamma(ν) + ν * log(sqrt(2ν) * d) + log(besselk(ν, sqrt(2ν) * d)))
 end
 
 metric(::MaternKernel) = Euclidean()
diff --git a/src/utils.jl b/src/utils.jl
index ab738c165..ed11f2428 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,5 +1,7 @@
 hadamard(x, y) = x .* y
 
+loggamma(x) = first(logabsgamma(x))
+
 # Macro for checking arguments
 macro check_args(K, param, cond, desc=string(cond))
     quote
@@ -124,4 +126,3 @@ function validate_dims(x::AbstractVector, y::AbstractVector)
         ))
     end
 end
-
diff --git a/src/zygote_adjoints.jl b/src/zygote_adjoints.jl
index 7c6311477..dc5198579 100644
--- a/src/zygote_adjoints.jl
+++ b/src/zygote_adjoints.jl
@@ -4,6 +4,20 @@
   end
 end
 
+@adjoint function loggamma(x)
+    first(logabsgamma(x)) , Δ -> (Δ .* polygamma(0, x), )
+end
+
+@adjoint function kappa(κ::MaternKernel, d::Real)
+    ν = first(κ.ν)
+    val, grad = pullback(_matern, ν, d)
+    return ((iszero(d) ? one(d) : val),
+    Δ -> begin
+        ∇ = grad(Δ)
+        return ((ν = [∇[1]],), iszero(d) ? zero(d) : ∇[2])
+    end)
+end
+
 @adjoint function ColVecs(X::AbstractMatrix)
     back(Δ::NamedTuple) = (Δ.X,)
     back(Δ::AbstractMatrix) = (Δ,)

From a6159e10123197a9ad73d733e3398fb43ae6f460 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Tue, 12 May 2020 19:13:50 +0200
Subject: [PATCH 02/34] Solved tests for dotproduct

---
 src/distances/dotproduct.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/distances/dotproduct.jl b/src/distances/dotproduct.jl
index 7d75266db..79ffff4dd 100644
--- a/src/distances/dotproduct.jl
+++ b/src/distances/dotproduct.jl
@@ -1,7 +1,7 @@
 struct DotProduct <: Distances.PreMetric end
 # struct DotProduct <: Distances.UnionSemiMetric end
 
-@inline function Distances._evaluate(::DotProduct, a::AbstractVector{T}, b::AbstractVector{T}) where {T}
+@inline function Distances._evaluate(::DotProduct, a::AbstractVector, b::AbstractVector)
     @boundscheck if length(a) != length(b)
         throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b))."))
     end

From 4aeb0e3e2c2c2cee90b7512112525d586a09e9ef Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Tue, 12 May 2020 19:13:58 +0200
Subject: [PATCH 03/34] First draft of AD tests

---
 test/test_AD.jl  | 168 +++++++++++++++++++++--------------------------
 test/utils_AD.jl |  75 ++++++++++++++-------
 2 files changed, 128 insertions(+), 115 deletions(-)

diff --git a/test/test_AD.jl b/test/test_AD.jl
index 9ee6e8566..43135cf42 100644
--- a/test/test_AD.jl
+++ b/test/test_AD.jl
@@ -1,119 +1,101 @@
 using KernelFunctions
-using Zygote, ForwardDiff
-using Test, LinearAlgebra
+using KernelFunctions: kappa
+using Flux: params
+import Zygote, ForwardDiff, ReverseDiff
+using Test, LinearAlgebra, Random
 using FiniteDifferences
 
-dims = [10,5]
+include("utils_AD.jl")
+
+dims = [3, 3]
+ν = 3.0
+
+rng = MersenneTwister(42)
+
+A = rand(rng, dims...)
+B = rand(rng, dims...)
+K = [zeros(dims[1], dims[1]), zeros(dims[2], dims[2])]
+
+x = rand(rng, dims[1])
+y = rand(rng, dims[1])
+
+l = rand(rng)
+vl = l * ones(dims[1])
+
+kernels = [
+    SqExponentialKernel(),
+    ExponentialKernel(),
+    MaternKernel(ν = ν),
+    # transform(SqExponentialKernel(), l),
+    # transform(SqExponentialKernel(), vl),
+    # ExponentiatedKernel() + LinearKernel(),
+    # 2.0 * PolynomialKernel() * Matern32Kernel(),
+]
+
+ds = log.([eps(), rand(rng)])
+
+testfunction(k, A, B, dim) = det(kernelmatrix(k, A, B, obsdim = dim))
+testfunction(k, A, dim) = det(kernelmatrix(k, A, obsdim = dim))
+ADs = [:Zygote, :ForwardDiff, :ReverseDiff]
 
-A = rand(dims...)
-B = rand(dims...)
-K = [zeros(dims[1],dims[1]),zeros(dims[2],dims[2])]
-kernels_noparams = [:SqExponentialKernel,:ExponentialKernel,:GammaExponentialKernel,
- :MaternKernel,:Matern32Kernel,:Matern52Kernel,
- :LinearKernel,:PolynomialKernel,
- :RationalQuadraticKernel,:GammaRationalQuadraticKernel,
- :ExponentiatedKernel]
-l = 2.0
-ds = [0.0,3.0]
-vl = l*ones(dims[1])
-testfunction(k,A,B) = det(kernelmatrix(k,A,B))
-testfunction(k,A) = det(kernelmatrix(k,A))
-ADs = [:Zygote,:ForwardDiff]
 
 ## Test kappa functions
+
 @testset "Kappa functions" begin
-    for AD in ADs
-        @testset "$AD" begin
-            for k in kernels_noparams
-                for d in ds
-                    @eval begin @test kappa_AD(Val(Symbol($AD)),$k(),$d) ≈ kappa_fdm($k(),$d) atol=1e-8 end
+    for k in kernels[isa.(kernels, KernelFunctions.SimpleKernel)]
+        @testset "$k" begin
+            @test_nowarn gradient(Val(:FiniteDiff), x -> kappa(k, exp(x[1])), ds[1]) # Check FiniteDiff does the right thing
+            for AD in ADs
+                @testset "$AD" begin
+                    for d in ds
+                        @test_nowarn gradient(Val(AD), x -> kappa(k, exp(x[1])), [d])
+                        @test gradient(Val(AD), x -> kappa(k, exp(x[1])), [d]) ≈ gradient(Val(:FiniteDiff), x -> kappa(k, exp(x[1])), [d]) atol=1e-8
+                    end
                 end
             end
-            # Linear -> C
-            # Polynomial -> C,D
-            # Gamma (etc) -> gamma
-            #
         end
     end
 end
 
-@testset "Transform Operations" begin
-    for AD in ADs
-        @testset "$AD" begin
-            @eval begin
-            # Scale Transform
-            transform_AD(Val(Symbol($AD)),ScaleTransform(l),A)
-            # ARD Transform
-            transform_AD(Val(Symbol($AD)),ARDTransform(vl),A)
-            # Linear transform
-            transform_AD(Val(Symbol($AD)), LinearTransform(rand(2,10)),A)
-            # Chain Transform
-            # transform_AD(Val(Symbol($AD)), LinearTransform, A)
+@testset "Kernel evaluations" begin
+    for k in kernels
+        @testset "$k" begin
+            for AD in ADs
+                @test_nowarn gradient(Val(:FiniteDiff), x -> k(x, y), x)
+                @testset "$AD" begin
+                    for d in ds
+                        @test_nowarn gradient(Val(AD), x -> k(x, y), x)
+                        @test gradient(Val(AD), x -> k(x, y), x) ≈ gradient(Val(:FiniteDiff), x -> k(x, y), x) atol=1e-8
+                    end
+                end
             end
         end
     end
 end
 
-##TODO Eventually store real results in file
-@testset "Zygote Automatic Differentiation test" begin
-    @testset "ARD" begin
-        for k in kernels
-            @testset "$k" begin
-                @test all(isapprox.(Zygote.gradient(x->testfunction(k(x),A,B),vl)[1], ForwardDiff.gradient(x->testfunction(k(x),A,B),vl)))
-                @test  all(isapprox.(Zygote.gradient(x->testfunction(k(vl),x,B),A)[1],ForwardDiff.gradient(x->testfunction(k(vl),x,B),A)))
-                @test all(isapprox.(Zygote.gradient(x->testfunction(k(x),A),vl)[1],ForwardDiff.gradient(x->testfunction(k(x),A),vl)))
-                @test all(isapprox.(Zygote.gradient(x->testfunction(k(vl),x),A)[1],ForwardDiff.gradient(x->testfunction(k(vl),x),A)))
-            end
-        end
-    end
-    @testset "ISO" begin
-        for k in kernels
-            @testset "$k" begin
-                @test all(isapprox.(Zygote.gradient(x->testfunction(k(x),A,B),l)[1],ForwardDiff.gradient(x->testfunction(k(x[1]),A,B),[l])[1]))
-                @test all(isapprox.(Zygote.gradient(x->testfunction(k(l),x,B),A)[1],ForwardDiff.gradient(x->testfunction(k(l),x,B),A)))
-                @test all(isapprox.(Zygote.gradient(x->testfunction(k(x),A),l)[1],ForwardDiff.gradient(x->testfunction(k(x[1]),A),[l])))
-                @test all(isapprox.(Zygote.gradient(x->testfunction(k(l),x),A)[1],ForwardDiff.gradient(x->testfunction(k(l[1]),x),A)))
+@testset "Kernel Matrices" begin
+    for k in kernels
+        @testset "$k" begin
+            for AD in ADs
+                # @test_nowarn gradient(Val(:FiniteDiff), x -> k(x, y), )
+                @testset "$AD" begin
+                    for dim in [1,2]
+                        @test_nowarn gradient(Val(AD), x -> testfunction(k, x, dim), A)
+                        @test_nowarn gradient(Val(AD), x -> testfunction(k, x, B, dim), A)
+                        @test gradient(Val(AD), x -> testfunction(k, x, B, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, B, dim), A) atol=1e-8
+                        @test gradient(Val(AD), x -> testfunction(k, x, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, dim), A) atol=1e-8
+                    end
+                end
             end
         end
     end
 end
 
-@testset "ForwardDiff AutomaticDifferentation test" begin
-    @testset "ARD" begin
-        for k in kernels
-            @test_nowarn ForwardDiff.gradient(x->testfunction(k(x),A,B),vl)
-            @test_nowarn ForwardDiff.gradient(x->testfunction(k(vl),x,B),A)
-            @test_nowarn ForwardDiff.gradient(x->testfunction(k(x),A),vl)
-            @test_nowarn ForwardDiff.gradient(x->testfunction(k(vl),x),A)
-        end
-    end
-    @testset "ISO" begin
-        for k in kernels
-            @test_nowarn ForwardDiff.gradient(x->testfunction(k(x[1]),A,B),[l])
-            @test_nowarn ForwardDiff.gradient(x->testfunction(k(l),x,B),A)
-            @test_nowarn ForwardDiff.gradient(x->testfunction(k(x[1]),A),[l])
-            @test_nowarn ForwardDiff.gradient(x->testfunction(k(l[1]),x),A)
-        end
-    end
-end
-
-
-@testset "Tracker AutomaticDifferentation test" begin
-    @testset "ARD" begin
-        for k in kernels
-            @test_broken all(Tracker.gradient(x->testfunction(k(x),A,B),vl)[1] .≈ ForwardDiff.gradient(x->testfunction(k(x),A,B),vl))
-            @test_broken all(Tracker.gradient(x->testfunction(k(vl),x,B),A)[1] .≈ ForwardDiff.gradient(x->testfunction(k(vl),x,B),A))
-            @test_broken all(Tracker.gradient(x->testfunction(k(x),A),vl)[1] .≈  ForwardDiff.gradient(x->testfunction(k(x),A),vl))
-            @test_broken all.(Tracker.gradient(x->testfunction(k(vl),x),A) .≈ ForwardDiff.gradient(x->testfunction(k(vl),x),A))
-        end
-    end
-    @testset "ISO" begin
-        for k in kernels
-            @test_broken Tracker.gradient(x->testfunction(k(x[1]),A,B),[l])
-            @test_broken Tracker.gradient(x->testfunction(k(l),x,B),A)
-            @test_broken Tracker.gradient(x->testfunction(k(x[1]),A),[l])
-            @test_broken Tracker.gradient(x->testfunction(k(l),x),A)
-
+@testset "Params differentiation" begin
+    for k in kernels
+        @testset "$k" begin
+            ps = params(k)
+            @test_nowarn gradient(Val(:Zygote), () -> k(x, y), ps)
         end
     end
 end
diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 77647e6d1..2baeb4676 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -1,39 +1,70 @@
-allapprox(x,y,tol=1e-8) = all(isapprox.(x,y,atol=tol))
-FDM = central_fdm(5,1)
+allapprox(x, y, tol = 1e-8) = all(isapprox.(x, y, atol = tol))
+FDM = central_fdm(5, 1)
 
+function gradient(::Val{:Zygote}, f::Function, args)
+    first(Zygote.gradient(f, args))
+end
+
+function gradient(::Val{:Zygote}, f::Function, args::Zygote.Params)
+    Zygote.gradient(f, args)
+end
 
-function kappa_AD(::Val{:Zygote},k::Kernel,d::Real)
-    first(Zygote.gradient(x->kappa(k,x),d))
+function gradient(::Val{:ForwardDiff}, f::Function, args)
+    ForwardDiff.gradient(f, args)
 end
 
-function kappa_AD(::Val{:ForwardDiff},k::Kernel,d::Real)
-    first(ForwardDiff.gradient(x->kappa(k,first(x)),[d]))
+function gradient(::Val{:ReverseDiff}, f::Function, args)
+    ReverseDiff.gradient(f, args)
 end
 
-function kappa_fdm(k::Kernel,d::Real)
-    first(FiniteDifferences.grad(FDM,x->kappa(k,x),d))
+function gradient(::Val{:FiniteDiff}, f::Function, args)
+    first(FiniteDifferences.grad(FDM, f, args))
 end
 
 
-function transform_AD(::Val{:Zygote},t::Transform,A)
+
+function transform_AD(::Val{:Zygote}, t::Transform, A)
     ps = KernelFunctions.params(t)
-    @test allapprox(first(Zygote.gradient(p->transform_with_duplicate(p,t,A),ps)),
-        first(FiniteDifferences.grad(FDM,p->transform_with_duplicate(p,t,A),ps)))
-    @test allapprox(first(Zygote.gradient(X->sum(transform(t,X,2)),A)),
-            first(FiniteDifferences.grad(FDM,X->sum(transform(t,X,2)),A)))
+    @test allapprox(
+        first(Zygote.gradient(p -> transform_with_duplicate(p, t, A), ps)),
+        first(FiniteDifferences.grad(
+            FDM,
+            p -> transform_with_duplicate(p, t, A),
+            ps,
+        )),
+    )
+    @test allapprox(
+        first(Zygote.gradient(X -> sum(transform(t, X, 2)), A)),
+        first(FiniteDifferences.grad(FDM, X -> sum(transform(t, X, 2)), A)),
+    )
 end
 
-function transform_AD(::Val{:ForwardDiff},t::Transform,A)
+function transform_AD(::Val{:ForwardDiff}, t::Transform, A)
     ps = KernelFunctions.params(t)
     if t isa ScaleTransform
-        @test allapprox(first(ForwardDiff.gradient(p->transform_with_duplicate(first(p),t,A),[ps])),
-            first(FiniteDifferences.grad(FDM,p->transform_with_duplicate(p,t,A),ps)))
+        @test allapprox(
+            first(ForwardDiff.gradient(
+                p -> transform_with_duplicate(first(p), t, A),
+                [ps],
+            )),
+            first(FiniteDifferences.grad(
+                FDM,
+                p -> transform_with_duplicate(p, t, A),
+                ps,
+            )),
+        )
     else
-        @test allapprox(ForwardDiff.gradient(p->transform_with_duplicate(p,t,A),ps),
-            first(FiniteDifferences.grad(FDM,p->transform_with_duplicate(p,t,A),ps)))
+        @test allapprox(
+            ForwardDiff.gradient(p -> transform_with_duplicate(p, t, A), ps),
+            first(FiniteDifferences.grad(
+                FDM,
+                p -> transform_with_duplicate(p, t, A),
+                ps,
+            )),
+        )
     end
-    @test allapprox(ForwardDiff.gradient(X->sum(transform(t,X,2)),A),
-            first(FiniteDifferences.grad(FDM,X->sum(transform(t,X,2)),A)))
+    @test allapprox(
+        ForwardDiff.gradient(X -> sum(transform(t, X, 2)), A),
+        first(FiniteDifferences.grad(FDM, X -> sum(transform(t, X, 2)), A)),
+    )
 end
-
-transform_with_duplicate(p,t,A) = sum(transform(KernelFunctions.duplicate(t,p),A,2))

From b6a7901c0e53edc60687d53da6b9d1325ec07103 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Tue, 12 May 2020 19:21:40 +0200
Subject: [PATCH 04/34] Removing unnecessary functions and uncommented all
 cases

---
 test/test_AD.jl  | 11 ++++++-----
 test/utils_AD.jl | 49 ------------------------------------------------
 2 files changed, 6 insertions(+), 54 deletions(-)

diff --git a/test/test_AD.jl b/test/test_AD.jl
index 43135cf42..bb6f537b6 100644
--- a/test/test_AD.jl
+++ b/test/test_AD.jl
@@ -1,7 +1,8 @@
 using KernelFunctions
-using KernelFunctions: kappa
+using KernelFunctions: kappa, ColVecs, RowVecs
 using Flux: params
 import Zygote, ForwardDiff, ReverseDiff
+using Zygote: pullback
 using Test, LinearAlgebra, Random
 using FiniteDifferences
 
@@ -26,10 +27,10 @@ kernels = [
     SqExponentialKernel(),
     ExponentialKernel(),
     MaternKernel(ν = ν),
-    # transform(SqExponentialKernel(), l),
-    # transform(SqExponentialKernel(), vl),
-    # ExponentiatedKernel() + LinearKernel(),
-    # 2.0 * PolynomialKernel() * Matern32Kernel(),
+    transform(SqExponentialKernel(), l),
+    transform(SqExponentialKernel(), vl),
+    ExponentiatedKernel() + LinearKernel(),
+    2.0 * PolynomialKernel() * Matern32Kernel(),
 ]
 
 ds = log.([eps(), rand(rng)])
diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 2baeb4676..47309db3f 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -1,4 +1,3 @@
-allapprox(x, y, tol = 1e-8) = all(isapprox.(x, y, atol = tol))
 FDM = central_fdm(5, 1)
 
 function gradient(::Val{:Zygote}, f::Function, args)
@@ -20,51 +19,3 @@ end
 function gradient(::Val{:FiniteDiff}, f::Function, args)
     first(FiniteDifferences.grad(FDM, f, args))
 end
-
-
-
-function transform_AD(::Val{:Zygote}, t::Transform, A)
-    ps = KernelFunctions.params(t)
-    @test allapprox(
-        first(Zygote.gradient(p -> transform_with_duplicate(p, t, A), ps)),
-        first(FiniteDifferences.grad(
-            FDM,
-            p -> transform_with_duplicate(p, t, A),
-            ps,
-        )),
-    )
-    @test allapprox(
-        first(Zygote.gradient(X -> sum(transform(t, X, 2)), A)),
-        first(FiniteDifferences.grad(FDM, X -> sum(transform(t, X, 2)), A)),
-    )
-end
-
-function transform_AD(::Val{:ForwardDiff}, t::Transform, A)
-    ps = KernelFunctions.params(t)
-    if t isa ScaleTransform
-        @test allapprox(
-            first(ForwardDiff.gradient(
-                p -> transform_with_duplicate(first(p), t, A),
-                [ps],
-            )),
-            first(FiniteDifferences.grad(
-                FDM,
-                p -> transform_with_duplicate(p, t, A),
-                ps,
-            )),
-        )
-    else
-        @test allapprox(
-            ForwardDiff.gradient(p -> transform_with_duplicate(p, t, A), ps),
-            first(FiniteDifferences.grad(
-                FDM,
-                p -> transform_with_duplicate(p, t, A),
-                ps,
-            )),
-        )
-    end
-    @test allapprox(
-        ForwardDiff.gradient(X -> sum(transform(t, X, 2)), A),
-        first(FiniteDifferences.grad(FDM, X -> sum(transform(t, X, 2)), A)),
-    )
-end

From f70adc129e2595938c3320309dd793fc9167a0ae Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Thu, 14 May 2020 17:51:09 +0200
Subject: [PATCH 05/34] Created two function for testing any kernel, any AD and
 compare with FiniteDifferences.jl

---
 test/test_AD.jl  | 103 +++++------------------------------------------
 test/utils_AD.jl |  79 +++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 94 deletions(-)

diff --git a/test/test_AD.jl b/test/test_AD.jl
index bb6f537b6..356578892 100644
--- a/test/test_AD.jl
+++ b/test/test_AD.jl
@@ -1,102 +1,19 @@
 using KernelFunctions
 using KernelFunctions: kappa, ColVecs, RowVecs
-using Flux: params
-import Zygote, ForwardDiff, ReverseDiff
-using Zygote: pullback
+import Zygote, ForwardDiff, ReverseDiff, FiniteDifferences
 using Test, LinearAlgebra, Random
-using FiniteDifferences
 
 include("utils_AD.jl")
-
-dims = [3, 3]
-ν = 3.0
-
-rng = MersenneTwister(42)
-
-A = rand(rng, dims...)
-B = rand(rng, dims...)
-K = [zeros(dims[1], dims[1]), zeros(dims[2], dims[2])]
-
-x = rand(rng, dims[1])
-y = rand(rng, dims[1])
-
-l = rand(rng)
-vl = l * ones(dims[1])
-
-kernels = [
-    SqExponentialKernel(),
-    ExponentialKernel(),
-    MaternKernel(ν = ν),
-    transform(SqExponentialKernel(), l),
-    transform(SqExponentialKernel(), vl),
-    ExponentiatedKernel() + LinearKernel(),
-    2.0 * PolynomialKernel() * Matern32Kernel(),
-]
-
-ds = log.([eps(), rand(rng)])
-
-testfunction(k, A, B, dim) = det(kernelmatrix(k, A, B, obsdim = dim))
-testfunction(k, A, dim) = det(kernelmatrix(k, A, obsdim = dim))
 ADs = [:Zygote, :ForwardDiff, :ReverseDiff]
 
-
-## Test kappa functions
-
-@testset "Kappa functions" begin
-    for k in kernels[isa.(kernels, KernelFunctions.SimpleKernel)]
-        @testset "$k" begin
-            @test_nowarn gradient(Val(:FiniteDiff), x -> kappa(k, exp(x[1])), ds[1]) # Check FiniteDiff does the right thing
-            for AD in ADs
-                @testset "$AD" begin
-                    for d in ds
-                        @test_nowarn gradient(Val(AD), x -> kappa(k, exp(x[1])), [d])
-                        @test gradient(Val(AD), x -> kappa(k, exp(x[1])), [d]) ≈ gradient(Val(:FiniteDiff), x -> kappa(k, exp(x[1])), [d]) atol=1e-8
-                    end
-                end
-            end
-        end
-    end
-end
-
-@testset "Kernel evaluations" begin
-    for k in kernels
-        @testset "$k" begin
-            for AD in ADs
-                @test_nowarn gradient(Val(:FiniteDiff), x -> k(x, y), x)
-                @testset "$AD" begin
-                    for d in ds
-                        @test_nowarn gradient(Val(AD), x -> k(x, y), x)
-                        @test gradient(Val(AD), x -> k(x, y), x) ≈ gradient(Val(:FiniteDiff), x -> k(x, y), x) atol=1e-8
-                    end
-                end
-            end
-        end
-    end
-end
-
-@testset "Kernel Matrices" begin
-    for k in kernels
-        @testset "$k" begin
-            for AD in ADs
-                # @test_nowarn gradient(Val(:FiniteDiff), x -> k(x, y), )
-                @testset "$AD" begin
-                    for dim in [1,2]
-                        @test_nowarn gradient(Val(AD), x -> testfunction(k, x, dim), A)
-                        @test_nowarn gradient(Val(AD), x -> testfunction(k, x, B, dim), A)
-                        @test gradient(Val(AD), x -> testfunction(k, x, B, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, B, dim), A) atol=1e-8
-                        @test gradient(Val(AD), x -> testfunction(k, x, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, dim), A) atol=1e-8
-                    end
-                end
-            end
-        end
-    end
-end
-
-@testset "Params differentiation" begin
-    for k in kernels
-        @testset "$k" begin
-            ps = params(k)
-            @test_nowarn gradient(Val(:Zygote), () -> k(x, y), ps)
-        end
+kname = "SEKernel_lengthscale"
+kfunction = () -> SEKernel()
+kfunction = (l -> transform(SEKernel(), first(l)))
+# args = nothing
+args = [2.0]
+v = test_FiniteDiff(kname, kfunction, args)
+if !v.anynonpass
+    for AD in ADs
+        test_AD(AD, kname, kfunction, args)
     end
 end
diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 47309db3f..fa5c42ace 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -1,4 +1,4 @@
-FDM = central_fdm(5, 1)
+FDM = FiniteDifferences.central_fdm(5, 1)
 
 function gradient(::Val{:Zygote}, f::Function, args)
     first(Zygote.gradient(f, args))
@@ -19,3 +19,80 @@ end
 function gradient(::Val{:FiniteDiff}, f::Function, args)
     first(FiniteDifferences.grad(FDM, f, args))
 end
+
+
+testfunction(k, A, B, dim) = sum(kernelmatrix(k, A, B, obsdim = dim))
+testfunction(k, A, dim) = sum(kernelmatrix(k, A, obsdim = dim))
+
+function test_FiniteDiff(kernelname, kernelfunction, args = nothing)
+    # Init arguments :
+    k = if args === nothing
+        kernelfunction()
+    else
+        kernelfunction(args)
+    end
+    dims = [3, 3]
+    rng = MersenneTwister(42)
+    @testset "FiniteDifferences with $(kernelname)" begin
+        if k isa SimpleKernel
+            for d in log.([eps(), rand(rng)])
+                @test_nowarn gradient(Val(:FiniteDiff), x -> kappa(k, exp(first(x))), [d])
+            end
+        end
+        ## Testing Kernel Functions
+        x = rand(rng, dims[1])
+        y = rand(rng, dims[1])
+        @test_nowarn gradient(Val(:FiniteDiff), x -> k(x, y), x)
+        if !(args === nothing)
+            @test_nowarn gradient(Val(:FiniteDiff), p -> kernelfunction(p)(x, y), args)
+        end
+        ## Testing Kernel Matrices
+        A = rand(rng, dims...)
+        B = rand(rng, dims...)
+        for dim in 1:2
+            @test_nowarn gradient(Val(:FiniteDiff), a -> testfunction(k, a, dim), A)
+            @test_nowarn gradient(Val(:FiniteDiff), a -> testfunction(k, a, B, dim), A)
+            @test_nowarn gradient(Val(:FiniteDiff), b -> testfunction(k, A, b, dim), B)
+            if !(args === nothing)
+                @test_nowarn gradient(Val(:FiniteDiff), p -> testfunction(kernelfunction(p), A, B, dim), args)
+            end
+        end
+    end
+end
+
+function test_AD(AD, kernelname, kernelfunction, args = nothing)
+    @testset "Testing $(kernelname) with AD : $(AD)" begin
+        # Test kappa function
+        dims = [3, 3]
+        k = if args === nothing
+            kernelfunction()
+        else
+            kernelfunction(args)
+        end
+        rng = MersenneTwister(42)
+        if k isa SimpleKernel
+            for d in log.([eps(), rand(rng)])
+                @test gradient(Val(AD), x -> kappa(k, exp(x[1])), [d]) ≈ gradient(Val(:FiniteDiff), x -> kappa(k, exp(x[1])), [d]) atol=1e-8
+            end
+        end
+        # Testing kernel evaluations
+        x = rand(rng, dims[1])
+        y = rand(rng, dims[1])
+        @test gradient(Val(AD), x -> k(x, y), x) ≈ gradient(Val(:FiniteDiff), x -> k(x, y), x) atol=1e-8
+        @test gradient(Val(AD), y -> k(x, y), y) ≈ gradient(Val(:FiniteDiff), y -> k(x, y), y) atol=1e-8
+        if !(args === nothing)
+            @test gradient(Val(AD), p -> kernelfunction(p)(x,y), args) ≈ gradient(Val(:FiniteDiff), p -> kernelfunction(p)(x, y), args) atol=1e-8
+        end
+        # Testing kernel matrices
+        A = rand(rng, dims...)
+        B = rand(rng, dims...)
+        for dim in 1:2
+            @test gradient(Val(AD), x -> testfunction(k, x, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, dim), A) atol=1e-8
+            @test gradient(Val(AD), a -> testfunction(k, a, B, dim), A) ≈ gradient(Val(:FiniteDiff), a -> testfunction(k, a, B, dim), A) atol=1e-8
+            @test gradient(Val(AD), b -> testfunction(k, A, b, dim), B) ≈ gradient(Val(:FiniteDiff), b -> testfunction(k, A, b, dim), B) atol=1e-8
+            if !(args === nothing)
+                @test gradient(Val(AD), p -> testfunction(kernelfunction(p), A, dim), args) ≈ gradient(Val(:FiniteDiff), p -> testfunction(kernelfunction(p), A, dim), args) atol=1e-8
+            end
+        end
+    end
+end

From 2ae0cd6bb2b9f7cea560faa4142e465f0dbce618 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Thu, 14 May 2020 17:51:37 +0200
Subject: [PATCH 06/34] Needed type promotion for ForwardDiff.jl

---
 src/distances/dotproduct.jl | 2 ++
 src/distances/sinus.jl      | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/distances/dotproduct.jl b/src/distances/dotproduct.jl
index 79ffff4dd..880c494df 100644
--- a/src/distances/dotproduct.jl
+++ b/src/distances/dotproduct.jl
@@ -8,6 +8,8 @@ struct DotProduct <: Distances.PreMetric end
     return dot(a,b)
 end
 
+Distances.result_type(::DotProduct, Ta::Type, Tb::Type) = promote_type(Ta, Tb)
+
 @inline Distances.eval_op(::DotProduct, a::Real, b::Real) = a * b
 @inline (dist::DotProduct)(a::AbstractArray,b::AbstractArray) = Distances._evaluate(dist, a, b)
 @inline (dist::DotProduct)(a::Number,b::Number) = a * b
diff --git a/src/distances/sinus.jl b/src/distances/sinus.jl
index 7276e2e48..f4bdd6b97 100644
--- a/src/distances/sinus.jl
+++ b/src/distances/sinus.jl
@@ -8,7 +8,9 @@ Distances.parameters(d::Sinus) = d.r
 @inline (dist::Sinus)(a::AbstractArray, b::AbstractArray) = Distances._evaluate(dist, a, b)
 @inline (dist::Sinus)(a::Number, b::Number) = abs2(sinpi(a - b) / first(dist.r))
 
-@inline function Distances._evaluate(d::Sinus, a::AbstractVector{T}, b::AbstractVector{T}) where {T}
+Distances.result_type(::Sinus{T}, Ta::Type, Tb::Type) where {T} = promote_type(T, Ta, Tb)
+
+@inline function Distances._evaluate(d::Sinus, a::AbstractVector, b::AbstractVector) where {T}
     @boundscheck if (length(a) != length(b)) || length(a) != length(d.r)
         throw(DimensionMismatch("Dimensions of the inputs are not matching : a = $(length(a)), b = $(length(b)), r = $(length(d.r))"))
     end

From d88dcff1547fd52563c33c2ced0963198cab86fd Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Thu, 14 May 2020 17:52:05 +0200
Subject: [PATCH 07/34] Created indirection from Base.map to _map for creating
 adjoints

---
 src/transform/ardtransform.jl      | 4 ++--
 src/transform/functiontransform.jl | 4 ++--
 src/transform/lineartransform.jl   | 4 ++--
 src/transform/scaletransform.jl    | 4 ++--
 src/transform/selecttransform.jl   | 4 ++--
 src/transform/transform.jl         | 6 +-----
 6 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/src/transform/ardtransform.jl b/src/transform/ardtransform.jl
index d9bf019a9..85950f557 100644
--- a/src/transform/ardtransform.jl
+++ b/src/transform/ardtransform.jl
@@ -25,8 +25,8 @@ dim(t::ARDTransform) = length(t.v)
 (t::ARDTransform)(x) = t.v .* x
 
 Base.map(t::ARDTransform, x::AbstractVector{<:Real}) = t.v' .* x
-Base.map(t::ARDTransform, x::ColVecs) = ColVecs(t.v .* x.X)
-Base.map(t::ARDTransform, x::RowVecs) = RowVecs(t.v' .* x.X)
+_map(t::ARDTransform, x::ColVecs) = ColVecs(t.v .* x.X)
+_map(t::ARDTransform, x::RowVecs) = RowVecs(t.v' .* x.X)
 
 Base.isequal(t::ARDTransform, t2::ARDTransform) = isequal(t.v, t2.v)
 
diff --git a/src/transform/functiontransform.jl b/src/transform/functiontransform.jl
index 5c3729dc3..09085740d 100644
--- a/src/transform/functiontransform.jl
+++ b/src/transform/functiontransform.jl
@@ -16,8 +16,8 @@ end
 (t::FunctionTransform)(x) = t.f(x)
 
 Base.map(t::FunctionTransform, x::AbstractVector{<:Real}) = map(t.f, x)
-Base.map(t::FunctionTransform, x::ColVecs) = ColVecs(mapslices(t.f, x.X; dims=1))
-Base.map(t::FunctionTransform, x::RowVecs) = RowVecs(mapslices(t.f, x.X; dims=2))
+_map(t::FunctionTransform, x::ColVecs) = ColVecs(mapslices(t.f, x.X; dims=1))
+_map(t::FunctionTransform, x::RowVecs) = RowVecs(mapslices(t.f, x.X; dims=2))
 
 duplicate(t::FunctionTransform,f) = FunctionTransform(f)
 
diff --git a/src/transform/lineartransform.jl b/src/transform/lineartransform.jl
index 43224f90c..a86e6cabe 100644
--- a/src/transform/lineartransform.jl
+++ b/src/transform/lineartransform.jl
@@ -28,8 +28,8 @@ end
 (t::LinearTransform)(x::AbstractVector{<:Real}) = t.A * x
 
 Base.map(t::LinearTransform, x::AbstractVector{<:Real}) = ColVecs(t.A * x')
-Base.map(t::LinearTransform, x::ColVecs) = ColVecs(t.A * x.X)
-Base.map(t::LinearTransform, x::RowVecs) = RowVecs(x.X * t.A')
+_map(t::LinearTransform, x::ColVecs) = ColVecs(t.A * x.X)
+_map(t::LinearTransform, x::RowVecs) = RowVecs(x.X * t.A')
 
 function Base.show(io::IO, t::LinearTransform)
     print(io::IO, "Linear transform (size(A) = ", size(t.A), ")")
diff --git a/src/transform/scaletransform.jl b/src/transform/scaletransform.jl
index af09b27ef..dbe1794b2 100644
--- a/src/transform/scaletransform.jl
+++ b/src/transform/scaletransform.jl
@@ -20,8 +20,8 @@ set!(t::ScaleTransform,ρ::Real) = t.s .= [ρ]
 (t::ScaleTransform)(x) = first(t.s) .* x
 
 Base.map(t::ScaleTransform, x::AbstractVector{<:Real}) = first(t.s) .* x
-Base.map(t::ScaleTransform, x::ColVecs) = ColVecs(first(t.s) .* x.X)
-Base.map(t::ScaleTransform, x::RowVecs) = RowVecs(first(t.s) .* x.X)
+_map(t::ScaleTransform, x::ColVecs) = ColVecs(first(t.s) .* x.X)
+_map(t::ScaleTransform, x::RowVecs) = RowVecs(first(t.s) .* x.X)
 
 Base.isequal(t::ScaleTransform,t2::ScaleTransform) = isequal(first(t.s),first(t2.s))
 
diff --git a/src/transform/selecttransform.jl b/src/transform/selecttransform.jl
index 66631ff13..608e55b1d 100644
--- a/src/transform/selecttransform.jl
+++ b/src/transform/selecttransform.jl
@@ -25,7 +25,7 @@ duplicate(t::SelectTransform,θ) = t
 
 (t::SelectTransform)(x::AbstractVector) = view(x, t.select)
 
-Base.map(t::SelectTransform, x::ColVecs) = ColVecs(view(x.X, t.select, :))
-Base.map(t::SelectTransform, x::RowVecs) = RowVecs(view(x.X, :, t.select))
+_map(t::SelectTransform, x::ColVecs) = ColVecs(view(x.X, t.select, :))
+_map(t::SelectTransform, x::RowVecs) = RowVecs(view(x.X, :, t.select))
 
 Base.show(io::IO, t::SelectTransform) = print(io, "Select Transform (dims: ", t.select, ")")
diff --git a/src/transform/transform.jl b/src/transform/transform.jl
index 7d2bbe22c..2deec6060 100644
--- a/src/transform/transform.jl
+++ b/src/transform/transform.jl
@@ -5,12 +5,8 @@ include("functiontransform.jl")
 include("selecttransform.jl")
 include("chaintransform.jl")
 
-"""
-    apply(t::Transform, x; obsdim::Int=defaultobs)
 
-Apply the transform `t` vector-wise on the array `x`
-"""
-apply
+Base.map(t::Transform, x::Union{ColVecs, RowVecs}) = _map(t, x)
 
 """
     IdentityTransform()

From 6875aee5c52770ac69787639fc3115fe6a90f941 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Thu, 14 May 2020 17:52:27 +0200
Subject: [PATCH 08/34] Created full adjoints for DotProduct and evaluate for
 Sinus

---
 src/zygote_adjoints.jl | 52 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/src/zygote_adjoints.jl b/src/zygote_adjoints.jl
index dc5198579..f73b4bce1 100644
--- a/src/zygote_adjoints.jl
+++ b/src/zygote_adjoints.jl
@@ -4,6 +4,44 @@
   end
 end
 
+@adjoint function pairwise(d::DotProduct, X::AbstractMatrix, Y::AbstractMatrix; dims=2)
+  D = pairwise(d, X, Y; dims = dims)
+  if dims == 1
+      return D, Δ -> (nothing, Δ * Y, (X' * Δ)')
+  else
+      return D, Δ -> (nothing, (Δ * Y')', X * Δ)
+  end
+end
+
+@adjoint function pairwise(d::DotProduct, X::AbstractMatrix; dims=2)
+  D = pairwise(d, X; dims = dims)
+  if dims == 1
+      return D, Δ -> (nothing, 2 * Δ * X)
+  else
+      return D, Δ -> (nothing, 2 * X * Δ)
+  end
+end
+
+@adjoint function evaluate(s::Sinus, x::AbstractVector, y::AbstractVector)
+  d = (x - y)
+  sind = sinpi.(d)
+  val = sum(abs2, sind ./ s.r)
+  gradx = 2π .* cospi.(d) .* sind ./ (s.r .^ 2)
+  val, Δ -> begin
+    ((r = -2Δ .* abs2.(sind) ./ s.r,), Δ * gradx, - Δ * gradx)
+  end
+end
+
+@adjoint function pairwise(s::Sinus, X::AbstractMatrix, Y::AbstractMatrix; dims=2)
+    D = pairwise(d, X, Y; dims = dims)
+    throw(error("Sinus metric has no defined adjoint for now... PR welcome!"))
+end
+
+@adjoint function pairwise(s::Sinus, X::AbstractMatrix; dims=2)
+  D = pairwise(d, X; dims = dims)
+  throw(error("Sinus metric has no defined adjoint for now... PR welcome!"))
+end
+
 @adjoint function loggamma(x)
     first(logabsgamma(x)) , Δ -> (Δ .* polygamma(0, x), )
 end
@@ -36,10 +74,10 @@ end
     return RowVecs(X), back
 end
 
-# @adjoint function evaluate(s::Sinus, x::AbstractVector, y::AbstractVector)
-#   d = evaluate(s, x, y)
-#   s = sum(sin.(π*(x-y)))
-#   d, Δ -> begin
-#     (Sinus(Δ ./ s.r), 2Δ .* cos.(x - y) * d, -2Δ .* cos.(x - y) * d)
-#   end
-# end
+@adjoint function Base.map(t::Transform, X::ColVecs)
+    pullback(_map, t, X)
+end
+
+@adjoint function Base.map(t::Transform, X::RowVecs)
+    pullback(_map, t, X)
+end

From 44368fb8736b3874471dd73bfd0a5c61fc6aa63b Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Thu, 14 May 2020 18:17:07 +0200
Subject: [PATCH 09/34] Fixing ambiguity for Identity transform

---
 src/transform/transform.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transform/transform.jl b/src/transform/transform.jl
index 2deec6060..7e1ece67b 100644
--- a/src/transform/transform.jl
+++ b/src/transform/transform.jl
@@ -17,6 +17,7 @@ struct IdentityTransform <: Transform end
 
 (t::IdentityTransform)(x) = x
 Base.map(::IdentityTransform, x::AbstractVector) = x
+_map(::IdentityTransform, x::AbstractVector) = x
 
 ### TODO Maybe defining adjoints could help but so far it's not working
 

From b3142f6e6f869bcc389d08ef321e26b03ecf952e Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Fri, 15 May 2020 11:32:45 +0200
Subject: [PATCH 10/34] Adding test dependencies for AD

---
 Project.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 27662e132..a43efa1ff 100644
--- a/Project.toml
+++ b/Project.toml
@@ -26,11 +26,13 @@ julia = "1.3"
 [extras]
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
 PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Random", "Test", "FiniteDifferences", "Zygote", "PDMats", "Kronecker", "Flux"]
+test = ["Random", "Test", "FiniteDifferences", "Zygote", "ReverseDiff", "ForwardDiff", "PDMats", "Kronecker", "Flux"]

From 44ad0cdaa61fe43621e2b9de8c93ce68c2143619 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Fri, 15 May 2020 11:33:00 +0200
Subject: [PATCH 11/34] Put everything under _map to avoid ambiguities

---
 src/transform/ardtransform.jl      | 2 +-
 src/transform/chaintransform.jl    | 2 +-
 src/transform/functiontransform.jl | 2 +-
 src/transform/lineartransform.jl   | 2 +-
 src/transform/scaletransform.jl    | 2 +-
 src/transform/transform.jl         | 3 +--
 6 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/transform/ardtransform.jl b/src/transform/ardtransform.jl
index 85950f557..d5231c1bf 100644
--- a/src/transform/ardtransform.jl
+++ b/src/transform/ardtransform.jl
@@ -24,7 +24,7 @@ dim(t::ARDTransform) = length(t.v)
 (t::ARDTransform)(x::Real) = first(t.v) * x
 (t::ARDTransform)(x) = t.v .* x
 
-Base.map(t::ARDTransform, x::AbstractVector{<:Real}) = t.v' .* x
+_map(t::ARDTransform, x::AbstractVector{<:Real}) = t.v' .* x
 _map(t::ARDTransform, x::ColVecs) = ColVecs(t.v .* x.X)
 _map(t::ARDTransform, x::RowVecs) = RowVecs(t.v' .* x.X)
 
diff --git a/src/transform/chaintransform.jl b/src/transform/chaintransform.jl
index d8d3bc1f5..b1ed93ffb 100644
--- a/src/transform/chaintransform.jl
+++ b/src/transform/chaintransform.jl
@@ -27,7 +27,7 @@ Base.:∘(tc::ChainTransform, t::Transform) = ChainTransform(vcat(t, tc.transfor
 
 (t::ChainTransform)(x) = foldl((x, t) -> t(x), t.transforms; init=x)
 
-function Base.map(t::ChainTransform, x::AbstractVector)
+function _map(t::ChainTransform, x::AbstractVector)
     return foldl((x, t) -> map(t, x), t.transforms; init=x)
 end
 
diff --git a/src/transform/functiontransform.jl b/src/transform/functiontransform.jl
index 09085740d..c1d09b418 100644
--- a/src/transform/functiontransform.jl
+++ b/src/transform/functiontransform.jl
@@ -15,7 +15,7 @@ end
 
 (t::FunctionTransform)(x) = t.f(x)
 
-Base.map(t::FunctionTransform, x::AbstractVector{<:Real}) = map(t.f, x)
+_map(t::FunctionTransform, x::AbstractVector{<:Real}) = map(t.f, x)
 _map(t::FunctionTransform, x::ColVecs) = ColVecs(mapslices(t.f, x.X; dims=1))
 _map(t::FunctionTransform, x::RowVecs) = RowVecs(mapslices(t.f, x.X; dims=2))
 
diff --git a/src/transform/lineartransform.jl b/src/transform/lineartransform.jl
index a86e6cabe..dcbd55873 100644
--- a/src/transform/lineartransform.jl
+++ b/src/transform/lineartransform.jl
@@ -27,7 +27,7 @@ end
 (t::LinearTransform)(x::Real) = vec(t.A * x)
 (t::LinearTransform)(x::AbstractVector{<:Real}) = t.A * x
 
-Base.map(t::LinearTransform, x::AbstractVector{<:Real}) = ColVecs(t.A * x')
+_map(t::LinearTransform, x::AbstractVector{<:Real}) = ColVecs(t.A * x')
 _map(t::LinearTransform, x::ColVecs) = ColVecs(t.A * x.X)
 _map(t::LinearTransform, x::RowVecs) = RowVecs(x.X * t.A')
 
diff --git a/src/transform/scaletransform.jl b/src/transform/scaletransform.jl
index dbe1794b2..37aa1fef9 100644
--- a/src/transform/scaletransform.jl
+++ b/src/transform/scaletransform.jl
@@ -19,7 +19,7 @@ set!(t::ScaleTransform,ρ::Real) = t.s .= [ρ]
 
 (t::ScaleTransform)(x) = first(t.s) .* x
 
-Base.map(t::ScaleTransform, x::AbstractVector{<:Real}) = first(t.s) .* x
+_map(t::ScaleTransform, x::AbstractVector{<:Real}) = first(t.s) .* x
 _map(t::ScaleTransform, x::ColVecs) = ColVecs(first(t.s) .* x.X)
 _map(t::ScaleTransform, x::RowVecs) = RowVecs(first(t.s) .* x.X)
 
diff --git a/src/transform/transform.jl b/src/transform/transform.jl
index 7e1ece67b..b6ab0f397 100644
--- a/src/transform/transform.jl
+++ b/src/transform/transform.jl
@@ -6,7 +6,7 @@ include("selecttransform.jl")
 include("chaintransform.jl")
 
 
-Base.map(t::Transform, x::Union{ColVecs, RowVecs}) = _map(t, x)
+Base.map(t::Transform, x::AbstractVector) = _map(t, x)
 
 """
     IdentityTransform()
@@ -16,7 +16,6 @@ Return exactly the input
 struct IdentityTransform <: Transform end
 
 (t::IdentityTransform)(x) = x
-Base.map(::IdentityTransform, x::AbstractVector) = x
 _map(::IdentityTransform, x::AbstractVector) = x
 
 ### TODO Maybe defining adjoints could help but so far it's not working

From 07631b6b7864991025855a7a52d151c8dd2969d3 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Fri, 15 May 2020 17:31:05 +0200
Subject: [PATCH 12/34] Tests passing for constant kernels/modified Zygote to
 return zeros instead of nothing

---
 src/distances/delta.jl       |   6 +-
 src/zygote_adjoints.jl       |  27 +++++++++
 test/basekernels/constant.jl |   3 +
 test/runtests.jl             | 114 +++++++++++++++++------------------
 test/utils_AD.jl             |  27 ++++++---
 5 files changed, 108 insertions(+), 69 deletions(-)

diff --git a/src/distances/delta.jl b/src/distances/delta.jl
index b986ef73f..54da36ad5 100644
--- a/src/distances/delta.jl
+++ b/src/distances/delta.jl
@@ -1,12 +1,14 @@
 struct Delta <: Distances.PreMetric
 end
 
-@inline function Distances._evaluate(::Delta,a::AbstractVector{T},b::AbstractVector{T}) where {T}
+@inline function Distances._evaluate(::Delta, a::AbstractVector, b::AbstractVector) where {T}
     @boundscheck if length(a) != length(b)
         throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b))."))
     end
     return a == b
 end
 
+Distances.result_type(::Delta, Ta::Type, Tb::Type) = promote_type(Ta, Tb)
+
 @inline (dist::Delta)(a::AbstractArray, b::AbstractArray) = Distances._evaluate(dist, a, b)
-@inline (dist::Delta)(a::Number,b::Number) = a == b
+@inline (dist::Delta)(a::Number, b::Number) = a == b
diff --git a/src/zygote_adjoints.jl b/src/zygote_adjoints.jl
index f73b4bce1..38f131cb9 100644
--- a/src/zygote_adjoints.jl
+++ b/src/zygote_adjoints.jl
@@ -1,3 +1,29 @@
+## Adjoints Delta
+@adjoint function evaluate(s::Delta, x::AbstractVector, y::AbstractVector)
+  evaluate(s, x, y), Δ -> begin
+    (nothing, nothing, nothing)
+  end
+end
+
+@adjoint function pairwise(d::Delta, X::AbstractMatrix, Y::AbstractMatrix; dims=2)
+  D = pairwise(d, X, Y; dims = dims)
+  if dims == 1
+      return D, Δ -> (nothing, nothing, nothing)
+  else
+      return D, Δ -> (nothing, nothing, nothing)
+  end
+end
+
+@adjoint function pairwise(d::Delta, X::AbstractMatrix; dims=2)
+  D = pairwise(d, X; dims = dims)
+  if dims == 1
+      return D, Δ -> (nothing, nothing)
+  else
+      return D, Δ -> (nothing, nothing)
+  end
+end
+
+## Adjoints DotProduct
 @adjoint function evaluate(s::DotProduct, x::AbstractVector, y::AbstractVector)
   dot(x, y), Δ -> begin
     (nothing, Δ .* y, Δ .* x)
@@ -22,6 +48,7 @@ end
   end
 end
 
+## Adjoints Sinus
 @adjoint function evaluate(s::Sinus, x::AbstractVector, y::AbstractVector)
   d = (x - y)
   sind = sinpi.(d)
diff --git a/test/basekernels/constant.jl b/test/basekernels/constant.jl
index 9a824e287..f58be2d45 100644
--- a/test/basekernels/constant.jl
+++ b/test/basekernels/constant.jl
@@ -5,6 +5,7 @@
         @test kappa(k,2.0) == 0.0
         @test KernelFunctions.metric(ZeroKernel()) == KernelFunctions.Delta()
         @test repr(k) == "Zero Kernel"
+        test_AD("Zero", ZeroKernel)
     end
     @testset "WhiteKernel" begin
         k = WhiteKernel()
@@ -14,6 +15,7 @@
         @test EyeKernel == WhiteKernel
         @test metric(WhiteKernel()) == KernelFunctions.Delta()
         @test repr(k) == "White Kernel"
+        test_AD("WhiteKernel", WhiteKernel)
     end
     @testset "ConstantKernel" begin
         c = 2.0
@@ -24,5 +26,6 @@
         @test metric(ConstantKernel()) == KernelFunctions.Delta()
         @test metric(ConstantKernel(c=2.0)) == KernelFunctions.Delta()
         @test repr(k) == "Constant Kernel (c = $(c))"
+        test_AD("ConstantKernel", c->ConstantKernel(c=first(c)), [c])
     end
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 0ff326256..f55fdbfaf 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,14 +1,12 @@
 using KernelFunctions
 using Distances
-using FiniteDifferences
-using Flux
 using Kronecker
 using LinearAlgebra
 using PDMats
 using Random
 using SpecialFunctions
 using Test
-using Zygote
+import Zygote, ForwardDiff, ReverseDiff, FiniteDifferences
 
 using KernelFunctions: metric, kappa
 
@@ -45,66 +43,66 @@ using KernelFunctions: metric, kappa
 @testset "KernelFunctions" begin
 
     include("utils.jl")
-
-    @testset "distances" begin
-        include(joinpath("distances", "dotproduct.jl"))
-        include(joinpath("distances", "delta.jl"))
-        include(joinpath("distances", "sinus.jl"))
-    end
-
-    @testset "transform" begin
-        include(joinpath("transform", "transform.jl"))
-        include(joinpath("transform", "scaletransform.jl"))
-        include(joinpath("transform", "ardtransform.jl"))
-        include(joinpath("transform", "lineartransform.jl"))
-        include(joinpath("transform", "functiontransform.jl"))
-        include(joinpath("transform", "selecttransform.jl"))
-        include(joinpath("transform", "chaintransform.jl"))
-    end
+    include("utils_AD.jl")
+    # @testset "distances" begin
+    #     include(joinpath("distances", "dotproduct.jl"))
+    #     include(joinpath("distances", "delta.jl"))
+    #     include(joinpath("distances", "sinus.jl"))
+    # end
+    #
+    # @testset "transform" begin
+    #     include(joinpath("transform", "transform.jl"))
+    #     include(joinpath("transform", "scaletransform.jl"))
+    #     include(joinpath("transform", "ardtransform.jl"))
+    #     include(joinpath("transform", "lineartransform.jl"))
+    #     include(joinpath("transform", "functiontransform.jl"))
+    #     include(joinpath("transform", "selecttransform.jl"))
+    #     include(joinpath("transform", "chaintransform.jl"))
+    # end
 
     @testset "basekernels" begin
         include(joinpath("basekernels", "constant.jl"))
-        include(joinpath("basekernels", "cosine.jl"))
-        include(joinpath("basekernels", "exponential.jl"))
-        include(joinpath("basekernels", "exponentiated.jl"))
-        include(joinpath("basekernels", "fbm.jl"))
-        include(joinpath("basekernels", "gabor.jl"))
-        include(joinpath("basekernels", "maha.jl"))
-        include(joinpath("basekernels", "matern.jl"))
-        include(joinpath("basekernels", "nn.jl"))
-        include(joinpath("basekernels", "periodic.jl"))
-        include(joinpath("basekernels", "polynomial.jl"))
-        include(joinpath("basekernels", "piecewisepolynomial.jl"))
-        include(joinpath("basekernels", "rationalquad.jl"))
-        include(joinpath("basekernels", "sm.jl"))
-        include(joinpath("basekernels", "wiener.jl"))
-    end
-
-    @testset "kernels" begin
-        include(joinpath("kernels", "kernelproduct.jl"))
-        include(joinpath("kernels", "kernelsum.jl"))
-        include(joinpath("kernels", "scaledkernel.jl"))
-        include(joinpath("kernels", "tensorproduct.jl"))
-        include(joinpath("kernels", "transformedkernel.jl"))
-
-        # Legacy tests that don't correspond to anything meaningful in src. Unclear how
-        # helpful these are.
-        include(joinpath("kernels", "custom.jl"))
-    end
-
-    @testset "matrix" begin
-        include(joinpath("matrix", "kernelmatrix.jl"))
-        include(joinpath("matrix", "kernelkroneckermat.jl"))
-        include(joinpath("matrix", "kernelpdmat.jl"))
-    end
-
-    @testset "approximations" begin
-        include(joinpath("approximations", "nystrom.jl"))
+        # include(joinpath("basekernels", "cosine.jl"))
+        # include(joinpath("basekernels", "exponential.jl"))
+        # include(joinpath("basekernels", "exponentiated.jl"))
+        # include(joinpath("basekernels", "fbm.jl"))
+        # include(joinpath("basekernels", "gabor.jl"))
+        # include(joinpath("basekernels", "maha.jl"))
+        # include(joinpath("basekernels", "matern.jl"))
+        # include(joinpath("basekernels", "nn.jl"))
+        # include(joinpath("basekernels", "periodic.jl"))
+        # include(joinpath("basekernels", "polynomial.jl"))
+        # include(joinpath("basekernels", "piecewisepolynomial.jl"))
+        # include(joinpath("basekernels", "rationalquad.jl"))
+        # include(joinpath("basekernels", "sm.jl"))
+        # include(joinpath("basekernels", "wiener.jl"))
     end
 
-    include("generic.jl")
-    include("zygote_adjoints.jl")
-    include("trainable.jl")
+    # @testset "kernels" begin
+    #     include(joinpath("kernels", "kernelproduct.jl"))
+    #     include(joinpath("kernels", "kernelsum.jl"))
+    #     include(joinpath("kernels", "scaledkernel.jl"))
+    #     include(joinpath("kernels", "tensorproduct.jl"))
+    #     include(joinpath("kernels", "transformedkernel.jl"))
+    #
+    #     # Legacy tests that don't correspond to anything meaningful in src. Unclear how
+    #     # helpful these are.
+    #     include(joinpath("kernels", "custom.jl"))
+    # end
+    #
+    # @testset "matrix" begin
+    #     include(joinpath("matrix", "kernelmatrix.jl"))
+    #     include(joinpath("matrix", "kernelkroneckermat.jl"))
+    #     include(joinpath("matrix", "kernelpdmat.jl"))
+    # end
+    #
+    # @testset "approximations" begin
+    #     include(joinpath("approximations", "nystrom.jl"))
+    # end
+    #
+    # include("generic.jl")
+    # include("zygote_adjoints.jl")
+    # include("trainable.jl")
 end
 
 # These are legacy tests that I'm not getting rid of, as they appear to be useful, but
diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index fa5c42ace..3dc2ae908 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -1,11 +1,13 @@
+
 FDM = FiniteDifferences.central_fdm(5, 1)
 
 function gradient(::Val{:Zygote}, f::Function, args)
-    first(Zygote.gradient(f, args))
-end
-
-function gradient(::Val{:Zygote}, f::Function, args::Zygote.Params)
-    Zygote.gradient(f, args)
+    g = first(Zygote.gradient(f, args))
+    if isnothing(g)
+        return zeros(size(args)) # To respect the same output as other ADs
+    else
+        return g
+    end
 end
 
 function gradient(::Val{:ForwardDiff}, f::Function, args)
@@ -24,14 +26,22 @@ end
 testfunction(k, A, B, dim) = sum(kernelmatrix(k, A, B, obsdim = dim))
 testfunction(k, A, dim) = sum(kernelmatrix(k, A, obsdim = dim))
 
-function test_FiniteDiff(kernelname, kernelfunction, args = nothing)
+function test_AD(kernelname::String, kernelfunction, args = nothing; ADs = [:Zygote, :ForwardDiff, :ReverseDiff], dims = [3, 3])
+    test_fd = test_FiniteDiff(kernelname, kernelfunction, args, dims)
+    if !test_fd.anynonpass
+        for AD in ADs
+            test_AD(AD, kernelname, kernelfunction, args, dims)
+        end
+    end
+end
+
+function test_FiniteDiff(kernelname, kernelfunction, args = nothing, dims = [3, 3])
     # Init arguments :
     k = if args === nothing
         kernelfunction()
     else
         kernelfunction(args)
     end
-    dims = [3, 3]
     rng = MersenneTwister(42)
     @testset "FiniteDifferences with $(kernelname)" begin
         if k isa SimpleKernel
@@ -60,10 +70,9 @@ function test_FiniteDiff(kernelname, kernelfunction, args = nothing)
     end
 end
 
-function test_AD(AD, kernelname, kernelfunction, args = nothing)
+function test_AD(AD, kernelname, kernelfunction, args = nothing, dims = [3, 3])
     @testset "Testing $(kernelname) with AD : $(AD)" begin
         # Test kappa function
-        dims = [3, 3]
         k = if args === nothing
             kernelfunction()
         else

From 960bad2df286240c17266fb85f26f9386001d50c Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sat, 16 May 2020 19:04:44 +0200
Subject: [PATCH 13/34] Spread tests for all base kernels

---
 test/basekernels/constant.jl            | 6 +++---
 test/basekernels/cosine.jl              | 1 +
 test/basekernels/exponential.jl         | 5 ++++-
 test/basekernels/exponentiated.jl       | 1 +
 test/basekernels/fbm.jl                 | 2 ++
 test/basekernels/gabor.jl               | 2 ++
 test/basekernels/maha.jl                | 2 ++
 test/basekernels/matern.jl              | 4 ++++
 test/basekernels/nn.jl                  | 3 ++-
 test/basekernels/periodic.jl            | 2 ++
 test/basekernels/piecewisepolynomial.jl | 6 ++++--
 test/basekernels/polynomial.jl          | 3 +++
 test/basekernels/rationalquad.jl        | 7 +++++--
 test/basekernels/sm.jl                  | 2 ++
 test/basekernels/wiener.jl              | 3 +++
 test/kernels/kernelproduct.jl           | 2 ++
 test/kernels/kernelsum.jl               | 2 ++
 test/kernels/scaledkernel.jl            | 1 +
 test/kernels/tensorproduct.jl           | 1 +
 test/kernels/transformedkernel.jl       | 1 +
 20 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/test/basekernels/constant.jl b/test/basekernels/constant.jl
index f58be2d45..308fb84b6 100644
--- a/test/basekernels/constant.jl
+++ b/test/basekernels/constant.jl
@@ -5,7 +5,7 @@
         @test kappa(k,2.0) == 0.0
         @test KernelFunctions.metric(ZeroKernel()) == KernelFunctions.Delta()
         @test repr(k) == "Zero Kernel"
-        test_AD("Zero", ZeroKernel)
+        test_ADs(ZeroKernel)
     end
     @testset "WhiteKernel" begin
         k = WhiteKernel()
@@ -15,7 +15,7 @@
         @test EyeKernel == WhiteKernel
         @test metric(WhiteKernel()) == KernelFunctions.Delta()
         @test repr(k) == "White Kernel"
-        test_AD("WhiteKernel", WhiteKernel)
+        test_ADs(WhiteKernel)
     end
     @testset "ConstantKernel" begin
         c = 2.0
@@ -26,6 +26,6 @@
         @test metric(ConstantKernel()) == KernelFunctions.Delta()
         @test metric(ConstantKernel(c=2.0)) == KernelFunctions.Delta()
         @test repr(k) == "Constant Kernel (c = $(c))"
-        test_AD("ConstantKernel", c->ConstantKernel(c=first(c)), [c])
+        test_ADs(c->ConstantKernel(c=first(c)), [c])
     end
 end
diff --git a/test/basekernels/cosine.jl b/test/basekernels/cosine.jl
index 5874c6ba7..bf4c060b4 100644
--- a/test/basekernels/cosine.jl
+++ b/test/basekernels/cosine.jl
@@ -12,4 +12,5 @@
     @test kappa(k,x) ≈ cospi(x) atol=1e-5
     @test k(v1, v2) ≈ cospi(sqrt(sum(abs2.(v1-v2)))) atol=1e-5
     @test repr(k) == "Cosine Kernel"
+    test_ADs(CosineKernel)
 end
diff --git a/test/basekernels/exponential.jl b/test/basekernels/exponential.jl
index d87289711..e890a3a15 100644
--- a/test/basekernels/exponential.jl
+++ b/test/basekernels/exponential.jl
@@ -14,6 +14,7 @@
         @test SEKernel == SqExponentialKernel
         @test repr(k) == "Squared Exponential Kernel"
         @test KernelFunctions.iskroncompatible(k) == true
+        test_ADs(SEKernel)
     end
     @testset "ExponentialKernel" begin
         k = ExponentialKernel()
@@ -24,6 +25,7 @@
         @test repr(k) == "Exponential Kernel"
         @test LaplacianKernel == ExponentialKernel
         @test KernelFunctions.iskroncompatible(k) == true
+        test_ADs(ExponentialKernel)
     end
     @testset "GammaExponentialKernel" begin
         γ = 2.0
@@ -36,7 +38,8 @@
         @test metric(GammaExponentialKernel(γ=2.0)) == SqEuclidean()
         @test repr(k) == "Gamma Exponential Kernel (γ = $(γ))"
         @test KernelFunctions.iskroncompatible(k) == true
-
+        test_ADs(γ -> GammaExponentialKernel(gamma=first(γ)), [γ], ADs = [:ForwardDiff, :ReverseDiff])
+        @test_broken "Zygote gradient given γ"
         #Coherence :
         @test GammaExponentialKernel(γ=1.0)(v1,v2) ≈ SqExponentialKernel()(v1,v2)
         @test GammaExponentialKernel(γ=0.5)(v1,v2) ≈ ExponentialKernel()(v1,v2)
diff --git a/test/basekernels/exponentiated.jl b/test/basekernels/exponentiated.jl
index 17b625a94..a8c117b3b 100644
--- a/test/basekernels/exponentiated.jl
+++ b/test/basekernels/exponentiated.jl
@@ -10,4 +10,5 @@
     @test k(v1,v2) ≈ exp(dot(v1,v2))
     @test metric(ExponentiatedKernel()) == KernelFunctions.DotProduct()
     @test repr(k) == "Exponentiated Kernel"
+    test_ADs(ExponentiatedKernel)
 end
diff --git a/test/basekernels/fbm.jl b/test/basekernels/fbm.jl
index 645fdc088..53bbd99f2 100644
--- a/test/basekernels/fbm.jl
+++ b/test/basekernels/fbm.jl
@@ -21,4 +21,6 @@
     @test kernelmatrix(k, x1*ones(1,1), x2*ones(1,1))[1] ≈ k(x1, x2) atol=1e-5
 
     @test repr(k) == "Fractional Brownian Motion Kernel (h = $(h))"
+    test_ADs(FBMKernel, ADs = [:ReverseDiff])
+    @test_broken "Tests failing for kernelmatrix(k, x) for ForwardDiff and Zygote"
 end
diff --git a/test/basekernels/gabor.jl b/test/basekernels/gabor.jl
index b9d47560c..26f610cae 100644
--- a/test/basekernels/gabor.jl
+++ b/test/basekernels/gabor.jl
@@ -17,4 +17,6 @@
     @test k.ell ≈ 1.0 atol=1e-5
     @test k.p ≈ 1.0 atol=1e-5
     @test repr(k) == "Gabor Kernel (ell = 1.0, p = 1.0)"
+    test_ADs(x -> GaborKernel(ell = x[1], p = x[2]), [ell, p], ADs = [:ForwardDiff, :ReverseDiff])
+    @test_broken "Tests failing for Zygote on differentiating through ell and p"
 end
diff --git a/test/basekernels/maha.jl b/test/basekernels/maha.jl
index 748b733fc..e5ecba3d0 100644
--- a/test/basekernels/maha.jl
+++ b/test/basekernels/maha.jl
@@ -11,4 +11,6 @@
     @test k(v1, v2) ≈ exp(-sqmahalanobis(v1, v2, P))
     @test kappa(ExponentialKernel(), x) == kappa(k, x)
     @test repr(k) == "Mahalanobis Kernel (size(P) = $(size(P)))"
+    # test_ADs(P -> MahalanobisKernel(P), P)
+    @test_broken "Nothing passes (problem with Mahalanobis distance in Distances)"
 end
diff --git a/test/basekernels/matern.jl b/test/basekernels/matern.jl
index af58dc470..b519686f4 100644
--- a/test/basekernels/matern.jl
+++ b/test/basekernels/matern.jl
@@ -14,6 +14,8 @@
         @test metric(MaternKernel()) == Euclidean()
         @test metric(MaternKernel(ν=2.0)) == Euclidean()
         @test repr(k) == "Matern Kernel (ν = $(ν))"
+        test_ADs(x->MaternKernel(nu=first(x)),[ν])
+        @test_broken "All fails (because of logabsgamma for ForwardDiff and ReverseDiff and because of nu for Zygote)"
     end
     @testset "Matern32Kernel" begin
         k = Matern32Kernel()
@@ -22,6 +24,7 @@
         @test kappa(Matern32Kernel(),x) == kappa(k,x)
         @test metric(Matern32Kernel()) == Euclidean()
         @test repr(k) == "Matern 3/2 Kernel"
+        test_ADs(Matern32Kernel)
     end
     @testset "Matern52Kernel" begin
         k = Matern52Kernel()
@@ -30,6 +33,7 @@
         @test kappa(Matern52Kernel(),x) == kappa(k,x)
         @test metric(Matern52Kernel()) == Euclidean()
         @test repr(k) == "Matern 5/2 Kernel"
+        test_ADs(Matern52Kernel)
     end
     @testset "Coherence Materns" begin
         @test kappa(MaternKernel(ν=0.5),x) ≈ kappa(ExponentialKernel(),x)
diff --git a/test/basekernels/nn.jl b/test/basekernels/nn.jl
index 4617bd47d..6d6bb272c 100644
--- a/test/basekernels/nn.jl
+++ b/test/basekernels/nn.jl
@@ -43,5 +43,6 @@
     @test_throws DimensionMismatch kernelmatrix!(A5, k, ones(4,3), ones(3,4))
 
     @test k([x1], [x2]) ≈ k(x1, x2) atol=1e-5
-
+    test_ADs(NeuralNetworkKernel, ADs = [:ForwardDiff, :ReverseDiff])
+    @test_broken "Zygote uncompatible with BaseKernel"
 end
diff --git a/test/basekernels/periodic.jl b/test/basekernels/periodic.jl
index c7056f75d..0fd6e6876 100644
--- a/test/basekernels/periodic.jl
+++ b/test/basekernels/periodic.jl
@@ -7,4 +7,6 @@
     @test k(v1, v2) == k(v2, v1)
     @test PeriodicKernel(3)(v1, v2) == PeriodicKernel(r = ones(3))(v1, v2)
     @test repr(k) == "Periodic Kernel, length(r) = $(length(r)))"
+    test_ADs(r->PeriodicKernel(r =r), r, ADs = [:ForwardDiff, :ReverseDiff])
+    @test_broken "Undefined adjoint for Sinus metric"
 end
diff --git a/test/basekernels/piecewisepolynomial.jl b/test/basekernels/piecewisepolynomial.jl
index 329d983ee..c1d0f633f 100644
--- a/test/basekernels/piecewisepolynomial.jl
+++ b/test/basekernels/piecewisepolynomial.jl
@@ -29,7 +29,9 @@
     kerneldiagmatrix!(A3, k, m1)
     @test A3 == kerneldiagmatrix(k, m1)
 
-    @test repr(k) == "Piecewise Polynomial Kernel (v = $(v), size(maha) = $(size(maha)))"
-
     @test_throws ErrorException PiecewisePolynomialKernel{4}(maha)
+
+    @test repr(k) == "Piecewise Polynomial Kernel (v = $(v), size(maha) = $(size(maha)))"
+    # test_ADs(maha-> PiecewisePolynomialKernel(v=2, maha = maha), maha)
+    @test_broken "Nothing passes (problem with Mahalanobis distance in Distances)"
 end
diff --git a/test/basekernels/polynomial.jl b/test/basekernels/polynomial.jl
index 900378f52..9d4319ce3 100644
--- a/test/basekernels/polynomial.jl
+++ b/test/basekernels/polynomial.jl
@@ -12,6 +12,7 @@
         @test metric(LinearKernel()) == KernelFunctions.DotProduct()
         @test metric(LinearKernel(c=2.0)) == KernelFunctions.DotProduct()
         @test repr(k) == "Linear Kernel (c = 0.0)"
+        test_ADs(x->LinearKernel(c=x[1]), [c])
     end
     @testset "PolynomialKernel" begin
         k = PolynomialKernel()
@@ -24,5 +25,7 @@
         @test metric(PolynomialKernel()) == KernelFunctions.DotProduct()
         @test metric(PolynomialKernel(d=3.0)) == KernelFunctions.DotProduct()
         @test metric(PolynomialKernel(d=3.0,c=2.0)) == KernelFunctions.DotProduct()
+        # test_ADs(x->PolynomialKernel(d=x[1], c=x[2]),[2.0,  c])
+        @test_broken "All, because of the power"
     end
 end
diff --git a/test/basekernels/rationalquad.jl b/test/basekernels/rationalquad.jl
index 4ec26cf13..47839f407 100644
--- a/test/basekernels/rationalquad.jl
+++ b/test/basekernels/rationalquad.jl
@@ -13,6 +13,7 @@
         @test metric(RationalQuadraticKernel()) == SqEuclidean()
         @test metric(RationalQuadraticKernel(α=2.0)) == SqEuclidean()
         @test repr(k) == "Rational Quadratic Kernel (α = $(α))"
+        test_ADs(x->RationalQuadraticKernel(alpha=x[1]),[α])
     end
     @testset "GammaRationalQuadraticKernel" begin
         k = GammaRationalQuadraticKernel()
@@ -23,9 +24,11 @@
         @test GammaRationalQuadraticKernel(alpha=a).α == [a]
         @test repr(k) == "Gamma Rational Quadratic Kernel (α = 2.0, γ = 2.0)"
         #Coherence test
-        @test kappa(GammaRationalQuadraticKernel(α=a,γ=1.0),x) ≈ kappa(RationalQuadraticKernel(α=a),x)
+        @test kappa(GammaRationalQuadraticKernel(α=a, γ=1.0), x) ≈ kappa(RationalQuadraticKernel(α=a), x)
         @test metric(GammaRationalQuadraticKernel()) == SqEuclidean()
         @test metric(GammaRationalQuadraticKernel(γ=2.0)) == SqEuclidean()
-        @test metric(GammaRationalQuadraticKernel(γ=2.0,α=3.0)) == SqEuclidean()
+        @test metric(GammaRationalQuadraticKernel(γ=2.0, α=3.0)) == SqEuclidean()
+        # test_ADs(x->GammaRationalQuadraticKernel(α=x[1], γ=x[2]), [a, 2.0])
+        @test_broken "All (problem with power operation)"
     end
 end
diff --git a/test/basekernels/sm.jl b/test/basekernels/sm.jl
index a8e0a5768..daef2bd62 100644
--- a/test/basekernels/sm.jl
+++ b/test/basekernels/sm.jl
@@ -21,4 +21,6 @@
     @test_throws DimensionMismatch spectral_mixture_kernel(rand(5) ,rand(4,3), rand(4,3))
     @test_throws DimensionMismatch spectral_mixture_kernel(rand(3) ,rand(4,3), rand(5,3))
     @test_throws DimensionMismatch spectral_mixture_product_kernel(rand(5,3) ,rand(4,3), rand(5,3))
+    # test_ADs(x->spectral_mixture_kernel(exp.(x[1:3]), reshape(x[4:18], 5, 3), reshape(x[19:end], 5, 3)), vcat(log.(αs₁), γs[:], ωs[:]), dims = [5,5])
+    @test_broken "No tests passing (BaseKernel)"
 end
diff --git a/test/basekernels/wiener.jl b/test/basekernels/wiener.jl
index 3b628fc65..624837b8c 100644
--- a/test/basekernels/wiener.jl
+++ b/test/basekernels/wiener.jl
@@ -50,4 +50,7 @@
     @test kernelmatrix(k1, x1*ones(1,1), x2*ones(1,1))[1] ≈ k1(x1, x2) atol=1e-5
     @test kernelmatrix(k2, x1*ones(1,1), x2*ones(1,1))[1] ≈ k2(x1, x2) atol=1e-5
     @test kernelmatrix(k3, x1*ones(1,1), x2*ones(1,1))[1] ≈ k3(x1, x2) atol=1e-5
+
+    # test_ADs(()->WienerKernel(i=1))
+    @test_broken "No tests passing"
 end
diff --git a/test/kernels/kernelproduct.jl b/test/kernels/kernelproduct.jl
index 00d5676d0..d39e81943 100644
--- a/test/kernels/kernelproduct.jl
+++ b/test/kernels/kernelproduct.jl
@@ -47,4 +47,6 @@
             @test kerneldiagmatrix!(tmp_diag, k, x) ≈ kerneldiagmatrix(k, x)
         end
     end
+    test_ADs(x->SqExponentialKernel() * LinearKernel(c= x[1]), rand(1), ADs = [:ForwardDiff, :ReverseDiff])
+    @test_broken "Zygote issue"
 end
diff --git a/test/kernels/kernelsum.jl b/test/kernels/kernelsum.jl
index 310f43d00..0c864be8b 100644
--- a/test/kernels/kernelsum.jl
+++ b/test/kernels/kernelsum.jl
@@ -53,4 +53,6 @@
             @test kerneldiagmatrix!(tmp_diag, k, x) ≈ kerneldiagmatrix(k, x)
         end
     end
+    test_ADs(x->KernelSum([SqExponentialKernel(),LinearKernel(c= x[1])], x[2:3]), rand(3))#, ADs = [:ForwardDiff, :ReverseDiff])
+    @test_broken "Zygote failing because of mutating array"
 end
diff --git a/test/kernels/scaledkernel.jl b/test/kernels/scaledkernel.jl
index a5bf8998e..38e6593c3 100644
--- a/test/kernels/scaledkernel.jl
+++ b/test/kernels/scaledkernel.jl
@@ -40,4 +40,5 @@
             @test_broken kerneldiagmatrix!(tmp_diag, ks, x) ≈ kerneldiagmatrix(ks, x)
         end
     end
+    test_ADs(x->x[1] * SqExponentialKernel(), rand(1))
 end
diff --git a/test/kernels/tensorproduct.jl b/test/kernels/tensorproduct.jl
index 8ce9d5f72..1b016a68b 100644
--- a/test/kernels/tensorproduct.jl
+++ b/test/kernels/tensorproduct.jl
@@ -110,4 +110,5 @@
             end
         end
     end
+    test_ADs(()->TensorProduct(SqExponentialKernel(), LinearKernel()), dims = [2, 2]) # ADs = [:ForwardDiff, :ReverseDiff])
 end
diff --git a/test/kernels/transformedkernel.jl b/test/kernels/transformedkernel.jl
index cabbe0008..cf49dde2d 100644
--- a/test/kernels/transformedkernel.jl
+++ b/test/kernels/transformedkernel.jl
@@ -47,4 +47,5 @@
             @test kerneldiagmatrix!(tmp_diag, kt, x) ≈ kerneldiagmatrix(kt, x)
         end
     end
+    test_ADs(x->transform(SqExponentialKernel(), x[1]), rand(1))# ADs = [:ForwardDiff, :ReverseDiff])
 end

From 3e620ae733291df2d37a945e06d8844d878ce6c5 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sat, 16 May 2020 19:05:01 +0200
Subject: [PATCH 14/34] Spread tests for all transforms

---
 test/transform/ardtransform.jl      | 1 +
 test/transform/chaintransform.jl    | 5 +----
 test/transform/functiontransform.jl | 3 +++
 test/transform/lineartransform.jl   | 1 +
 test/transform/scaletransform.jl    | 1 +
 test/transform/selecttransform.jl   | 1 +
 test/transform/transform.jl         | 1 +
 7 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/test/transform/ardtransform.jl b/test/transform/ardtransform.jl
index 4bd10a6dc..e05f50968 100644
--- a/test/transform/ardtransform.jl
+++ b/test/transform/ardtransform.jl
@@ -41,4 +41,5 @@
     @test_throws DimensionMismatch map(t, ColVecs(randn(rng, D + 1, 3)))
 
     @test repr(t) == "ARD Transform (dims: $D)"
+    test_ADs(x->transform(SEKernel(), exp.(x)), randn(rng, 3))
 end
diff --git a/test/transform/chaintransform.jl b/test/transform/chaintransform.jl
index a13883e81..55dd13b74 100644
--- a/test/transform/chaintransform.jl
+++ b/test/transform/chaintransform.jl
@@ -22,8 +22,5 @@
 
     # Verify printing works as expected.
     @test repr(tp ∘ tf) == "Chain of 2 transforms:\n\t - $(tf) |> $(tp)"
+    test_ADs(x->transform(SEKernel(), ScaleTransform(exp(x[1])) ∘ ARDTransform(exp.(x[2:4]))), randn(rng, 4))
 end
-
-
-Base.:∘(t::Transform, tc::ChainTransform) = ChainTransform(vcat(tc.transforms, t))
-Base.:∘(tc::ChainTransform, t::Transform) = ChainTransform(vcat(t, tc.transforms))
diff --git a/test/transform/functiontransform.jl b/test/transform/functiontransform.jl
index 17ddbdb4f..f8441c38c 100644
--- a/test/transform/functiontransform.jl
+++ b/test/transform/functiontransform.jl
@@ -26,4 +26,7 @@
     end
 
     @test repr(FunctionTransform(sin)) == "Function Transform: $(sin)"
+    f(a, x) = sin.(a .* x)
+    test_ADs(x->transform(SEKernel(), FunctionTransform(y->f(x, y))), randn(rng, 3), ADs = [:ForwardDiff, :ReverseDiff])
+    @test_broken "Zygote is failing"
 end
diff --git a/test/transform/lineartransform.jl b/test/transform/lineartransform.jl
index ff65e20b4..46342bc73 100644
--- a/test/transform/lineartransform.jl
+++ b/test/transform/lineartransform.jl
@@ -41,4 +41,5 @@
     @test_throws DimensionMismatch map(t, ColVecs(randn(rng, Din + 1, Dout)))
 
     @test repr(t) == "Linear transform (size(A) = ($Dout, $Din))"
+    test_ADs(x->transform(SEKernel(), LinearTransform(x)), randn(rng, 3, 3))
 end
diff --git a/test/transform/scaletransform.jl b/test/transform/scaletransform.jl
index d9aece310..c97d937f1 100644
--- a/test/transform/scaletransform.jl
+++ b/test/transform/scaletransform.jl
@@ -18,4 +18,5 @@
     @test t.s == [s2]
     @test isequal(ScaleTransform(s), ScaleTransform(s))
     @test repr(t) == "Scale Transform (s = $(s2))"
+    test_ADs(x->transform(SEKernel(), exp(x[1])), randn(rng, 1))
 end
diff --git a/test/transform/selecttransform.jl b/test/transform/selecttransform.jl
index 1781356b1..a34a9ab3d 100644
--- a/test/transform/selecttransform.jl
+++ b/test/transform/selecttransform.jl
@@ -18,4 +18,5 @@
     @test t.select == select2
 
     @test repr(t) == "Select Transform (dims: $(select2))"
+    test_ADs(()->transform(SEKernel(), SelectTransform([1,2])))
 end
diff --git a/test/transform/transform.jl b/test/transform/transform.jl
index 0b79dcad5..6ce7c46bf 100644
--- a/test/transform/transform.jl
+++ b/test/transform/transform.jl
@@ -7,4 +7,5 @@
         @test IdentityTransform()(x) == x
         @test map(IdentityTransform(), x) == x
     end
+    test_ADs(()->transform(SEKernel(), IdentityTransform()))
 end

From 24cb00d3a6f9185b0391a508ab1e092a61e1981b Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sat, 16 May 2020 19:05:16 +0200
Subject: [PATCH 15/34] Removed need to give a name

---
 test/utils_AD.jl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 3dc2ae908..9d1c8ca69 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -26,16 +26,16 @@ end
 testfunction(k, A, B, dim) = sum(kernelmatrix(k, A, B, obsdim = dim))
 testfunction(k, A, dim) = sum(kernelmatrix(k, A, obsdim = dim))
 
-function test_AD(kernelname::String, kernelfunction, args = nothing; ADs = [:Zygote, :ForwardDiff, :ReverseDiff], dims = [3, 3])
-    test_fd = test_FiniteDiff(kernelname, kernelfunction, args, dims)
+function test_ADs(kernelfunction, args = nothing; ADs = [:Zygote, :ForwardDiff, :ReverseDiff], dims = [3, 3])
+    test_fd = test_FiniteDiff(kernelfunction, args, dims)
     if !test_fd.anynonpass
         for AD in ADs
-            test_AD(AD, kernelname, kernelfunction, args, dims)
+            test_AD(AD, kernelfunction, args, dims)
         end
     end
 end
 
-function test_FiniteDiff(kernelname, kernelfunction, args = nothing, dims = [3, 3])
+function test_FiniteDiff(kernelfunction, args = nothing, dims = [3, 3])
     # Init arguments :
     k = if args === nothing
         kernelfunction()
@@ -43,7 +43,7 @@ function test_FiniteDiff(kernelname, kernelfunction, args = nothing, dims = [3,
         kernelfunction(args)
     end
     rng = MersenneTwister(42)
-    @testset "FiniteDifferences with $(kernelname)" begin
+    @testset "FiniteDifferences" begin
         if k isa SimpleKernel
             for d in log.([eps(), rand(rng)])
                 @test_nowarn gradient(Val(:FiniteDiff), x -> kappa(k, exp(first(x))), [d])
@@ -70,8 +70,8 @@ function test_FiniteDiff(kernelname, kernelfunction, args = nothing, dims = [3,
     end
 end
 
-function test_AD(AD, kernelname, kernelfunction, args = nothing, dims = [3, 3])
-    @testset "Testing $(kernelname) with AD : $(AD)" begin
+function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
+    @testset "$(AD)" begin
         # Test kappa function
         k = if args === nothing
             kernelfunction()

From 5b2e580c26e267552a9fe2fb5fb50faf290d3d86 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sat, 16 May 2020 19:05:32 +0200
Subject: [PATCH 16/34] Adding needed export

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index f55fdbfaf..940d81bf1 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -8,7 +8,7 @@ using SpecialFunctions
 using Test
 import Zygote, ForwardDiff, ReverseDiff, FiniteDifferences
 
-using KernelFunctions: metric, kappa
+using KernelFunctions: metric, kappa, ColVecs, RowVecs
 
 # Writing tests:
 # 1. The file structure of the test should match precisely the file structure of src.

From 0bba1a5992b6ae62d379f402f3f6cab85f511e21 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sat, 16 May 2020 19:05:51 +0200
Subject: [PATCH 17/34] Removed test_AD

---
 test/test_AD.jl | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 test/test_AD.jl

diff --git a/test/test_AD.jl b/test/test_AD.jl
deleted file mode 100644
index 356578892..000000000
--- a/test/test_AD.jl
+++ /dev/null
@@ -1,19 +0,0 @@
-using KernelFunctions
-using KernelFunctions: kappa, ColVecs, RowVecs
-import Zygote, ForwardDiff, ReverseDiff, FiniteDifferences
-using Test, LinearAlgebra, Random
-
-include("utils_AD.jl")
-ADs = [:Zygote, :ForwardDiff, :ReverseDiff]
-
-kname = "SEKernel_lengthscale"
-kfunction = () -> SEKernel()
-kfunction = (l -> transform(SEKernel(), first(l)))
-# args = nothing
-args = [2.0]
-v = test_FiniteDiff(kname, kfunction, args)
-if !v.anynonpass
-    for AD in ADs
-        test_AD(AD, kname, kfunction, args)
-    end
-end

From 7f522425fffee6a1263458aeef2236305f63c76d Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sat, 16 May 2020 19:25:11 +0200
Subject: [PATCH 18/34] Readded all tests

---
 test/runtests.jl | 114 ++++++++++++++++++++++-------------------------
 1 file changed, 54 insertions(+), 60 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 940d81bf1..262238bfc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -44,69 +44,63 @@ using KernelFunctions: metric, kappa, ColVecs, RowVecs
 
     include("utils.jl")
     include("utils_AD.jl")
-    # @testset "distances" begin
-    #     include(joinpath("distances", "dotproduct.jl"))
-    #     include(joinpath("distances", "delta.jl"))
-    #     include(joinpath("distances", "sinus.jl"))
-    # end
-    #
-    # @testset "transform" begin
-    #     include(joinpath("transform", "transform.jl"))
-    #     include(joinpath("transform", "scaletransform.jl"))
-    #     include(joinpath("transform", "ardtransform.jl"))
-    #     include(joinpath("transform", "lineartransform.jl"))
-    #     include(joinpath("transform", "functiontransform.jl"))
-    #     include(joinpath("transform", "selecttransform.jl"))
-    #     include(joinpath("transform", "chaintransform.jl"))
-    # end
+    @testset "distances" begin
+        include(joinpath("distances", "dotproduct.jl"))
+        include(joinpath("distances", "delta.jl"))
+        include(joinpath("distances", "sinus.jl"))
+    end
+
+    @testset "transform" begin
+        include(joinpath("transform", "transform.jl"))
+        include(joinpath("transform", "scaletransform.jl"))
+        include(joinpath("transform", "ardtransform.jl"))
+        include(joinpath("transform", "lineartransform.jl"))
+        include(joinpath("transform", "functiontransform.jl"))
+        include(joinpath("transform", "selecttransform.jl"))
+        include(joinpath("transform", "chaintransform.jl"))
+    end
 
     @testset "basekernels" begin
         include(joinpath("basekernels", "constant.jl"))
-        # include(joinpath("basekernels", "cosine.jl"))
-        # include(joinpath("basekernels", "exponential.jl"))
-        # include(joinpath("basekernels", "exponentiated.jl"))
-        # include(joinpath("basekernels", "fbm.jl"))
-        # include(joinpath("basekernels", "gabor.jl"))
-        # include(joinpath("basekernels", "maha.jl"))
-        # include(joinpath("basekernels", "matern.jl"))
-        # include(joinpath("basekernels", "nn.jl"))
-        # include(joinpath("basekernels", "periodic.jl"))
-        # include(joinpath("basekernels", "polynomial.jl"))
-        # include(joinpath("basekernels", "piecewisepolynomial.jl"))
-        # include(joinpath("basekernels", "rationalquad.jl"))
-        # include(joinpath("basekernels", "sm.jl"))
-        # include(joinpath("basekernels", "wiener.jl"))
+        include(joinpath("basekernels", "cosine.jl"))
+        include(joinpath("basekernels", "exponential.jl"))
+        include(joinpath("basekernels", "exponentiated.jl"))
+        include(joinpath("basekernels", "fbm.jl"))
+        include(joinpath("basekernels", "gabor.jl"))
+        include(joinpath("basekernels", "maha.jl"))
+        include(joinpath("basekernels", "matern.jl"))
+        include(joinpath("basekernels", "nn.jl"))
+        include(joinpath("basekernels", "periodic.jl"))
+        include(joinpath("basekernels", "polynomial.jl"))
+        include(joinpath("basekernels", "piecewisepolynomial.jl"))
+        include(joinpath("basekernels", "rationalquad.jl"))
+        include(joinpath("basekernels", "sm.jl"))
+        include(joinpath("basekernels", "wiener.jl"))
     end
 
-    # @testset "kernels" begin
-    #     include(joinpath("kernels", "kernelproduct.jl"))
-    #     include(joinpath("kernels", "kernelsum.jl"))
-    #     include(joinpath("kernels", "scaledkernel.jl"))
-    #     include(joinpath("kernels", "tensorproduct.jl"))
-    #     include(joinpath("kernels", "transformedkernel.jl"))
-    #
-    #     # Legacy tests that don't correspond to anything meaningful in src. Unclear how
-    #     # helpful these are.
-    #     include(joinpath("kernels", "custom.jl"))
-    # end
-    #
-    # @testset "matrix" begin
-    #     include(joinpath("matrix", "kernelmatrix.jl"))
-    #     include(joinpath("matrix", "kernelkroneckermat.jl"))
-    #     include(joinpath("matrix", "kernelpdmat.jl"))
-    # end
-    #
-    # @testset "approximations" begin
-    #     include(joinpath("approximations", "nystrom.jl"))
-    # end
-    #
-    # include("generic.jl")
-    # include("zygote_adjoints.jl")
-    # include("trainable.jl")
-end
+    @testset "kernels" begin
+        include(joinpath("kernels", "kernelproduct.jl"))
+        include(joinpath("kernels", "kernelsum.jl"))
+        include(joinpath("kernels", "scaledkernel.jl"))
+        include(joinpath("kernels", "tensorproduct.jl"))
+        include(joinpath("kernels", "transformedkernel.jl"))
+
+        # Legacy tests that don't correspond to anything meaningful in src. Unclear how
+        # helpful these are.
+        include(joinpath("kernels", "custom.jl"))
+    end
 
-# These are legacy tests that I'm not getting rid of, as they appear to be useful, but
-# weren't enabled on master at the time of refactoring the tests. They will need to be
-# restored at some point.
-# include("utils_AD.jl")
-# include("test_AD.jl")
+    @testset "matrix" begin
+        include(joinpath("matrix", "kernelmatrix.jl"))
+        include(joinpath("matrix", "kernelkroneckermat.jl"))
+        include(joinpath("matrix", "kernelpdmat.jl"))
+    end
+
+    @testset "approximations" begin
+        include(joinpath("approximations", "nystrom.jl"))
+    end
+
+    include("generic.jl")
+    include("zygote_adjoints.jl")
+    include("trainable.jl")
+end

From f1000b3b20f50fb913caa501a511b6d400bf7248 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sat, 16 May 2020 20:22:35 +0200
Subject: [PATCH 19/34] Fixed tests and added adjoint tests

---
 test/basekernels/matern.jl |  2 +-
 test/kernels/kernelsum.jl  |  2 +-
 test/runtests.jl           |  1 +
 test/utils_AD.jl           |  6 +++++-
 test/zygote_adjoints.jl    | 20 +++++++++++++-------
 5 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/test/basekernels/matern.jl b/test/basekernels/matern.jl
index b519686f4..a37ea29ba 100644
--- a/test/basekernels/matern.jl
+++ b/test/basekernels/matern.jl
@@ -14,7 +14,7 @@
         @test metric(MaternKernel()) == Euclidean()
         @test metric(MaternKernel(ν=2.0)) == Euclidean()
         @test repr(k) == "Matern Kernel (ν = $(ν))"
-        test_ADs(x->MaternKernel(nu=first(x)),[ν])
+        # test_ADs(x->MaternKernel(nu=first(x)),[ν])
         @test_broken "All fails (because of logabsgamma for ForwardDiff and ReverseDiff and because of nu for Zygote)"
     end
     @testset "Matern32Kernel" begin
diff --git a/test/kernels/kernelsum.jl b/test/kernels/kernelsum.jl
index 0c864be8b..6647fa466 100644
--- a/test/kernels/kernelsum.jl
+++ b/test/kernels/kernelsum.jl
@@ -53,6 +53,6 @@
             @test kerneldiagmatrix!(tmp_diag, k, x) ≈ kerneldiagmatrix(k, x)
         end
     end
-    test_ADs(x->KernelSum([SqExponentialKernel(),LinearKernel(c= x[1])], x[2:3]), rand(3))#, ADs = [:ForwardDiff, :ReverseDiff])
+    test_ADs(x->KernelSum([SqExponentialKernel(),LinearKernel(c= x[1])], x[2:3]), rand(3), ADs = [:ForwardDiff, :ReverseDiff])
     @test_broken "Zygote failing because of mutating array"
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 262238bfc..cc6502776 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,7 @@ using PDMats
 using Random
 using SpecialFunctions
 using Test
+using Flux: params
 import Zygote, ForwardDiff, ReverseDiff, FiniteDifferences
 
 using KernelFunctions: metric, kappa, ColVecs, RowVecs
diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 9d1c8ca69..7732b1fd1 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -4,7 +4,11 @@ FDM = FiniteDifferences.central_fdm(5, 1)
 function gradient(::Val{:Zygote}, f::Function, args)
     g = first(Zygote.gradient(f, args))
     if isnothing(g)
-        return zeros(size(args)) # To respect the same output as other ADs
+        if args isa AbstractArray{<:Real}
+            return zeros(size(args)) # To respect the same output as other ADs
+        else
+            return zeros.(size.(args))
+        end
     else
         return g
     end
diff --git a/test/zygote_adjoints.jl b/test/zygote_adjoints.jl
index e81cb7097..46abf83e1 100644
--- a/test/zygote_adjoints.jl
+++ b/test/zygote_adjoints.jl
@@ -3,18 +3,24 @@
     rng = MersenneTwister(123456)
     x = rand(rng, 5)
     y = rand(rng, 5)
+    r = rand(rng, 5)
 
-    gzeucl = first(Zygote.gradient(xy->evaluate(Euclidean(),xy[1],xy[2]),[x,y]))
-    gzsqeucl =  first(Zygote.gradient(xy->evaluate(SqEuclidean(),xy[1],xy[2]),[x,y]))
-    gzdotprod = first(Zygote.gradient(xy->evaluate(KernelFunctions.DotProduct(),xy[1],xy[2]),[x,y]))
+    gzeucl = gradient(Val(:Zygote), xy -> evaluate(Euclidean(), xy[1], xy[2]), [x,y])
+    gzsqeucl =  gradient(Val(:Zygote), xy -> evaluate(SqEuclidean(), xy[1], xy[2]), [x,y])
+    gzdotprod = gradient(Val(:Zygote), xy -> evaluate(KernelFunctions.DotProduct(), xy[1], xy[2]), [x,y])
+    gzdelta = gradient(Val(:Zygote), xy -> evaluate(KernelFunctions.Delta(), xy[1], xy[2]), [x,y])
+    gzsinus = gradient(Val(:Zygote), xy -> evaluate(KernelFunctions.Sinus(r), xy[1], xy[2]), [x,y])
 
-    FDM = central_fdm(5,1)
+    gfeucl = gradient(Val(:FiniteDiff), xy -> evaluate(Euclidean(), xy[1], xy[2]), [x,y])
+    gfsqeucl = gradient(Val(:FiniteDiff), xy -> evaluate(SqEuclidean(), xy[1], xy[2]), [x,y])
+    gfdotprod = gradient(Val(:FiniteDiff), xy -> evaluate(KernelFunctions.DotProduct(), xy[1], xy[2]), [x,y])
+    gfdelta = gradient(Val(:FiniteDiff), xy -> evaluate(KernelFunctions.Delta(), xy[1], xy[2]), [x,y])
+    gfsinus = gradient(Val(:FiniteDiff), xy -> evaluate(KernelFunctions.Sinus(r), xy[1], xy[2]), [x,y])
 
-    gfeucl = collect(first(FiniteDifferences.grad(FDM,xy->evaluate(Euclidean(),xy[1],xy[2]),(x,y))))
-    gfsqeucl = collect(first(FiniteDifferences.grad(FDM,xy->evaluate(SqEuclidean(),xy[1],xy[2]),(x,y))))
-    gfdotprod =collect(first(FiniteDifferences.grad(FDM,xy->evaluate(KernelFunctions.DotProduct(),xy[1],xy[2]),(x,y))))
 
     @test all(gzeucl .≈ gfeucl)
     @test all(gzsqeucl .≈ gfsqeucl)
     @test all(gzdotprod .≈ gfdotprod)
+    @test all(gzdelta .≈ gfdelta)
+    @test all(gzsinus .≈ gfsinus)
 end

From 402336598c46a427735db45da78eedf2a6b59bef Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sun, 17 May 2020 12:43:04 +0200
Subject: [PATCH 20/34] Fixing issues in tests and adding some output to avoid
 travis-ci problems

---
 test/basekernels/periodic.jl | 2 +-
 test/runtests.jl             | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/test/basekernels/periodic.jl b/test/basekernels/periodic.jl
index 0fd6e6876..0e7bfacba 100644
--- a/test/basekernels/periodic.jl
+++ b/test/basekernels/periodic.jl
@@ -7,6 +7,6 @@
     @test k(v1, v2) == k(v2, v1)
     @test PeriodicKernel(3)(v1, v2) == PeriodicKernel(r = ones(3))(v1, v2)
     @test repr(k) == "Periodic Kernel, length(r) = $(length(r)))"
-    test_ADs(r->PeriodicKernel(r =r), r, ADs = [:ForwardDiff, :ReverseDiff])
+    test_ADs(r->PeriodicKernel(r =exp.(r)), log.(r), ADs = [:ForwardDiff, :ReverseDiff])
     @test_broken "Undefined adjoint for Sinus metric"
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index cc6502776..396c7d381 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,7 +6,7 @@ using PDMats
 using Random
 using SpecialFunctions
 using Test
-using Flux: params
+using Flux: params, Chain, Dense
 import Zygote, ForwardDiff, ReverseDiff, FiniteDifferences
 
 using KernelFunctions: metric, kappa, ColVecs, RowVecs
@@ -40,16 +40,19 @@ using KernelFunctions: metric, kappa, ColVecs, RowVecs
 #   disable tests by simply commenting them out, and makes it very clear which tests are not
 #   currently being run.
 # 10. If utility files are required.
+@info "Packages Loaded"
 
 @testset "KernelFunctions" begin
 
     include("utils.jl")
     include("utils_AD.jl")
+
     @testset "distances" begin
         include(joinpath("distances", "dotproduct.jl"))
         include(joinpath("distances", "delta.jl"))
         include(joinpath("distances", "sinus.jl"))
     end
+    @info "Ran tests on Distances"
 
     @testset "transform" begin
         include(joinpath("transform", "transform.jl"))
@@ -60,6 +63,7 @@ using KernelFunctions: metric, kappa, ColVecs, RowVecs
         include(joinpath("transform", "selecttransform.jl"))
         include(joinpath("transform", "chaintransform.jl"))
     end
+    @info "Ran tests on Transform"
 
     @testset "basekernels" begin
         include(joinpath("basekernels", "constant.jl"))
@@ -78,6 +82,7 @@ using KernelFunctions: metric, kappa, ColVecs, RowVecs
         include(joinpath("basekernels", "sm.jl"))
         include(joinpath("basekernels", "wiener.jl"))
     end
+    @info "Ran tests on BaseKernel"
 
     @testset "kernels" begin
         include(joinpath("kernels", "kernelproduct.jl"))
@@ -90,12 +95,14 @@ using KernelFunctions: metric, kappa, ColVecs, RowVecs
         # helpful these are.
         include(joinpath("kernels", "custom.jl"))
     end
+    @info "Ran tests on Kernel"
 
     @testset "matrix" begin
         include(joinpath("matrix", "kernelmatrix.jl"))
         include(joinpath("matrix", "kernelkroneckermat.jl"))
         include(joinpath("matrix", "kernelpdmat.jl"))
     end
+    @info "Ran tests on matrix"
 
     @testset "approximations" begin
         include(joinpath("approximations", "nystrom.jl"))

From a73133b6af5ac58dba50cad2fe5380047ee829a7 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Sun, 17 May 2020 13:45:17 +0200
Subject: [PATCH 21/34] Relaxed tolerance

---
 test/utils_AD.jl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 7732b1fd1..b6e14fad0 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -91,20 +91,20 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
         # Testing kernel evaluations
         x = rand(rng, dims[1])
         y = rand(rng, dims[1])
-        @test gradient(Val(AD), x -> k(x, y), x) ≈ gradient(Val(:FiniteDiff), x -> k(x, y), x) atol=1e-8
-        @test gradient(Val(AD), y -> k(x, y), y) ≈ gradient(Val(:FiniteDiff), y -> k(x, y), y) atol=1e-8
+        @test gradient(Val(AD), x -> k(x, y), x) ≈ gradient(Val(:FiniteDiff), x -> k(x, y), x) rtol=1e-5
+        @test gradient(Val(AD), y -> k(x, y), y) ≈ gradient(Val(:FiniteDiff), y -> k(x, y), y) rtol=1e-5
         if !(args === nothing)
-            @test gradient(Val(AD), p -> kernelfunction(p)(x,y), args) ≈ gradient(Val(:FiniteDiff), p -> kernelfunction(p)(x, y), args) atol=1e-8
+            @test gradient(Val(AD), p -> kernelfunction(p)(x,y), args) ≈ gradient(Val(:FiniteDiff), p -> kernelfunction(p)(x, y), args) rtol=1e-5
         end
         # Testing kernel matrices
         A = rand(rng, dims...)
         B = rand(rng, dims...)
         for dim in 1:2
-            @test gradient(Val(AD), x -> testfunction(k, x, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, dim), A) atol=1e-8
-            @test gradient(Val(AD), a -> testfunction(k, a, B, dim), A) ≈ gradient(Val(:FiniteDiff), a -> testfunction(k, a, B, dim), A) atol=1e-8
-            @test gradient(Val(AD), b -> testfunction(k, A, b, dim), B) ≈ gradient(Val(:FiniteDiff), b -> testfunction(k, A, b, dim), B) atol=1e-8
+            @test gradient(Val(AD), x -> testfunction(k, x, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, dim), A) rtol=1e-5
+            @test gradient(Val(AD), a -> testfunction(k, a, B, dim), A) ≈ gradient(Val(:FiniteDiff), a -> testfunction(k, a, B, dim), A) rtol=1e-5
+            @test gradient(Val(AD), b -> testfunction(k, A, b, dim), B) ≈ gradient(Val(:FiniteDiff), b -> testfunction(k, A, b, dim), B) rtol=1e-5
             if !(args === nothing)
-                @test gradient(Val(AD), p -> testfunction(kernelfunction(p), A, dim), args) ≈ gradient(Val(:FiniteDiff), p -> testfunction(kernelfunction(p), A, dim), args) atol=1e-8
+                @test gradient(Val(AD), p -> testfunction(kernelfunction(p), A, dim), args) ≈ gradient(Val(:FiniteDiff), p -> testfunction(kernelfunction(p), A, dim), args) rtol=1e-5
             end
         end
     end

From d586967a40aec83d03ad065bcfd257b23ce9de9c Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Fri, 22 May 2020 12:05:45 +0200
Subject: [PATCH 22/34] Added atol for test (for comparisons around 0)

---
 test/utils_AD.jl | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index b6e14fad0..1d9426be7 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -85,26 +85,26 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
         rng = MersenneTwister(42)
         if k isa SimpleKernel
             for d in log.([eps(), rand(rng)])
-                @test gradient(Val(AD), x -> kappa(k, exp(x[1])), [d]) ≈ gradient(Val(:FiniteDiff), x -> kappa(k, exp(x[1])), [d]) atol=1e-8
+                @test gradient(Val(AD), x -> kappa(k, exp(x[1])), [d]) ≈ gradient(Val(:FiniteDiff), x -> kappa(k, exp(x[1])), [d]) atol=1e-8 rtol=1e-5
             end
         end
         # Testing kernel evaluations
         x = rand(rng, dims[1])
         y = rand(rng, dims[1])
-        @test gradient(Val(AD), x -> k(x, y), x) ≈ gradient(Val(:FiniteDiff), x -> k(x, y), x) rtol=1e-5
-        @test gradient(Val(AD), y -> k(x, y), y) ≈ gradient(Val(:FiniteDiff), y -> k(x, y), y) rtol=1e-5
+        @test gradient(Val(AD), x -> k(x, y), x) ≈ gradient(Val(:FiniteDiff), x -> k(x, y), x) atol=1e-8 rtol=1e-5
+        @test gradient(Val(AD), y -> k(x, y), y) ≈ gradient(Val(:FiniteDiff), y -> k(x, y), y) atol=1e-8 rtol=1e-5
         if !(args === nothing)
-            @test gradient(Val(AD), p -> kernelfunction(p)(x,y), args) ≈ gradient(Val(:FiniteDiff), p -> kernelfunction(p)(x, y), args) rtol=1e-5
+            @test gradient(Val(AD), p -> kernelfunction(p)(x,y), args) ≈ gradient(Val(:FiniteDiff), p -> kernelfunction(p)(x, y), args) atol=1e-8 rtol=1e-5
         end
         # Testing kernel matrices
         A = rand(rng, dims...)
         B = rand(rng, dims...)
         for dim in 1:2
-            @test gradient(Val(AD), x -> testfunction(k, x, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, dim), A) rtol=1e-5
-            @test gradient(Val(AD), a -> testfunction(k, a, B, dim), A) ≈ gradient(Val(:FiniteDiff), a -> testfunction(k, a, B, dim), A) rtol=1e-5
-            @test gradient(Val(AD), b -> testfunction(k, A, b, dim), B) ≈ gradient(Val(:FiniteDiff), b -> testfunction(k, A, b, dim), B) rtol=1e-5
+            @test gradient(Val(AD), x -> testfunction(k, x, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, dim), A) atol=1e-8 rtol=1e-5
+            @test gradient(Val(AD), a -> testfunction(k, a, B, dim), A) ≈ gradient(Val(:FiniteDiff), a -> testfunction(k, a, B, dim), A) atol=1e-8 rtol=1e-5
+            @test gradient(Val(AD), b -> testfunction(k, A, b, dim), B) ≈ gradient(Val(:FiniteDiff), b -> testfunction(k, A, b, dim), B) atol=1e-8 rtol=1e-5
             if !(args === nothing)
-                @test gradient(Val(AD), p -> testfunction(kernelfunction(p), A, dim), args) ≈ gradient(Val(:FiniteDiff), p -> testfunction(kernelfunction(p), A, dim), args) rtol=1e-5
+                @test gradient(Val(AD), p -> testfunction(kernelfunction(p), A, dim), args) ≈ gradient(Val(:FiniteDiff), p -> testfunction(kernelfunction(p), A, dim), args) atol=1e-8 rtol=1e-5
             end
         end
     end

From 577518fa300d530cac12ba8a5611aab853bc5ba3 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 25 May 2020 11:59:18 +0200
Subject: [PATCH 23/34] Rewrote testing code

---
 test/utils_AD.jl | 75 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 20 deletions(-)

diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 1d9426be7..95eb3761e 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -1,7 +1,9 @@
 
-FDM = FiniteDifferences.central_fdm(5, 1)
+const FDM = FiniteDifferences.central_fdm(5, 1)
 
-function gradient(::Val{:Zygote}, f::Function, args)
+gradient(f, s::Symbol, args) = gradient(f, Val(s), args)
+
+function gradient(f, ::Val{:Zygote}, args)
     g = first(Zygote.gradient(f, args))
     if isnothing(g)
         if args isa AbstractArray{<:Real}
@@ -14,18 +16,21 @@ function gradient(::Val{:Zygote}, f::Function, args)
     end
 end
 
-function gradient(::Val{:ForwardDiff}, f::Function, args)
+function gradient(f, ::Val{:ForwardDiff}, args)
     ForwardDiff.gradient(f, args)
 end
 
-function gradient(::Val{:ReverseDiff}, f::Function, args)
+function gradient(f, ::Val{:ReverseDiff}, args)
     ReverseDiff.gradient(f, args)
 end
 
-function gradient(::Val{:FiniteDiff}, f::Function, args)
+function gradient(f, ::Val{:FiniteDiff}, args)
     first(FiniteDifferences.grad(FDM, f, args))
 end
 
+function compare_gradient(f, AD::Symbol, args)
+    isapprox(gradient(f, AD, args), gradient(f, :FiniteDiff, args), atol=1e-8, rtol=1e-5)
+end
 
 testfunction(k, A, B, dim) = sum(kernelmatrix(k, A, B, obsdim = dim))
 testfunction(k, A, dim) = sum(kernelmatrix(k, A, obsdim = dim))
@@ -50,25 +55,39 @@ function test_FiniteDiff(kernelfunction, args = nothing, dims = [3, 3])
     @testset "FiniteDifferences" begin
         if k isa SimpleKernel
             for d in log.([eps(), rand(rng)])
-                @test_nowarn gradient(Val(:FiniteDiff), x -> kappa(k, exp(first(x))), [d])
+                @test_nowarn gradient(:FiniteDiff, [d]) do x
+                    kappa(k, exp(first(x)))
+                end
             end
         end
         ## Testing Kernel Functions
         x = rand(rng, dims[1])
         y = rand(rng, dims[1])
-        @test_nowarn gradient(Val(:FiniteDiff), x -> k(x, y), x)
+        @test_nowarn gradient(:FiniteDiff, x) do x
+                k(x, y)
+            end
         if !(args === nothing)
-            @test_nowarn gradient(Val(:FiniteDiff), p -> kernelfunction(p)(x, y), args)
+            @test_nowarn gradient(:FiniteDiff, args) do p
+                kernelfunction(p)(x, y)
+            end
         end
         ## Testing Kernel Matrices
         A = rand(rng, dims...)
         B = rand(rng, dims...)
         for dim in 1:2
-            @test_nowarn gradient(Val(:FiniteDiff), a -> testfunction(k, a, dim), A)
-            @test_nowarn gradient(Val(:FiniteDiff), a -> testfunction(k, a, B, dim), A)
-            @test_nowarn gradient(Val(:FiniteDiff), b -> testfunction(k, A, b, dim), B)
+            @test_nowarn gradient(:FiniteDiff, A) do a
+                testfunction(k, a, dim)
+            end
+            @test_nowarn gradient(:FiniteDiff , A) do a
+                testfunction(k, a, B, dim)
+            end
+            @test_nowarn gradient(:FiniteDiff, B) do b
+                testfunction(k, A, b, dim)
+            end
             if !(args === nothing)
-                @test_nowarn gradient(Val(:FiniteDiff), p -> testfunction(kernelfunction(p), A, B, dim), args)
+                @test_nowarn gradient(:FiniteDiff, args) do p
+                    testfunction(kernelfunction(p), A, B, dim)
+                end
             end
         end
     end
@@ -85,26 +104,42 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
         rng = MersenneTwister(42)
         if k isa SimpleKernel
             for d in log.([eps(), rand(rng)])
-                @test gradient(Val(AD), x -> kappa(k, exp(x[1])), [d]) ≈ gradient(Val(:FiniteDiff), x -> kappa(k, exp(x[1])), [d]) atol=1e-8 rtol=1e-5
+                @test compare_gradient(AD, [d]) do x
+                    kappa(k, exp(x[1])
+                end
             end
         end
         # Testing kernel evaluations
         x = rand(rng, dims[1])
         y = rand(rng, dims[1])
-        @test gradient(Val(AD), x -> k(x, y), x) ≈ gradient(Val(:FiniteDiff), x -> k(x, y), x) atol=1e-8 rtol=1e-5
-        @test gradient(Val(AD), y -> k(x, y), y) ≈ gradient(Val(:FiniteDiff), y -> k(x, y), y) atol=1e-8 rtol=1e-5
+        @test compare_gradient(AD, x) do x
+            k(x, y)
+        end
+        @test compare_gradient(AD, y) do y
+            k(x, y)
+        end
         if !(args === nothing)
-            @test gradient(Val(AD), p -> kernelfunction(p)(x,y), args) ≈ gradient(Val(:FiniteDiff), p -> kernelfunction(p)(x, y), args) atol=1e-8 rtol=1e-5
+            @test compare_gradient(AD, args) do p
+                kernelfunction(p)(x,y)
+            end
         end
         # Testing kernel matrices
         A = rand(rng, dims...)
         B = rand(rng, dims...)
         for dim in 1:2
-            @test gradient(Val(AD), x -> testfunction(k, x, dim), A) ≈ gradient(Val(:FiniteDiff), x -> testfunction(k, x, dim), A) atol=1e-8 rtol=1e-5
-            @test gradient(Val(AD), a -> testfunction(k, a, B, dim), A) ≈ gradient(Val(:FiniteDiff), a -> testfunction(k, a, B, dim), A) atol=1e-8 rtol=1e-5
-            @test gradient(Val(AD), b -> testfunction(k, A, b, dim), B) ≈ gradient(Val(:FiniteDiff), b -> testfunction(k, A, b, dim), B) atol=1e-8 rtol=1e-5
+            @test compare_gradient(AD, A) do a
+                testfunction(k, a, dim)
+            end
+            @test conpare_gradient(AD, A) do a
+                testfunction(k, a, B, dim)
+            end
+            @test compare_gradient(AD, B) do b
+                testfunction(k, A, b, dim)
+            end
             if !(args === nothing)
-                @test gradient(Val(AD), p -> testfunction(kernelfunction(p), A, dim), args) ≈ gradient(Val(:FiniteDiff), p -> testfunction(kernelfunction(p), A, dim), args) atol=1e-8 rtol=1e-5
+                @test compare_gradient(AD, args) do p
+                    testfunction(kernelfunction(p), AD, A, dim)
+                end
             end
         end
     end

From 9d82e1cf4eceea03f1eb637de49517b678b19c95 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 25 May 2020 12:00:23 +0200
Subject: [PATCH 24/34] Put a seed for FBM tests

---
 test/basekernels/fbm.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/basekernels/fbm.jl b/test/basekernels/fbm.jl
index 53bbd99f2..77ed3b537 100644
--- a/test/basekernels/fbm.jl
+++ b/test/basekernels/fbm.jl
@@ -1,12 +1,13 @@
 @testset "FBM" begin
+    rng = MersenneTwister(42)
     h = 0.3
     k = FBMKernel(h = h)
-    v1 = rand(3); v2 = rand(3)
+    v1 = rand(rng, 3); v2 = rand(rng, 3)
     @test k(v1,v2) ≈ (sqeuclidean(v1, zero(v1))^h + sqeuclidean(v2, zero(v2))^h - sqeuclidean(v1-v2, zero(v1-v2))^h)/2 atol=1e-5
 
     # kernelmatrix tests
-    m1 = rand(3,3)
-    m2 = rand(3,3)
+    m1 = rand(rng, 3, 3)
+    m2 = rand(rng, 3, 3)
     Kref = kernelmatrix(k, m1, m1)
     @test kernelmatrix(k, m1) ≈ Kref atol=1e-5
     K = zeros(3, 3)
@@ -16,8 +17,8 @@
     kernelmatrix!(K, k, m1)
     @test K ≈ Kref atol=1e-5
 
-    x1 = rand()
-    x2 = rand()
+    x1 = rand(rng)
+    x2 = rand(rng)
     @test kernelmatrix(k, x1*ones(1,1), x2*ones(1,1))[1] ≈ k(x1, x2) atol=1e-5
 
     @test repr(k) == "Fractional Brownian Motion Kernel (h = $(h))"

From 181341e4a3706d496790c9145a70a1927b82618d Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 25 May 2020 12:04:11 +0200
Subject: [PATCH 25/34] Remove adjoint for Sinus

---
 src/zygote_adjoints.jl | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/zygote_adjoints.jl b/src/zygote_adjoints.jl
index 38f131cb9..a95be8142 100644
--- a/src/zygote_adjoints.jl
+++ b/src/zygote_adjoints.jl
@@ -59,16 +59,6 @@ end
   end
 end
 
-@adjoint function pairwise(s::Sinus, X::AbstractMatrix, Y::AbstractMatrix; dims=2)
-    D = pairwise(d, X, Y; dims = dims)
-    throw(error("Sinus metric has no defined adjoint for now... PR welcome!"))
-end
-
-@adjoint function pairwise(s::Sinus, X::AbstractMatrix; dims=2)
-  D = pairwise(d, X; dims = dims)
-  throw(error("Sinus metric has no defined adjoint for now... PR welcome!"))
-end
-
 @adjoint function loggamma(x)
     first(logabsgamma(x)) , Δ -> (Δ .* polygamma(0, x), )
 end

From 88c6af716acd04d31484b17505770d27039c1897 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 25 May 2020 12:05:15 +0200
Subject: [PATCH 26/34] Import all Flux functions

---
 test/runtests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 396c7d381..d0ea3e3c5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,7 +6,7 @@ using PDMats
 using Random
 using SpecialFunctions
 using Test
-using Flux: params, Chain, Dense
+using Flux
 import Zygote, ForwardDiff, ReverseDiff, FiniteDifferences
 
 using KernelFunctions: metric, kappa, ColVecs, RowVecs

From aa282a1803a7a6ad2df0a4d7b279f6e73e1344c6 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 25 May 2020 12:20:17 +0200
Subject: [PATCH 27/34] Fix parenthesis missing

---
 test/utils_AD.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 95eb3761e..5763e015f 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -105,7 +105,7 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
         if k isa SimpleKernel
             for d in log.([eps(), rand(rng)])
                 @test compare_gradient(AD, [d]) do x
-                    kappa(k, exp(x[1])
+                    kappa(k, exp(x[1]))
                 end
             end
         end

From ffefd1fec754c24a283ce0db8ff06476f9a626b7 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 25 May 2020 12:42:13 +0200
Subject: [PATCH 28/34] Fixed some stupid testing bugs

---
 test/utils_AD.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index 5763e015f..e5f4a7562 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -130,7 +130,7 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
             @test compare_gradient(AD, A) do a
                 testfunction(k, a, dim)
             end
-            @test conpare_gradient(AD, A) do a
+            @test compare_gradient(AD, A) do a
                 testfunction(k, a, B, dim)
             end
             @test compare_gradient(AD, B) do b
@@ -138,7 +138,7 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
             end
             if !(args === nothing)
                 @test compare_gradient(AD, args) do p
-                    testfunction(kernelfunction(p), AD, A, dim)
+                    testfunction(kernelfunction(p), A, dim)
                 end
             end
         end

From 6b5ba4d945bc136788d13fdeafed31411d3982f0 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 25 May 2020 13:49:25 +0200
Subject: [PATCH 29/34] Corrected Tests Zygote Adjoints

---
 test/zygote_adjoints.jl | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/test/zygote_adjoints.jl b/test/zygote_adjoints.jl
index 46abf83e1..5e9447b37 100644
--- a/test/zygote_adjoints.jl
+++ b/test/zygote_adjoints.jl
@@ -5,17 +5,37 @@
     y = rand(rng, 5)
     r = rand(rng, 5)
 
-    gzeucl = gradient(Val(:Zygote), xy -> evaluate(Euclidean(), xy[1], xy[2]), [x,y])
-    gzsqeucl =  gradient(Val(:Zygote), xy -> evaluate(SqEuclidean(), xy[1], xy[2]), [x,y])
-    gzdotprod = gradient(Val(:Zygote), xy -> evaluate(KernelFunctions.DotProduct(), xy[1], xy[2]), [x,y])
-    gzdelta = gradient(Val(:Zygote), xy -> evaluate(KernelFunctions.Delta(), xy[1], xy[2]), [x,y])
-    gzsinus = gradient(Val(:Zygote), xy -> evaluate(KernelFunctions.Sinus(r), xy[1], xy[2]), [x,y])
+    gzeucl = gradient(:Zygote, [x,y]) do xy
+        evaluate(Euclidean(), xy[1], xy[2])
+    end
+    gzsqeucl = gradient(:Zygote, [x,y]) do xy
+        evaluate(SqEuclidean(), xy[1], xy[2])
+    end
+    gzdotprod = gradient(:Zygote, [x,y]) do xy
+        evaluate(KernelFunctions.DotProduct(), xy[1], xy[2])
+    end
+    gzdelta = gradient(:Zygote, [x,y]) do xy
+        evaluate(KernelFunctions.Delta(), xy[1], xy[2])
+    end
+    gzsinus = gradient(:Zygote, [x,y]) do xy
+        evaluate(KernelFunctions.Sinus(r), xy[1], xy[2])
+    end
 
-    gfeucl = gradient(Val(:FiniteDiff), xy -> evaluate(Euclidean(), xy[1], xy[2]), [x,y])
-    gfsqeucl = gradient(Val(:FiniteDiff), xy -> evaluate(SqEuclidean(), xy[1], xy[2]), [x,y])
-    gfdotprod = gradient(Val(:FiniteDiff), xy -> evaluate(KernelFunctions.DotProduct(), xy[1], xy[2]), [x,y])
-    gfdelta = gradient(Val(:FiniteDiff), xy -> evaluate(KernelFunctions.Delta(), xy[1], xy[2]), [x,y])
-    gfsinus = gradient(Val(:FiniteDiff), xy -> evaluate(KernelFunctions.Sinus(r), xy[1], xy[2]), [x,y])
+    gfeucl = gradient(:FiniteDiff, [x,y]) do xy
+        evaluate(Euclidean(), xy[1], xy[2])
+    end
+    gfsqeucl = gradient(:FiniteDiff, [x,y]) do xy
+        evaluate(SqEuclidean(), xy[1], xy[2])
+    end
+    gfdotprod = gradient(:FiniteDiff, [x,y]) do xy
+        evaluate(KernelFunctions.DotProduct(), xy[1], xy[2])
+    end
+    gfdelta = gradient(:FiniteDiff, [x,y]) do xy
+        evaluate(KernelFunctions.Delta(), xy[1], xy[2])
+    end
+    gfsinus = gradient(:FiniteDiff, [x,y]) do xy
+        evaluate(KernelFunctions.Sinus(r), xy[1], xy[2])
+    end
 
 
     @test all(gzeucl .≈ gfeucl)

From b6ddf527fbc83c82f11807fd978afd38987dec59 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 25 May 2020 15:58:31 +0200
Subject: [PATCH 30/34] Clearer failing messages

---
 test/utils_AD.jl | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/test/utils_AD.jl b/test/utils_AD.jl
index e5f4a7562..1354485f9 100644
--- a/test/utils_AD.jl
+++ b/test/utils_AD.jl
@@ -29,7 +29,9 @@ function gradient(f, ::Val{:FiniteDiff}, args)
 end
 
 function compare_gradient(f, AD::Symbol, args)
-    isapprox(gradient(f, AD, args), gradient(f, :FiniteDiff, args), atol=1e-8, rtol=1e-5)
+    grad_AD = gradient(f, AD, args)
+    grad_FD = gradient(f, :FiniteDiff, args)
+    @test grad_AD ≈ grad_FD atol=1e-8 rtol=1e-5
 end
 
 testfunction(k, A, B, dim) = sum(kernelmatrix(k, A, B, obsdim = dim))
@@ -104,7 +106,7 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
         rng = MersenneTwister(42)
         if k isa SimpleKernel
             for d in log.([eps(), rand(rng)])
-                @test compare_gradient(AD, [d]) do x
+                compare_gradient(AD, [d]) do x
                     kappa(k, exp(x[1]))
                 end
             end
@@ -112,14 +114,14 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
         # Testing kernel evaluations
         x = rand(rng, dims[1])
         y = rand(rng, dims[1])
-        @test compare_gradient(AD, x) do x
+        compare_gradient(AD, x) do x
             k(x, y)
         end
-        @test compare_gradient(AD, y) do y
+        compare_gradient(AD, y) do y
             k(x, y)
         end
         if !(args === nothing)
-            @test compare_gradient(AD, args) do p
+            compare_gradient(AD, args) do p
                 kernelfunction(p)(x,y)
             end
         end
@@ -127,17 +129,17 @@ function test_AD(AD::Symbol, kernelfunction, args = nothing, dims = [3, 3])
         A = rand(rng, dims...)
         B = rand(rng, dims...)
         for dim in 1:2
-            @test compare_gradient(AD, A) do a
+            compare_gradient(AD, A) do a
                 testfunction(k, a, dim)
             end
-            @test compare_gradient(AD, A) do a
+            compare_gradient(AD, A) do a
                 testfunction(k, a, B, dim)
             end
-            @test compare_gradient(AD, B) do b
+            compare_gradient(AD, B) do b
                 testfunction(k, A, b, dim)
             end
             if !(args === nothing)
-                @test compare_gradient(AD, args) do p
+                compare_gradient(AD, args) do p
                     testfunction(kernelfunction(p), A, dim)
                 end
             end

From 5c7eb6a420f5a1a1a5c07430edf5b869554b1eaf Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Fri, 12 Jun 2020 19:16:36 +0200
Subject: [PATCH 31/34] Adding Project.toml to test folder

---
 .github/workflows/CompatHelper.yml |  2 +-
 Project.toml                       | 14 --------------
 test/Project.toml                  | 21 +++++++++++++++++++++
 3 files changed, 22 insertions(+), 15 deletions(-)
 create mode 100644 test/Project.toml

diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index dd821e683..cdeee2dba 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -16,4 +16,4 @@ jobs:
       - name: CompatHelper.main()
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: julia -e 'using CompatHelper; CompatHelper.main()'
+        run: julia -e 'using CompatHelper; CompatHelper.main(; subdirs = ["", "test"])'
diff --git a/Project.toml b/Project.toml
index a43efa1ff..ea52d71a3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -22,17 +22,3 @@ StatsBase = "0.32, 0.33"
 StatsFuns = "0.8, 0.9"
 ZygoteRules = "0.2"
 julia = "1.3"
-
-[extras]
-FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
-Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
-PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
-Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
-
-[targets]
-test = ["Random", "Test", "FiniteDifferences", "Zygote", "ReverseDiff", "ForwardDiff", "PDMats", "Kronecker", "Flux"]
diff --git a/test/Project.toml b/test/Project.toml
new file mode 100644
index 000000000..0a504f64b
--- /dev/null
+++ b/test/Project.toml
@@ -0,0 +1,21 @@
+[deps]
+Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+KernelFunctions = "ec8451be-7e33-11e9-00cf-bbf324bd1392"
+Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+
+[compat]
+Distances = "0.9"
+FiniteDifferences = "0.10"
+Flux = "0.10"
+Kronecker = "0.4"
+PDMats = "0.9"
+SpecialFunctions = "0.10"
+Zygote = "0.4"

From a4e5bb2b814cbfa3bd527b2e0218689d4a0fc2c2 Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Fri, 12 Jun 2020 19:31:41 +0200
Subject: [PATCH 32/34] Missing ForwardDiff and removed KernelFunctions

---
 test/Project.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/Project.toml b/test/Project.toml
index 0a504f64b..c09563ca1 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -2,7 +2,7 @@
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-KernelFunctions = "ec8451be-7e33-11e9-00cf-bbf324bd1392"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
@@ -15,6 +15,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 Distances = "0.9"
 FiniteDifferences = "0.10"
 Flux = "0.10"
+ForwardDiff = "0.10"
 Kronecker = "0.4"
 PDMats = "0.9"
 SpecialFunctions = "0.10"

From 686ad8c243d6cfc6674fdf58279a027040a21daa Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Fri, 12 Jun 2020 19:32:46 +0200
Subject: [PATCH 33/34] Missing ReverseDiff

---
 test/Project.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/Project.toml b/test/Project.toml
index c09563ca1..ba243cd37 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -7,6 +7,7 @@ Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
@@ -18,5 +19,6 @@ Flux = "0.10"
 ForwardDiff = "0.10"
 Kronecker = "0.4"
 PDMats = "0.9"
+ReverseDiff = "1.2"
 SpecialFunctions = "0.10"
 Zygote = "0.4"

From e94973eff03fdcae18364e2a263cdc42c1cc68ed Mon Sep 17 00:00:00 2001
From: Theo Galy-Fajou <theo.galyfajou@gmail.com>
Date: Mon, 15 Jun 2020 11:49:56 +0200
Subject: [PATCH 34/34] Removed passing tests for PeriodicKernel

---
 test/basekernels/periodic.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/basekernels/periodic.jl b/test/basekernels/periodic.jl
index 0e7bfacba..a4a2459db 100644
--- a/test/basekernels/periodic.jl
+++ b/test/basekernels/periodic.jl
@@ -7,6 +7,6 @@
     @test k(v1, v2) == k(v2, v1)
     @test PeriodicKernel(3)(v1, v2) == PeriodicKernel(r = ones(3))(v1, v2)
     @test repr(k) == "Periodic Kernel, length(r) = $(length(r)))"
-    test_ADs(r->PeriodicKernel(r =exp.(r)), log.(r), ADs = [:ForwardDiff, :ReverseDiff])
-    @test_broken "Undefined adjoint for Sinus metric"
+    # test_ADs(r->PeriodicKernel(r =exp.(r)), log.(r), ADs = [:ForwardDiff, :ReverseDiff])
+    @test_broken "Undefined adjoint for Sinus metric, and failing randomly for ForwardDiff and ReverseDiff"
 end