From fcda0c6785afd6ea8c20509c4bb4321a1340d30d Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Thu, 4 Jun 2020 17:59:57 +0200 Subject: [PATCH 01/13] Draft for example scripts --- examples/deepkernellearning.jl | 73 +++++++++++++++++-------- examples/kernelridgeregression.jl | 91 ++++++++++++++++++++----------- examples/svm.jl | 40 ++++++++------ src/kernels/transformedkernel.jl | 6 +- 4 files changed, 136 insertions(+), 74 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index b77f0d13e..50c433705 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -3,33 +3,58 @@ using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra -using Plots +using Plots; +pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) +using SliceMap -Flux.@functor SqExponentialKernel -Flux.@functor KernelSum -Flux.@functor Matern32Kernel -Flux.@functor FunctionTransform +neuralnet = Chain(Dense(1, 10), Dense(10, 2)) + +Base.map( + t::KernelFunctions.FunctionTransform, + X::AbstractVector; + obsdim::Int = defaultobs, +) where {T} = + slicemap(X, dims = 1) do x + t.f(x) + end -neuralnet = Chain(Dense(1,3),Dense(3,2)) -k = SqExponentialKernel(FunctionTransform(neuralnet)) xmin = -3; xmax = 3 -x = range(xmin,xmax,length=100) -x_test = rand(Uniform(xmin,xmax),200) -x,y = noisy_function(sinc,x;noise=0.1) -X = reshape(x,:,1) -λ = [0.1] -f(x,k,λ) = kernelmatrix(k,X,x,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y -f(X,k,1.0) -loss(k,λ) = f(X,k,λ) |> ŷ ->sum(y-ŷ)/length(y)+exp(λ[1])*norm(ŷ) -loss(k,λ) + +x_train = range(xmin, xmax, length = 100) +x_test = rand(Uniform(xmin, xmax), 200) +x_train, y = noisy_function(x_train; noise = 0.01) do x + sinc(abs(x) ^ abs(x)) +end +X = reshape(x_train, :, 1) +k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) + +λ = log.([1.0]) +function f(x, k, λ) + K = kernelmatrix(k, X, x, obsdim = 1) + return K * inv(K + exp(λ[1]) * I) * y +end +f(X, k, 1.0) +loss(k, λ) = f(X, k, λ) |> ŷ -> sum(abs2, y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ) + +@info "Init Loss = $(loss(k, λ))" + ps = Flux.params(k) -# push!(ps,λ) -opt = Flux.Momentum(1.0) +push!(ps,λ) + +p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") +Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display ## -for i in 1:10 - grads = Zygote.gradient(()->loss(k,λ),ps) - Flux.Optimise.update!(opt,ps,grads) - p = Plots.scatter(x,y,lab="data",title="Loss = $(loss(k,λ))") - Plots.plot!(x,f(X,k,λ),lab="Prediction",lw=3.0) - display(p) +anim = Animation() +nmax= 10 +opt = Flux.ADAGrad(0.001) +@progress for i = 1:nmax + grads = Zygote.gradient(() -> loss(k, λ), ps) + Flux.Optimise.update!(opt, ps, grads) + if i % 100 == 0 + @info "$i/$nmax" + p = Plots.scatter(x, y, lab = "data", title = "Loss = $(loss(k,λ))") + Plots.plot!(x, f(X, k, λ), lab = "Prediction", lw = 3.0) + frame(anim) + end end +gif(anim, fps = 5) diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl index 5c1d61d86..d6e999e06 100644 --- a/examples/kernelridgeregression.jl +++ b/examples/kernelridgeregression.jl @@ -1,35 +1,64 @@ +# Kernel Ridge Regression using KernelFunctions -using MLDataUtils -using Zygote -using Flux -using Distributions, LinearAlgebra +using LinearAlgebra +using Distributions using Plots +using Flux: Optimise +using ForwardDiff -Flux.@functor SqExponentialKernel -Flux.@functor ScaleTransform -Flux.@functor KernelSum -Flux.@functor Matern32Kernel - -xmin = -3; xmax = 3 -x = 
range(xmin,xmax,length=100) -x_test = range(xmin,xmax,length=300) -x,y = noisy_function(sinc,x;noise=0.1) -X = reshape(x,:,1) -X_test = reshape(x_test,:,1) -k = SqExponentialKernel(1.0)#+Matern32Kernel(2.0) -λ = [-1.0] -f(x,k,λ) = kernelmatrix(k,x,X,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y -f(X,k,1.0) -loss(k,λ) = f(X,k,λ) |> ŷ ->sum(y-ŷ)/length(y)+exp(λ[1])*norm(ŷ) -loss(k,λ) -ps = Flux.params(k) -push!(ps,λ) -opt = Flux.Momentum(0.1) -## -for i in 1:10 - grads = Zygote.gradient(()->loss(k,λ),ps) - Flux.Optimise.update!(opt,ps,grads) - p = Plots.scatter(x,y,lab="data",title="Loss = $(loss(k,λ))") - Plots.plot!(x_test,f(X_test,k,λ),lab="Prediction",lw=3.0) - display(p) +# We generate our data : +xmin = -3; xmax = 3 # Bounds of the data +N = 50 # Number of samples +σ = 0.1 + +x_train = rand(Uniform(xmin, xmax), N) # We sample 100 random samples +x_test = range(xmin, xmax, length = 300) # We use x_test to show the prediction function +y = sinc.(x_train) + randn(N) * σ # We create a function and add some noise + +# Plot the data +scatter(x_train, y, lab = "data") +plot!(x_test, sinc, lab = "true function") + +# Create a function taking kernel parameters and creating a kernel +kernelcall(θ) = transform( + exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(), + exp(θ[3]), +) + +# Return the prediction given the normalization value λ +function f(x, θ) + k = kernelcall(θ[1:3]) + kernelmatrix(k, x, x_train) * + inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y +end + +# Starting with parameters [1.0, 1.0, 1.0, 1.0] we get : +ŷ = f(x_test, log.(ones(4))) +scatter(x_train, y, lab = "data") +plot!(x_test, sinc, lab = "true function") +plot!(x_test, ŷ, lab = "prediction") + +# Create a loss on the training data +function loss(θ) + ŷ = f(x_train, θ) + sum(abs2, y - ŷ) + exp(θ[4]) * norm(ŷ) +end + +# The loss with our starting point : +loss(log.(ones(4))) + +## Training model + +θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) +anim = Animation() +opt = ADAGrad(0.5) +@progress for i = 1:30 + grads = ForwardDiff.gradient(x -> loss(x), θ) # We compute the gradients given the kernel parameters and regularization + Δ = Optimise.apply!(opt, θ, grads) + θ .-= Δ # We apply a simple Gradient descent algorithm + p = scatter(x_train, y, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") + plot!(x_test, sinc, lab = "true function") + plot!(x_test, f(x_test, θ), lab = "Prediction", lw = 3.0) + frame(anim) end +gif(anim) diff --git a/examples/svm.jl b/examples/svm.jl index 823b2e1d4..30efc9519 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -5,19 +5,27 @@ using Flux using Distributions, LinearAlgebra using Plots -N = 100 #Number of samples -μ = randn(2,2) # Random Centers -xgrid = range(-3,3,length=100) # Create a grid -Xgrid = hcat(collect.(Iterators.product(xgrid,xgrid))...)' #Combine into a 2D grid -y = rand([-1,1],N) # Select randomly between the two classes -X = zeros(N,2) -X[y.==1,:] = rand(MvNormal(μ[:,1],I),count(y.==1))' #Attribute samples from class 1 -X[y.==-1,:] = rand(MvNormal(μ[:,2],I),count(y.==-1))' # Attribute samples from class 2 - - -k = SqExponentialKernel(2.0) # Create kernel function -f(x,k,λ) = kernelmatrix(k,x,X,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y # Optimal prediction f -svmloss(y,ŷ)= f(X,k,λ) |> ŷ -> sum(maximum.(0.0,1-y*ŷ)) - λ*norm(ŷ) # Total svm loss with regularisation -pred = f(Xgrid,k,λ) #Compute prediction on a grid -contourf(xgrid,xgrid,pred) -scatter!(eachcol(X)...,color=y,lab="data") +N = 100 # Number of 
samples +N_test = 200 # Size of the grid +xmin = -3; xmax = 3 +μ = rand(Uniform(xmin, xmax), 2, 2) # Random Centers +xgrid = range(-xmin, xmax, length=N_test) # Create a grid +Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) #Combine into a 2D grid +y = rand((-1, 1), N) # Select randomly between the two classes +X_train = zeros(2, N) +X_train[:, y .== 1] = rand(MvNormal(μ[:, 1], I), count(y.==1)) #Attribute samples from class 1 +X_train[:, y .== -1] = rand(MvNormal(μ[:, 2], I), count(y.==-1)) # Attribute samples from class 2 +scatter(eachrow(X_train)..., zcolor= y) +## Compute predictions +k = SqExponentialKernel() # Create kernel function +function f(x, k, λ) + kernelmatrix(k, x, X_train, obsdim=2) * inv(kernelmatrix(k, X_train, obsdim=2) + exp(λ[1]) * I) * y # Optimal prediction f +end +λ = log.([1.0]) +function reg_hingeloss(k, λ) + ŷ = f(X, k, λ) + return sum(maximum.(0.0, 1 - y * ŷ)) - exp(λ[1]) * norm(ŷ) # Total svm loss with regularisation +end +y_grid = f(Xgrid, k, λ) #Compute prediction on a grid +contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)) +scatter!(eachrow(X_train)..., zcolor=y,lab="data") diff --git a/src/kernels/transformedkernel.jl b/src/kernels/transformedkernel.jl index b96208308..3ae038b75 100644 --- a/src/kernels/transformedkernel.jl +++ b/src/kernels/transformedkernel.jl @@ -41,11 +41,11 @@ _scale(t::ScaleTransform, metric, x, y) = evaluate(metric, t(x), t(y)) """ transform -transform(k::BaseKernel, t::Transform) = TransformedKernel(k, t) +transform(k::Kernel, t::Transform) = TransformedKernel(k, t) -transform(k::BaseKernel, ρ::Real) = TransformedKernel(k, ScaleTransform(ρ)) +transform(k::Kernel, ρ::Real) = TransformedKernel(k, ScaleTransform(ρ)) -transform(k::BaseKernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) +transform(k::Kernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) kernel(κ) = κ.kernel From 2982f7ce0baa860387804a95c553bf60e9c36c63 Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Fri, 12 Jun 2020 18:58:59 +0200 Subject: [PATCH 02/13] Work on examples, but need some fixes on transform --- examples/deepkernellearning.jl | 45 ++++++++++++++++++++-------------- examples/svm.jl | 6 +++-- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index 50c433705..ac4de77c2 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -1,13 +1,17 @@ +# # Loading dataset +# We use a couple of useful datasets to plot and optimize +# The different hyper-parameters using KernelFunctions using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra -using Plots; +using Plots +using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) using SliceMap -neuralnet = Chain(Dense(1, 10), Dense(10, 2)) + Base.map( t::KernelFunctions.FunctionTransform, @@ -18,28 +22,33 @@ Base.map( t.f(x) end -xmin = -3; xmax = 3 +# ## Data creation +# We create a simple 1D Problem with very different variations +xmin = -3; xmax = 3 # Limits +noise = 0.01 +x_train = rand(Uniform(xmin, xmax), 100) # Training dataset +x_test = range(xmin, xmax, length=200) # Testing dataset +target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value +x_train, y = noisy_function(target_f, x_train; noise = 0.01) -x_train = range(xmin, xmax, length = 100) -x_test = rand(Uniform(xmin, xmax), 200) -x_train, y = noisy_function(x_train; noise = 0.01) do x - sinc(abs(x) ^ abs(x)) -end -X = reshape(x_train, :, 1) +# ## Model 
definition +# We create a neural net with 2 layers and 10 units each +# The data is passed through the NN before being used in the kernel +neuralnet = Chain(Dense(1, 10), Dense(10, 2)) +# We use a Squared Exponential Kernel k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) -λ = log.([1.0]) -function f(x, k, λ) - K = kernelmatrix(k, X, x, obsdim = 1) - return K * inv(K + exp(λ[1]) * I) * y -end -f(X, k, 1.0) -loss(k, λ) = f(X, k, λ) |> ŷ -> sum(abs2, y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ) +f = GP(k) +fx = f(x_train, noise) +fp = posterior(fx, y) +# This compute the log evidence of `y`, +# which is going to be used as the objective +loss(y) = logpdf(fx, y) +pred -@info "Init Loss = $(loss(k, λ))" +@info "Init Loss = $(loss(y))" ps = Flux.params(k) -push!(ps,λ) p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display diff --git a/examples/svm.jl b/examples/svm.jl index 30efc9519..c92600938 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,14 +1,16 @@ using KernelFunctions -using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra using Plots +# # + N = 100 # Number of samples N_test = 200 # Size of the grid xmin = -3; xmax = 3 -μ = rand(Uniform(xmin, xmax), 2, 2) # Random Centers + +μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers xgrid = range(-xmin, xmax, length=N_test) # Create a grid Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) #Combine into a 2D grid y = rand((-1, 1), N) # Select randomly between the two classes From c83829a912b969f622843e877535404a06de2aba Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Mon, 6 Jul 2020 18:22:18 +0200 Subject: [PATCH 03/13] Formatting for Literate.jl --- examples/svm.jl | 65 +++++++++++++++++++++++--------------- src/matrix/kernelmatrix.jl | 2 +- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/examples/svm.jl b/examples/svm.jl index c92600938..0e10e0622 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,33 +1,46 @@ +# # Support Vector Machines +# ## We first load some needed packages using KernelFunctions -using Zygote -using Flux using Distributions, LinearAlgebra -using Plots - -# # - -N = 100 # Number of samples -N_test = 200 # Size of the grid -xmin = -3; xmax = 3 +using Plots; default(legendfontsize = 15.0, ms = 5.0) +# ## Data Generation +# ### We first generate a mixture of two Gaussians in 2 dimensions +xmin = -3; xmax = 3 # Limits for sampling μ₁ and μ₂ μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers -xgrid = range(-xmin, xmax, length=N_test) # Create a grid -Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) 
#Combine into a 2D grid +# ### We then sample both y and x +N = 100 # Number of samples y = rand((-1, 1), N) # Select randomly between the two classes -X_train = zeros(2, N) -X_train[:, y .== 1] = rand(MvNormal(μ[:, 1], I), count(y.==1)) #Attribute samples from class 1 -X_train[:, y .== -1] = rand(MvNormal(μ[:, 2], I), count(y.==-1)) # Attribute samples from class 2 -scatter(eachrow(X_train)..., zcolor= y) -## Compute predictions -k = SqExponentialKernel() # Create kernel function -function f(x, k, λ) - kernelmatrix(k, x, X_train, obsdim=2) * inv(kernelmatrix(k, X_train, obsdim=2) + exp(λ[1]) * I) * y # Optimal prediction f +x = Vector{Vector{Float64}}(undef, N) # We preallocate x +x[y .== 1] = [rand(MvNormal(μ[:, 1], I)) for _ in 1:count(y.==1)] # Features for samples of class 1 +x[y .== -1] = [rand(MvNormal(μ[:, 2], I)) for _ in 1:count(y.==-1)] # Features for samples of class 2 +scatter(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1", title = "Data") +scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") + +# ## Model Definition +# TODO Write theory here +# ### We create a kernel k +k = SqExponentialKernel() # SqExponentialKernel or RBFKernel +λ = 1.0 # Regularization parameter + +# ### We create a function to return the optimal prediction for a +# test data `x_new` +function f(x_new, x, y, k, λ) + kernelmatrix(k, x_new, x) * inv(kernelmatrix(k, x) + λ * I) * y # Optimal prediction f end -λ = log.([1.0]) -function reg_hingeloss(k, λ) - ŷ = f(X, k, λ) - return sum(maximum.(0.0, 1 - y * ŷ)) - exp(λ[1]) * norm(ŷ) # Total svm loss with regularisation + +# ### We also compute the total loss of the model that we want to minimize +hingeloss(y, ŷ) = maximum(zero(ŷ), 1 - y * ŷ) # hingeloss function +function reg_hingeloss(k, x, y, λ) + ŷ = f(x, x, y, k, λ) + return sum(hingeloss.(y, ŷ)) - λ * norm(ŷ) # Total svm loss with regularisation end -y_grid = f(Xgrid, k, λ) #Compute prediction on a grid -contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)) -scatter!(eachrow(X_train)..., zcolor=y,lab="data") +# ### We create a 2D grid based on the maximum values of the data +N_test = 200 # Size of the grid +xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid +xgrid = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid +# ### We predict the value of y on this grid on plot it against the data +y_grid = f(xgrid, x, y, k, λ) #Compute prediction on a grid +contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model") +scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1") +scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") diff --git a/src/matrix/kernelmatrix.jl b/src/matrix/kernelmatrix.jl index bc24f7354..8b07841f9 100644 --- a/src/matrix/kernelmatrix.jl +++ b/src/matrix/kernelmatrix.jl @@ -89,7 +89,7 @@ function kernelmatrix(κ::SimpleKernel, x::AbstractVector) end function kernelmatrix(κ::SimpleKernel, x::AbstractVector, y::AbstractVector) - validate_dims(x, y) + # validate_dims(x, y) return map(d -> kappa(κ, d), pairwise(metric(κ), x, y)) end From 118e9b80055b0109061ae3948001d6af66b8660d Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Wed, 8 Jul 2020 19:59:14 +0200 Subject: [PATCH 04/13] Added examples to make.jl --- docs/.gitignore | 3 +- docs/Project.toml | 13 ++++++ docs/make.jl | 25 ++++++++++- examples/automaticstatistician.jl | 0 examples/deepkernellearning.jl | 72 ++++++++++++++++++------------- 
examples/kernelridgeregression.jl | 57 +++++++++++++----------- examples/svm.jl | 2 +- 7 files changed, 111 insertions(+), 61 deletions(-) create mode 100644 examples/automaticstatistician.jl diff --git a/docs/.gitignore b/docs/.gitignore index 90fb8a549..909052372 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,4 +1,3 @@ build/ site/ - -#Temp to avoid to many changes +src/examples/ diff --git a/docs/Project.toml b/docs/Project.toml index 9e80a3254..0252eb1e6 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,18 @@ [deps] +AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [compat] +Distributions = "0.23" Documenter = "0.23, 0.24, 0.25" +Flux = "0.8" +ForwardDiff = "0.10" +Literate = "2" +Plots = "1" diff --git a/docs/make.jl b/docs/make.jl index e0366f01d..03772733c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,13 +1,33 @@ using Documenter +using Literate using KernelFunctions +if ispath(joinpath(@__DIR__, "src", "examples")) + rm(joinpath(@__DIR__, "src", "examples"), recursive=true) +end + +for filename in readdir(joinpath(@__DIR__, "..", "examples")) + endswith(filename, ".jl") || continue + name = splitext(filename)[1] + Literate.markdown( + joinpath(@__DIR__, "..", "examples", filename), + joinpath(@__DIR__, "src", "examples"), + name = name, + documenter = true, + ) +end + makedocs( sitename = "KernelFunctions", format = Documenter.HTML(), modules = [KernelFunctions], pages = ["Home"=>"index.md", "User Guide" => "userguide.md", - "Examples"=>"example.md", + "Examples"=> + ["SVM" => "examples/svm.md", + # "Kernel Ridge Regression" => "examples/kernelridgeregression.md", + # "Deep Kernel Learning" => "examples/deepkernellearning.md", + ], "Kernel Functions"=>"kernels.md", "Transform"=>"transform.md", "Metrics"=>"metrics.md", @@ -22,5 +42,6 @@ makedocs( deploydocs( deps = Deps.pip("mkdocs", "python-markdown-math"), repo = "github.com/JuliaGaussianProcesses/KernelFunctions.jl.git", - target = "build" + target = "build", + push_preview = true, ) diff --git a/examples/automaticstatistician.jl b/examples/automaticstatistician.jl new file mode 100644 index 000000000..e69de29bb diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index ac4de77c2..6f858e8ea 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -1,35 +1,36 @@ -# # Loading dataset -# We use a couple of useful datasets to plot and optimize -# The different hyper-parameters +# # Deep Kernel Learning with Flux +# ## Package loading +# We use a couple of useful packages to plot and optimize +# the different hyper-parameters using KernelFunctions -using MLDataUtils -using Zygote using Flux using Distributions, LinearAlgebra using Plots using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) -using SliceMap +# using SliceMap -Base.map( - t::KernelFunctions.FunctionTransform, - X::AbstractVector; - obsdim::Int = defaultobs, -) where {T} = - slicemap(X, dims = 1) do x - t.f(x) - end +# Base.map( +# t::KernelFunctions.FunctionTransform, +# X::AbstractVector; +# obsdim::Int = defaultobs, +# ) where {T} = +# slicemap(X, dims = 1) 
do x +# t.f(x) +# end # ## Data creation # We create a simple 1D Problem with very different variations xmin = -3; xmax = 3 # Limits +N = 100 noise = 0.01 -x_train = rand(Uniform(xmin, xmax), 100) # Training dataset -x_test = range(xmin, xmax, length=200) # Testing dataset +x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value -x_train, y = noisy_function(target_f, x_train; noise = 0.01) +target_f(x::AbstractArray) = target_f(first(x)) +y_train = target_f.(x_train) + randn(N) * noise +x_test = collect(eachrow(range(xmin, xmax, length=200))) # Testing dataset # ## Model definition # We create a neural net with 2 layers and 10 units each @@ -38,32 +39,41 @@ neuralnet = Chain(Dense(1, 10), Dense(10, 2)) # We use a Squared Exponential Kernel k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) -f = GP(k) -fx = f(x_train, noise) -fp = posterior(fx, y) +# We use AbstractGPs.jl to define our model +gpprior = GP(k) # GP Prior +fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior on f +fp = posterior(fx, y_train) # Posterior of f + # This compute the log evidence of `y`, # which is going to be used as the objective loss(y) = logpdf(fx, y) -pred -@info "Init Loss = $(loss(y))" +@info "Init Loss = $(loss(y_train))" +# Flux will automatically extract all the parameters of the kernel ps = Flux.params(k) -p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") -Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display -## +# We show the initial prediction with the untrained model +p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") +Plots.scatter!(vcat(x_train...), y_train, lab = "data") +pred = marginals(fp(x_test)) +Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") +# ## Training anim = Animation() nmax= 10 opt = Flux.ADAGrad(0.001) -@progress for i = 1:nmax - grads = Zygote.gradient(() -> loss(k, λ), ps) +for i = 1:nmax + grads = gradient(ps) do + loss(y_train) + end Flux.Optimise.update!(opt, ps, grads) - if i % 100 == 0 + # if i % 100 == 0 @info "$i/$nmax" - p = Plots.scatter(x, y, lab = "data", title = "Loss = $(loss(k,λ))") - Plots.plot!(x, f(X, k, λ), lab = "Prediction", lw = 3.0) + p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") + p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") + pred = marginals(fp(x_test)) + Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") frame(anim) - end + # end end gif(anim, fps = 5) diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl index d6e999e06..2efa474bd 100644 --- a/examples/kernelridgeregression.jl +++ b/examples/kernelridgeregression.jl @@ -1,64 +1,71 @@ -# Kernel Ridge Regression +# # Kernel Ridge Regression +# ## We load KernelFunctions and some other packages using KernelFunctions using LinearAlgebra using Distributions -using Plots +using Plots; default(lw = 2.0, legendfontsize = 15.0) using Flux: Optimise using ForwardDiff +using Random: seed! 
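+# Fix the random seed so that the sampled data (and the resulting animation) are reproducible.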
+seed!(42) -# We generate our data : +# ## Data Generation +# We generated data in 1 dimension xmin = -3; xmax = 3 # Bounds of the data N = 50 # Number of samples -σ = 0.1 - x_train = rand(Uniform(xmin, xmax), N) # We sample 100 random samples -x_test = range(xmin, xmax, length = 300) # We use x_test to show the prediction function -y = sinc.(x_train) + randn(N) * σ # We create a function and add some noise +σ = 0.1 +y_train = sinc.(x_train) + randn(N) * σ # We create a function and add some noise +x_test = range(xmin-0.1, xmax+0.1, length=300) # Plot the data -scatter(x_train, y, lab = "data") +scatter(x_train, y_train, lab = "data") plot!(x_test, sinc, lab = "true function") -# Create a function taking kernel parameters and creating a kernel +# ## Kernel training +# To train the kernel parameters via ForwardDiff.jl +# we need to create a function creating a kernel from an array kernelcall(θ) = transform( exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(), exp(θ[3]), ) -# Return the prediction given the normalization value λ -function f(x, θ) +# From theory we know the prediction for a test set x given +# the kernel parameters and normalization constant +function f(x, x_train, y_train, θ) k = kernelcall(θ[1:3]) kernelmatrix(k, x, x_train) * - inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y + inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y_train end -# Starting with parameters [1.0, 1.0, 1.0, 1.0] we get : -ŷ = f(x_test, log.(ones(4))) -scatter(x_train, y, lab = "data") +# We look how the prediction looks like +# with starting parameters [1.0, 1.0, 1.0, 1.0] we get : +ŷ = f(x_test, x_train, y_train, log.(ones(4))) +scatter(x_train, y_train, lab = "data") plot!(x_test, sinc, lab = "true function") plot!(x_test, ŷ, lab = "prediction") -# Create a loss on the training data +# We define the loss based on the L2 norm both +# for the loss and the regularization function loss(θ) - ŷ = f(x_train, θ) - sum(abs2, y - ŷ) + exp(θ[4]) * norm(ŷ) + ŷ = f(x_train, x_train, y_train, θ) + sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ) end # The loss with our starting point : loss(log.(ones(4))) -## Training model - -θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) +# ## Training the model +θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector anim = Animation() -opt = ADAGrad(0.5) -@progress for i = 1:30 +opt = Optimise.ADAGrad(0.5) +for i = 1:30 grads = ForwardDiff.gradient(x -> loss(x), θ) # We compute the gradients given the kernel parameters and regularization Δ = Optimise.apply!(opt, θ, grads) θ .-= Δ # We apply a simple Gradient descent algorithm - p = scatter(x_train, y, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") + p = scatter(x_train, y_train, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") plot!(x_test, sinc, lab = "true function") - plot!(x_test, f(x_test, θ), lab = "Prediction", lw = 3.0) + plot!(x_test, f(x_test, x_train, y_train, θ), lab = "Prediction", lw = 3.0) frame(anim) end gif(anim) diff --git a/examples/svm.jl b/examples/svm.jl index 0e10e0622..5200c626e 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,5 +1,5 @@ # # Support Vector Machines -# ## We first load some needed packages +# ## Package loading using KernelFunctions using Distributions, LinearAlgebra using Plots; default(legendfontsize = 15.0, ms = 5.0) From 438fcee0582bf553e3d27c8c62e31ffbf13e19ef Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Thu, 9 Jul 2020 00:16:20 +0200 Subject: [PATCH 05/13] Small corrections --- 
examples/deepkernellearning.jl | 4 ++-- examples/svm.jl | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index 6f858e8ea..01771e53b 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -67,13 +67,13 @@ for i = 1:nmax loss(y_train) end Flux.Optimise.update!(opt, ps, grads) - # if i % 100 == 0 + if i % 100 == 0 @info "$i/$nmax" p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") pred = marginals(fp(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") frame(anim) - # end + end end gif(anim, fps = 5) diff --git a/examples/svm.jl b/examples/svm.jl index 5200c626e..9bb262496 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -20,7 +20,7 @@ scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") # ## Model Definition # TODO Write theory here # ### We create a kernel k -k = SqExponentialKernel() # SqExponentialKernel or RBFKernel +k = SqExponentialKernel() # SqExponentialKernel/RBFKernel λ = 1.0 # Regularization parameter # ### We create a function to return the optimal prediction for a @@ -36,11 +36,13 @@ function reg_hingeloss(k, x, y, λ) return sum(hingeloss.(y, ŷ)) - λ * norm(ŷ) # Total svm loss with regularisation end # ### We create a 2D grid based on the maximum values of the data -N_test = 200 # Size of the grid +N_test = 100 # Size of the grid xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid -xgrid = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid +xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid # ### We predict the value of y on this grid on plot it against the data -y_grid = f(xgrid, x, y, k, λ) #Compute prediction on a grid +y_grid = f(xgrid_v, x, y, k, λ) #Compute prediction on a grid contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model") scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1") scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") +xlims!(extrema(xgrid)) +ylims!(extrema(xgrid)) From 0aaf2bb72e45687bc1cd5324ba9fb97f69902803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Fri, 6 Nov 2020 11:47:13 +0100 Subject: [PATCH 06/13] Moved examples to docs --- {examples => docs/examples}/automaticstatistician.jl | 0 {examples => docs/examples}/deepkernellearning.jl | 0 {examples => docs/examples}/kernelridgeregression.jl | 0 {examples => docs/examples}/svm.jl | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {examples => docs/examples}/automaticstatistician.jl (100%) rename {examples => docs/examples}/deepkernellearning.jl (100%) rename {examples => docs/examples}/kernelridgeregression.jl (100%) rename {examples => docs/examples}/svm.jl (100%) diff --git a/examples/automaticstatistician.jl b/docs/examples/automaticstatistician.jl similarity index 100% rename from examples/automaticstatistician.jl rename to docs/examples/automaticstatistician.jl diff --git a/examples/deepkernellearning.jl b/docs/examples/deepkernellearning.jl similarity index 100% rename from examples/deepkernellearning.jl rename to docs/examples/deepkernellearning.jl diff --git a/examples/kernelridgeregression.jl b/docs/examples/kernelridgeregression.jl similarity index 100% rename from 
examples/kernelridgeregression.jl rename to docs/examples/kernelridgeregression.jl diff --git a/examples/svm.jl b/docs/examples/svm.jl similarity index 100% rename from examples/svm.jl rename to docs/examples/svm.jl From cfa203fc5798b3e87188a01bbdcdccf027d6419c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Fri, 6 Nov 2020 18:30:02 +0100 Subject: [PATCH 07/13] Small updates --- docs/examples/deepkernellearning.jl | 41 ++++++++++++----------------- src/kernels/transformedkernel.jl | 2 -- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/docs/examples/deepkernellearning.jl b/docs/examples/deepkernellearning.jl index 01771e53b..f90dc07dd 100644 --- a/docs/examples/deepkernellearning.jl +++ b/docs/examples/deepkernellearning.jl @@ -6,38 +6,28 @@ using KernelFunctions using Flux using Distributions, LinearAlgebra using Plots +using ProgressMeter using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) -# using SliceMap - - - -# Base.map( -# t::KernelFunctions.FunctionTransform, -# X::AbstractVector; -# obsdim::Int = defaultobs, -# ) where {T} = -# slicemap(X, dims = 1) do x -# t.f(x) -# end # ## Data creation # We create a simple 1D Problem with very different variations xmin = -3; xmax = 3 # Limits -N = 100 +N = 150 noise = 0.01 x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value target_f(x::AbstractArray) = target_f(first(x)) y_train = target_f.(x_train) + randn(N) * noise x_test = collect(eachrow(range(xmin, xmax, length=200))) # Testing dataset - +spectral_mixture_kernel() # ## Model definition # We create a neural net with 2 layers and 10 units each # The data is passed through the NN before being used in the kernel -neuralnet = Chain(Dense(1, 10), Dense(10, 2)) -# We use a Squared Exponential Kernel -k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) +neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5)) +# We use two cases : +# - The Squared Exponential Kernel +k = transform(SqExponentialKernel(), FunctionTransform(neuralnet) ) # We use AbstractGPs.jl to define our model gpprior = GP(k) # GP Prior @@ -46,7 +36,7 @@ fp = posterior(fx, y_train) # Posterior of f # This compute the log evidence of `y`, # which is going to be used as the objective -loss(y) = logpdf(fx, y) +loss(y) = -logpdf(fx, y) @info "Init Loss = $(loss(y_train))" @@ -54,26 +44,29 @@ loss(y) = logpdf(fx, y) ps = Flux.params(k) # We show the initial prediction with the untrained model -p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") +p_init = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") Plots.scatter!(vcat(x_train...), y_train, lab = "data") pred = marginals(fp(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") # ## Training anim = Animation() -nmax= 10 -opt = Flux.ADAGrad(0.001) -for i = 1:nmax - grads = gradient(ps) do +nmax= 1000 +opt = Flux.ADAM(0.1) +@showprogress for i = 1:nmax + global grads = gradient(ps) do loss(y_train) end Flux.Optimise.update!(opt, ps, grads) if i % 100 == 0 @info "$i/$nmax" + L = loss(y_train) + # @info "Loss = $L" p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") - pred = marginals(fp(x_test)) + pred = marginals(posterior(fx, y_train)(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), 
ribbon = std.(pred), lab = "Prediction") frame(anim) + display(p) end end gif(anim, fps = 5) diff --git a/src/kernels/transformedkernel.jl b/src/kernels/transformedkernel.jl index ec9eddc8b..3d1928cd3 100644 --- a/src/kernels/transformedkernel.jl +++ b/src/kernels/transformedkernel.jl @@ -52,8 +52,6 @@ end transform(k::Kernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) -transform(k::Kernel, ρ::AbstractVector) = transform(k, ARDTransform(ρ)) - kernel(κ) = κ.kernel Base.show(io::IO, κ::TransformedKernel) = printshifted(io, κ, 0) From df375a7ff96afd13f1da64612ad86d3c093744a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Mon, 9 Nov 2020 19:30:05 +0100 Subject: [PATCH 08/13] Switched `metric` to `binary_op` --- docs/make.jl | 2 +- docs/src/create_kernel.md | 6 +++--- docs/src/metrics.md | 2 +- src/KernelFunctions.jl | 11 ++++++---- src/basekernels/constant.jl | 6 +++--- src/basekernels/cosine.jl | 2 +- src/basekernels/exponential.jl | 6 +++--- src/basekernels/exponentiated.jl | 2 +- src/basekernels/maha.jl | 2 +- src/basekernels/matern.jl | 6 +++--- src/basekernels/periodic.jl | 2 +- src/basekernels/piecewisepolynomial.jl | 2 +- src/basekernels/polynomial.jl | 4 ++-- src/basekernels/rationalquad.jl | 4 ++-- src/{distances => binary_op}/delta.jl | 0 src/{distances => binary_op}/dotproduct.jl | 2 +- src/{distances => binary_op}/pairwise.jl | 0 src/{distances => binary_op}/sinus.jl | 0 src/generic.jl | 2 +- src/kernels/transformedkernel.jl | 14 ++++++------- src/matrix/kernelmatrix.jl | 8 ++++---- src/utils.jl | 24 +++++++++++----------- test/basekernels/constant.jl | 8 ++++---- test/basekernels/exponential.jl | 8 ++++---- test/basekernels/exponentiated.jl | 2 +- test/basekernels/matern.jl | 8 ++++---- test/basekernels/polynomial.jl | 10 ++++----- test/basekernels/rationalquad.jl | 10 ++++----- test/matrix/kernelmatrix.jl | 2 +- test/runtests.jl | 2 +- 30 files changed, 80 insertions(+), 77 deletions(-) rename src/{distances => binary_op}/delta.jl (100%) rename src/{distances => binary_op}/dotproduct.jl (93%) rename src/{distances => binary_op}/pairwise.jl (100%) rename src/{distances => binary_op}/sinus.jl (100%) diff --git a/docs/make.jl b/docs/make.jl index 441d8d972..8c89f2bae 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -37,7 +37,7 @@ makedocs( ], "Kernel Functions"=>"kernels.md", "Transform"=>"transform.md", - "Metrics"=>"metrics.md", + "Binary Operations"=>"metrics.md", "Theory"=>"theory.md", "Custom Kernels"=>"create_kernel.md", "API"=>"api.md"] diff --git a/docs/src/create_kernel.md b/docs/src/create_kernel.md index 049b6e0a3..301ec4fe7 100644 --- a/docs/src/create_kernel.md +++ b/docs/src/create_kernel.md @@ -6,15 +6,15 @@ Here are a few ways depending on how complicated your kernel is : ### SimpleKernel for kernels function depending on a metric -If your kernel function is of the form `k(x, y) = f(d(x, y))` where `d(x, y)` is a `PreMetric`, -you can construct your custom kernel by defining `kappa` and `metric` for your kernel. +If your kernel function is of the form `k(x, y) = f(binary_op(x, y))` where `binary_op(x, y)` is a `PreMetric` or another function/instance implementing `pairwise` and `evaluate` from `Distances.jl`, +you can construct your custom kernel by defining `kappa` and `binary_op` for your kernel. 
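+In other words, `binary_op` returns the pairwise operation applied to the two inputs, and `kappa` maps its output to the kernel value.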
Here is for example how one can define the `SqExponentialKernel` again : ```julia struct MyKernel <: KernelFunctions.SimpleKernel end KernelFunctions.kappa(::MyKernel, d2::Real) = exp(-d2) -KernelFunctions.metric(::MyKernel) = SqEuclidean() +KernelFunctions.binary_op(::MyKernel) = SqEuclidean() ``` ### Kernel for more complex kernels diff --git a/docs/src/metrics.md b/docs/src/metrics.md index 644fcb2a0..3120b70a8 100644 --- a/docs/src/metrics.md +++ b/docs/src/metrics.md @@ -5,7 +5,7 @@ To do so a distance measure is needed for each kernel. Two very common ones can However all kernels do not rely on distances metrics respecting all the definitions. That's why additional metrics come with the package such as `DotProduct` (``) and `Delta` (`δ(x,y)`). Note that every `SimpleKernel` must have a defined metric defined as : ```julia - KernelFunctions.metric(::CustomKernel) = SqEuclidean() + KernelFunctions.binary_op(::CustomKernel) = SqEuclidean() ``` ## Adding a new metric diff --git a/src/KernelFunctions.jl b/src/KernelFunctions.jl index eeb57a9c7..89fc6d1a1 100644 --- a/src/KernelFunctions.jl +++ b/src/KernelFunctions.jl @@ -64,10 +64,13 @@ abstract type Kernel end abstract type SimpleKernel <: Kernel end include("utils.jl") -include(joinpath("distances", "pairwise.jl")) -include(joinpath("distances", "dotproduct.jl")) -include(joinpath("distances", "delta.jl")) -include(joinpath("distances", "sinus.jl")) + +abstract type AbstractBinaryOp end +const BinaryOp = Union{Metric, AbstractBinaryOp} +include(joinpath("binary_op", "pairwise.jl")) +include(joinpath("binary_op", "dotproduct.jl")) +include(joinpath("binary_op", "delta.jl")) +include(joinpath("binary_op", "sinus.jl")) include(joinpath("transform", "transform.jl")) include(joinpath("transform", "scaletransform.jl")) diff --git a/src/basekernels/constant.jl b/src/basekernels/constant.jl index db7f94870..e74f92f85 100644 --- a/src/basekernels/constant.jl +++ b/src/basekernels/constant.jl @@ -11,7 +11,7 @@ struct ZeroKernel <: SimpleKernel end kappa(κ::ZeroKernel, d::T) where {T<:Real} = zero(T) -metric(::ZeroKernel) = Delta() +binary_op(::ZeroKernel) = Delta() Base.show(io::IO, ::ZeroKernel) = print(io, "Zero Kernel") @@ -34,7 +34,7 @@ const EyeKernel = WhiteKernel kappa(κ::WhiteKernel, δₓₓ::Real) = δₓₓ -metric(::WhiteKernel) = Delta() +binary_op(::WhiteKernel) = Delta() Base.show(io::IO, ::WhiteKernel) = print(io, "White Kernel") @@ -58,6 +58,6 @@ end kappa(κ::ConstantKernel,x::Real) = first(κ.c)*one(x) -metric(::ConstantKernel) = Delta() +binary_op(::ConstantKernel) = Delta() Base.show(io::IO, κ::ConstantKernel) = print(io, "Constant Kernel (c = ", first(κ.c), ")") diff --git a/src/basekernels/cosine.jl b/src/basekernels/cosine.jl index 53525cf7d..c685d8ff5 100644 --- a/src/basekernels/cosine.jl +++ b/src/basekernels/cosine.jl @@ -10,6 +10,6 @@ struct CosineKernel <: SimpleKernel end kappa(κ::CosineKernel, d::Real) = cospi(d) -metric(::CosineKernel) = Euclidean() +binary_op(::CosineKernel) = Euclidean() Base.show(io::IO, ::CosineKernel) = print(io, "Cosine Kernel") diff --git a/src/basekernels/exponential.jl b/src/basekernels/exponential.jl index 1a9c38a34..8e2e0ff36 100644 --- a/src/basekernels/exponential.jl +++ b/src/basekernels/exponential.jl @@ -13,7 +13,7 @@ struct SqExponentialKernel <: SimpleKernel end kappa(κ::SqExponentialKernel, d²::Real) = exp(-d² / 2) -metric(::SqExponentialKernel) = SqEuclidean() +binary_op(::SqExponentialKernel) = SqEuclidean() iskroncompatible(::SqExponentialKernel) = true @@ -36,7 +36,7 @@ struct 
ExponentialKernel <: SimpleKernel end kappa(κ::ExponentialKernel, d::Real) = exp(-d) -metric(::ExponentialKernel) = Euclidean() +binary_op(::ExponentialKernel) = Euclidean() iskroncompatible(::ExponentialKernel) = true @@ -70,7 +70,7 @@ end kappa(κ::GammaExponentialKernel, d::Real) = exp(-d^first(κ.γ)) -metric(::GammaExponentialKernel) = Euclidean() +binary_op(::GammaExponentialKernel) = Euclidean() iskroncompatible(::GammaExponentialKernel) = true diff --git a/src/basekernels/exponentiated.jl b/src/basekernels/exponentiated.jl index 73eba7d06..309cef171 100644 --- a/src/basekernels/exponentiated.jl +++ b/src/basekernels/exponentiated.jl @@ -10,7 +10,7 @@ struct ExponentiatedKernel <: SimpleKernel end kappa(κ::ExponentiatedKernel, xᵀy::Real) = exp(xᵀy) -metric(::ExponentiatedKernel) = DotProduct() +binary_op(::ExponentiatedKernel) = DotProduct() iskroncompatible(::ExponentiatedKernel) = true diff --git a/src/basekernels/maha.jl b/src/basekernels/maha.jl index 979bc0ed6..cfbe989ad 100644 --- a/src/basekernels/maha.jl +++ b/src/basekernels/maha.jl @@ -20,6 +20,6 @@ end kappa(κ::MahalanobisKernel, d::T) where {T<:Real} = exp(-d) -metric(κ::MahalanobisKernel) = SqMahalanobis(κ.P) +binary_op(κ::MahalanobisKernel) = SqMahalanobis(κ.P) Base.show(io::IO, κ::MahalanobisKernel) = print(io, "Mahalanobis Kernel (size(P) = ", size(κ.P), ")") diff --git a/src/basekernels/matern.jl b/src/basekernels/matern.jl index e6b8ee8ea..feb413742 100644 --- a/src/basekernels/matern.jl +++ b/src/basekernels/matern.jl @@ -29,7 +29,7 @@ function _matern(ν::Real, d::Real) return exp((one(d) - ν) * logtwo - loggamma(ν) + ν * log(y) + log(besselk(ν, y))) end -metric(::MaternKernel) = Euclidean() +binary_op(::MaternKernel) = Euclidean() Base.show(io::IO, κ::MaternKernel) = print(io, "Matern Kernel (ν = ", first(κ.ν), ")") @@ -45,7 +45,7 @@ struct Matern32Kernel <: SimpleKernel end kappa(κ::Matern32Kernel, d::Real) = (1 + sqrt(3) * d) * exp(-sqrt(3) * d) -metric(::Matern32Kernel) = Euclidean() +binary_op(::Matern32Kernel) = Euclidean() Base.show(io::IO, ::Matern32Kernel) = print(io, "Matern 3/2 Kernel") @@ -61,6 +61,6 @@ struct Matern52Kernel <: SimpleKernel end kappa(κ::Matern52Kernel, d::Real) = (1 + sqrt(5) * d + 5 * d^2 / 3) * exp(-sqrt(5) * d) -metric(::Matern52Kernel) = Euclidean() +binary_op(::Matern52Kernel) = Euclidean() Base.show(io::IO, ::Matern52Kernel) = print(io, "Matern 5/2 Kernel") diff --git a/src/basekernels/periodic.jl b/src/basekernels/periodic.jl index 6c19518e8..6ad3c7ddf 100644 --- a/src/basekernels/periodic.jl +++ b/src/basekernels/periodic.jl @@ -22,7 +22,7 @@ PeriodicKernel(T::DataType, dims::Int = 1) = PeriodicKernel(r = ones(T, dims)) @functor PeriodicKernel -metric(κ::PeriodicKernel) = Sinus(κ.r) +binary_op(κ::PeriodicKernel) = Sinus(κ.r) kappa(κ::PeriodicKernel, d::Real) = exp(- 0.5d) diff --git a/src/basekernels/piecewisepolynomial.jl b/src/basekernels/piecewisepolynomial.jl index baf788348..85e15334b 100644 --- a/src/basekernels/piecewisepolynomial.jl +++ b/src/basekernels/piecewisepolynomial.jl @@ -42,7 +42,7 @@ _f(κ::PiecewisePolynomialKernel{3}, r, j) = 1 + (j + 3) * r + kappa(κ::PiecewisePolynomialKernel{V}, r) where V = max(1 - r, 0)^(κ.j + V) * _f(κ, r, κ.j) -metric(κ::PiecewisePolynomialKernel) = Mahalanobis(κ.maha) +binary_op(κ::PiecewisePolynomialKernel) = Mahalanobis(κ.maha) function Base.show(io::IO, κ::PiecewisePolynomialKernel{V}) where {V} print(io, "Piecewise Polynomial Kernel (v = ", V, ", size(maha) = ", size(κ.maha), ")") diff --git a/src/basekernels/polynomial.jl 
b/src/basekernels/polynomial.jl index 7ac23fa1c..336f77ce6 100644 --- a/src/basekernels/polynomial.jl +++ b/src/basekernels/polynomial.jl @@ -18,7 +18,7 @@ end kappa(κ::LinearKernel, xᵀy::Real) = xᵀy + first(κ.c) -metric(::LinearKernel) = DotProduct() +binary_op(::LinearKernel) = DotProduct() Base.show(io::IO, κ::LinearKernel) = print(io, "Linear Kernel (c = ", first(κ.c), ")") @@ -44,6 +44,6 @@ end kappa(κ::PolynomialKernel, xᵀy::Real) = (xᵀy + first(κ.c))^(first(κ.d)) -metric(::PolynomialKernel) = DotProduct() +binary_op(::PolynomialKernel) = DotProduct() Base.show(io::IO, κ::PolynomialKernel) = print(io, "Polynomial Kernel (c = ", first(κ.c), ", d = ", first(κ.d), ")") diff --git a/src/basekernels/rationalquad.jl b/src/basekernels/rationalquad.jl index f59bccd56..6fd781779 100644 --- a/src/basekernels/rationalquad.jl +++ b/src/basekernels/rationalquad.jl @@ -22,7 +22,7 @@ function kappa(κ::RationalQuadraticKernel, d²::T) where {T<:Real} return (one(T) + d² / (2 * first(κ.α)))^(-first(κ.α)) end -metric(::RationalQuadraticKernel) = SqEuclidean() +binary_op(::RationalQuadraticKernel) = SqEuclidean() function Base.show(io::IO, κ::RationalQuadraticKernel) print(io, "Rational Quadratic Kernel (α = $(first(κ.α)))") @@ -55,7 +55,7 @@ function kappa(κ::GammaRationalQuadraticKernel, d²::Real) return (one(d²) + d²^(first(κ.γ) / 2) / first(κ.α))^(-first(κ.α)) end -metric(::GammaRationalQuadraticKernel) = SqEuclidean() +binary_op(::GammaRationalQuadraticKernel) = SqEuclidean() function Base.show(io::IO, κ::GammaRationalQuadraticKernel) print(io, "Gamma Rational Quadratic Kernel (α = $(first(κ.α)), γ = $(first(κ.γ)))") diff --git a/src/distances/delta.jl b/src/binary_op/delta.jl similarity index 100% rename from src/distances/delta.jl rename to src/binary_op/delta.jl diff --git a/src/distances/dotproduct.jl b/src/binary_op/dotproduct.jl similarity index 93% rename from src/distances/dotproduct.jl rename to src/binary_op/dotproduct.jl index 880c494df..d35ff4587 100644 --- a/src/distances/dotproduct.jl +++ b/src/binary_op/dotproduct.jl @@ -1,4 +1,4 @@ -struct DotProduct <: Distances.PreMetric end +struct DotProduct end # struct DotProduct <: Distances.UnionSemiMetric end @inline function Distances._evaluate(::DotProduct, a::AbstractVector, b::AbstractVector) diff --git a/src/distances/pairwise.jl b/src/binary_op/pairwise.jl similarity index 100% rename from src/distances/pairwise.jl rename to src/binary_op/pairwise.jl diff --git a/src/distances/sinus.jl b/src/binary_op/sinus.jl similarity index 100% rename from src/distances/sinus.jl rename to src/binary_op/sinus.jl diff --git a/src/generic.jl b/src/generic.jl index 3bae27822..ccddb3153 100644 --- a/src/generic.jl +++ b/src/generic.jl @@ -6,4 +6,4 @@ Base.iterate(k::Kernel, ::Any) = nothing printshifted(io::IO, o, shift::Int) = print(io, o) # Fallback implementation of evaluate for `SimpleKernel`s. 
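 # The binary operation (a metric or similar pairwise function) is evaluated on the two inputs and the result is passed to `kappa`.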
-(k::SimpleKernel)(x, y) = kappa(k, evaluate(metric(k), x, y)) +(k::SimpleKernel)(x, y) = kappa(k, evaluate(binary_op(k), x, y)) \ No newline at end of file diff --git a/src/kernels/transformedkernel.jl b/src/kernels/transformedkernel.jl index 3d1928cd3..ebeb4f87e 100644 --- a/src/kernels/transformedkernel.jl +++ b/src/kernels/transformedkernel.jl @@ -14,22 +14,22 @@ end (k::TransformedKernel)(x, y) = k.kernel(k.transform(x), k.transform(y)) # Optimizations for scale transforms of simple kernels to save allocations: -# Instead of a multiplying every element of the inputs before evaluating the metric, +# Instead of a multiplying every element of the inputs before evaluating the binary_op, # we perform a scalar multiplcation of the distance of the original inputs, if possible. function (k::TransformedKernel{<:SimpleKernel,<:ScaleTransform})( x::AbstractVector{<:Real}, y::AbstractVector{<:Real}, ) - return kappa(k.kernel, _scale(k.transform, metric(k.kernel), x, y)) + return kappa(k.kernel, _scale(k.transform, binary_op(k.kernel), x, y)) end -function _scale(t::ScaleTransform, metric::Euclidean, x, y) - return first(t.s) * evaluate(metric, x, y) +function _scale(t::ScaleTransform, binary_op::Euclidean, x, y) + return first(t.s) * evaluate(binary_op, x, y) end -function _scale(t::ScaleTransform, metric::Union{SqEuclidean,DotProduct}, x, y) - return first(t.s)^2 * evaluate(metric, x, y) +function _scale(t::ScaleTransform, binary_op::Union{SqEuclidean,DotProduct}, x, y) + return first(t.s)^2 * evaluate(binary_op, x, y) end -_scale(t::ScaleTransform, metric, x, y) = evaluate(metric, t(x), t(y)) +_scale(t::ScaleTransform, binary_op, x, y) = evaluate(binary_op, t(x), t(y)) """ ```julia diff --git a/src/matrix/kernelmatrix.jl b/src/matrix/kernelmatrix.jl index a7262a243..bedd9ab5b 100644 --- a/src/matrix/kernelmatrix.jl +++ b/src/matrix/kernelmatrix.jl @@ -83,7 +83,7 @@ kerneldiagmatrix(κ::Kernel, x::AbstractVector, y::AbstractVector) = map(κ, x, function kernelmatrix!(K::AbstractMatrix, κ::SimpleKernel, x::AbstractVector) validate_inplace_dims(K, x) - pairwise!(K, metric(κ), x) + pairwise!(K, binary_op(κ), x) return map!(d -> kappa(κ, d), K, K) end @@ -91,17 +91,17 @@ function kernelmatrix!( K::AbstractMatrix, κ::SimpleKernel, x::AbstractVector, y::AbstractVector, ) validate_inplace_dims(K, x, y) - pairwise!(K, metric(κ), x, y) + pairwise!(K, binary_op(κ), x, y) return map!(d -> kappa(κ, d), K, K) end function kernelmatrix(κ::SimpleKernel, x::AbstractVector) - return map(d -> kappa(κ, d), pairwise(metric(κ), x)) + return map(d -> kappa(κ, d), pairwise(binary_op(κ), x)) end function kernelmatrix(κ::SimpleKernel, x::AbstractVector, y::AbstractVector) validate_inputs(x, y) - return map(d -> kappa(κ, d), pairwise(metric(κ), x, y)) + return map(d -> kappa(κ, d), pairwise(binary_op(κ), x, y)) end diff --git a/src/utils.jl b/src/utils.jl index 3bddf6557..60ea45bcf 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -41,18 +41,18 @@ Base.getindex(D::ColVecs, i) = ColVecs(view(D.X, :, i)) dim(x::ColVecs) = size(x.X, 1) -pairwise(d::PreMetric, x::ColVecs) = Distances.pairwise(d, x.X; dims=2) -pairwise(d::PreMetric, x::ColVecs, y::ColVecs) = Distances.pairwise(d, x.X, y.X; dims=2) -function pairwise(d::PreMetric, x::AbstractVector, y::ColVecs) +pairwise(d::BinaryOp, x::ColVecs) = Distances.pairwise(d, x.X; dims=2) +pairwise(d::BinaryOp, x::ColVecs, y::ColVecs) = Distances.pairwise(d, x.X, y.X; dims=2) +function pairwise(d::BinaryOp, x::AbstractVector, y::ColVecs) return Distances.pairwise(d, reduce(hcat, x), y.X; 
dims=2) end -function pairwise(d::PreMetric, x::ColVecs, y::AbstractVector) +function pairwise(d::BinaryOp, x::ColVecs, y::AbstractVector) return Distances.pairwise(d, x.X, reduce(hcat, y); dims=2) end -function pairwise!(out::AbstractMatrix, d::PreMetric, x::ColVecs) +function pairwise!(out::AbstractMatrix, d::BinaryOp, x::ColVecs) return Distances.pairwise!(out, d, x.X; dims=2) end -function pairwise!(out::AbstractMatrix, d::PreMetric, x::ColVecs, y::ColVecs) +function pairwise!(out::AbstractMatrix, d::BinaryOp, x::ColVecs, y::ColVecs) return Distances.pairwise!(out, d, x.X, y.X; dims=2) end @@ -79,18 +79,18 @@ Base.getindex(D::RowVecs, i) = RowVecs(view(D.X, i, :)) dim(x::RowVecs) = size(x.X, 2) -pairwise(d::PreMetric, x::RowVecs) = Distances.pairwise(d, x.X; dims=1) -pairwise(d::PreMetric, x::RowVecs, y::RowVecs) = Distances.pairwise(d, x.X, y.X; dims=1) -function pairwise(d::PreMetric, x::AbstractVector, y::RowVecs) +pairwise(d::BinaryOp, x::RowVecs) = Distances.pairwise(d, x.X; dims=1) +pairwise(d::BinaryOp, x::RowVecs, y::RowVecs) = Distances.pairwise(d, x.X, y.X; dims=1) +function pairwise(d::BinaryOp, x::AbstractVector, y::RowVecs) return Distances.pairwise(d, permutedims(reduce(hcat, x)), y.X; dims=1) end -function pairwise(d::PreMetric, x::RowVecs, y::AbstractVector) +function pairwise(d::BinaryOp, x::RowVecs, y::AbstractVector) return Distances.pairwise(d, x.X, permutedims(reduce(hcat, y)); dims=1) end -function pairwise!(out::AbstractMatrix, d::PreMetric, x::RowVecs) +function pairwise!(out::AbstractMatrix, d::BinaryOp, x::RowVecs) return Distances.pairwise!(out, d, x.X; dims=1) end -function pairwise!(out::AbstractMatrix, d::PreMetric, x::RowVecs, y::RowVecs) +function pairwise!(out::AbstractMatrix, d::BinaryOp, x::RowVecs, y::RowVecs) return Distances.pairwise!(out, d, x.X, y.X; dims=1) end diff --git a/test/basekernels/constant.jl b/test/basekernels/constant.jl index a98f3e514..9756b39b2 100644 --- a/test/basekernels/constant.jl +++ b/test/basekernels/constant.jl @@ -3,7 +3,7 @@ k = ZeroKernel() @test eltype(k) == Any @test kappa(k, 2.0) == 0.0 - @test KernelFunctions.metric(ZeroKernel()) == KernelFunctions.Delta() + @test binary_op(ZeroKernel()) == KernelFunctions.Delta() @test repr(k) == "Zero Kernel" # Standardised tests. @@ -16,7 +16,7 @@ @test kappa(k, 1.0) == 1.0 @test kappa(k, 0.0) == 0.0 @test EyeKernel == WhiteKernel - @test metric(WhiteKernel()) == KernelFunctions.Delta() + @test binary_op(WhiteKernel()) == KernelFunctions.Delta() @test repr(k) == "White Kernel" # Standardised tests. 
@@ -29,8 +29,8 @@ @test eltype(k) == Any @test kappa(k, 1.0) == c @test kappa(k, 0.5) == c - @test metric(ConstantKernel()) == KernelFunctions.Delta() - @test metric(ConstantKernel(c=2.0)) == KernelFunctions.Delta() + @test binary_op(ConstantKernel()) == KernelFunctions.Delta() + @test binary_op(ConstantKernel(c=2.0)) == KernelFunctions.Delta() @test repr(k) == "Constant Kernel (c = $(c))" test_params(k, ([c],)) diff --git a/test/basekernels/exponential.jl b/test/basekernels/exponential.jl index cdb0c54f3..81b431362 100644 --- a/test/basekernels/exponential.jl +++ b/test/basekernels/exponential.jl @@ -8,7 +8,7 @@ @test kappa(k,x) ≈ exp(-x / 2) @test k(v1,v2) ≈ exp(-norm(v1-v2)^2 / 2) @test kappa(SqExponentialKernel(),x) == kappa(k,x) - @test metric(SqExponentialKernel()) == SqEuclidean() + @test binary_op(SqExponentialKernel()) == SqEuclidean() @test RBFKernel == SqExponentialKernel @test GaussianKernel == SqExponentialKernel @test SEKernel == SqExponentialKernel @@ -24,7 +24,7 @@ @test kappa(k,x) ≈ exp(-x) @test k(v1,v2) ≈ exp(-norm(v1-v2)) @test kappa(ExponentialKernel(),x) == kappa(k,x) - @test metric(ExponentialKernel()) == Euclidean() + @test binary_op(ExponentialKernel()) == Euclidean() @test repr(k) == "Exponential Kernel" @test LaplacianKernel == ExponentialKernel @test KernelFunctions.iskroncompatible(k) == true @@ -39,8 +39,8 @@ @test k(v1, v2) ≈ exp(-norm(v1 - v2)^γ) @test kappa(GammaExponentialKernel(), x) == kappa(k, x) @test GammaExponentialKernel(gamma=γ).γ == [γ] - @test metric(GammaExponentialKernel()) == Euclidean() - @test metric(GammaExponentialKernel(γ=2.0)) == Euclidean() + @test binary_op(GammaExponentialKernel()) == Euclidean() + @test binary_op(GammaExponentialKernel(γ=2.0)) == Euclidean() @test repr(k) == "Gamma Exponential Kernel (γ = $(γ))" @test KernelFunctions.iskroncompatible(k) == true diff --git a/test/basekernels/exponentiated.jl b/test/basekernels/exponentiated.jl index 1328437eb..3b86440c3 100644 --- a/test/basekernels/exponentiated.jl +++ b/test/basekernels/exponentiated.jl @@ -8,7 +8,7 @@ @test kappa(k,x) ≈ exp(x) @test kappa(k,-x) ≈ exp(-x) @test k(v1,v2) ≈ exp(dot(v1,v2)) - @test metric(ExponentiatedKernel()) == KernelFunctions.DotProduct() + @test binary_op(ExponentiatedKernel()) == KernelFunctions.DotProduct() @test repr(k) == "Exponentiated Kernel" # Standardised tests. This kernel appears to be fairly numerically unstable. diff --git a/test/basekernels/matern.jl b/test/basekernels/matern.jl index 061601b3e..a8262a0c2 100644 --- a/test/basekernels/matern.jl +++ b/test/basekernels/matern.jl @@ -11,8 +11,8 @@ @test kappa(k,x) ≈ matern(x,ν) @test kappa(k,0.0) == 1.0 @test kappa(MaternKernel(ν=ν),x) == kappa(k,x) - @test metric(MaternKernel()) == Euclidean() - @test metric(MaternKernel(ν=2.0)) == Euclidean() + @test binary_op(MaternKernel()) == Euclidean() + @test binary_op(MaternKernel(ν=2.0)) == Euclidean() @test repr(k) == "Matern Kernel (ν = $(ν))" # test_ADs(x->MaternKernel(nu=first(x)),[ν]) @test_broken "All fails (because of logabsgamma for ForwardDiff and ReverseDiff and because of nu for Zygote)" @@ -26,7 +26,7 @@ @test kappa(k,x) ≈ (1+sqrt(3)*x)exp(-sqrt(3)*x) @test k(v1,v2) ≈ (1+sqrt(3)*norm(v1-v2))exp(-sqrt(3)*norm(v1-v2)) @test kappa(Matern32Kernel(),x) == kappa(k,x) - @test metric(Matern32Kernel()) == Euclidean() + @test binary_op(Matern32Kernel()) == Euclidean() @test repr(k) == "Matern 3/2 Kernel" # Standardised tests. 
@@ -38,7 +38,7 @@ @test kappa(k,x) ≈ (1+sqrt(5)*x+5/3*x^2)exp(-sqrt(5)*x) @test k(v1,v2) ≈ (1+sqrt(5)*norm(v1-v2)+5/3*norm(v1-v2)^2)exp(-sqrt(5)*norm(v1-v2)) @test kappa(Matern52Kernel(),x) == kappa(k,x) - @test metric(Matern52Kernel()) == Euclidean() + @test binary_op(Matern52Kernel()) == Euclidean() @test repr(k) == "Matern 5/2 Kernel" # Standardised tests. diff --git a/test/basekernels/polynomial.jl b/test/basekernels/polynomial.jl index a9e6d8e5d..39148f8ab 100644 --- a/test/basekernels/polynomial.jl +++ b/test/basekernels/polynomial.jl @@ -9,8 +9,8 @@ @test kappa(k,x) ≈ x @test k(v1,v2) ≈ dot(v1,v2) @test kappa(LinearKernel(),x) == kappa(k,x) - @test metric(LinearKernel()) == KernelFunctions.DotProduct() - @test metric(LinearKernel(c=2.0)) == KernelFunctions.DotProduct() + @test binary_op(LinearKernel()) == KernelFunctions.DotProduct() + @test binary_op(LinearKernel(c=2.0)) == KernelFunctions.DotProduct() @test repr(k) == "Linear Kernel (c = 0.0)" # Standardised tests. @@ -27,9 +27,9 @@ # Coherence tests. @test kappa(PolynomialKernel(d=1.0,c=c),x) ≈ kappa(LinearKernel(c=c),x) - @test metric(PolynomialKernel()) == KernelFunctions.DotProduct() - @test metric(PolynomialKernel(d=3.0)) == KernelFunctions.DotProduct() - @test metric(PolynomialKernel(d=3.0,c=2.0)) == KernelFunctions.DotProduct() + @test binary_op(PolynomialKernel()) == KernelFunctions.DotProduct() + @test binary_op(PolynomialKernel(d=3.0)) == KernelFunctions.DotProduct() + @test binary_op(PolynomialKernel(d=3.0,c=2.0)) == KernelFunctions.DotProduct() # Standardised tests. TestUtils.test_interface(k, Float64) diff --git a/test/basekernels/rationalquad.jl b/test/basekernels/rationalquad.jl index 90b877308..045ff9a3f 100644 --- a/test/basekernels/rationalquad.jl +++ b/test/basekernels/rationalquad.jl @@ -16,8 +16,8 @@ ) end - @test metric(RationalQuadraticKernel()) == SqEuclidean() - @test metric(RationalQuadraticKernel(α=2.0)) == SqEuclidean() + @test binary_op(RationalQuadraticKernel()) == SqEuclidean() + @test binary_op(RationalQuadraticKernel(α=2.0)) == SqEuclidean() @test repr(k) == "Rational Quadratic Kernel (α = $(α))" # Standardised tests. @@ -74,9 +74,9 @@ ) end - @test metric(GammaRationalQuadraticKernel()) == SqEuclidean() - @test metric(GammaRationalQuadraticKernel(γ=2.0)) == SqEuclidean() - @test metric(GammaRationalQuadraticKernel(γ=2.0, α=3.0)) == SqEuclidean() + @test binary_op(GammaRationalQuadraticKernel()) == SqEuclidean() + @test binary_op(GammaRationalQuadraticKernel(γ=2.0)) == SqEuclidean() + @test binary_op(GammaRationalQuadraticKernel(γ=2.0, α=3.0)) == SqEuclidean() # Standardised tests. TestUtils.test_interface(k, Float64) diff --git a/test/matrix/kernelmatrix.jl b/test/matrix/kernelmatrix.jl index 2f825c1ae..8e006cbaf 100644 --- a/test/matrix/kernelmatrix.jl +++ b/test/matrix/kernelmatrix.jl @@ -7,7 +7,7 @@ struct BaseSE <: KernelFunctions.Kernel end # are implemented in the package. That this happens to be an exponentiated quadratic kernel # is a complete coincidence. 
struct ToySimpleKernel <: SimpleKernel end -KernelFunctions.metric(::ToySimpleKernel) = SqEuclidean() +KernelFunctions.binary_op(::ToySimpleKernel) = SqEuclidean() KernelFunctions.kappa(::ToySimpleKernel, d) = exp(-d / 2) @testset "kernelmatrix" begin diff --git a/test/runtests.jl b/test/runtests.jl index f8c1cbf68..ed3b2114a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,7 +10,7 @@ using Test using Flux import Zygote, ForwardDiff, ReverseDiff, FiniteDifferences -using KernelFunctions: SimpleKernel, metric, kappa, ColVecs, RowVecs, TestUtils +using KernelFunctions: SimpleKernel, binary_op, kappa, ColVecs, RowVecs, TestUtils using KernelFunctions.TestUtils: test_interface From 86907011972ff2c4484757a3350d1c6569498ab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Tue, 24 Nov 2020 17:32:09 +0100 Subject: [PATCH 09/13] Correct order --- src/KernelFunctions.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/KernelFunctions.jl b/src/KernelFunctions.jl index 89fc6d1a1..497b80e7d 100644 --- a/src/KernelFunctions.jl +++ b/src/KernelFunctions.jl @@ -63,10 +63,11 @@ using StatsBase abstract type Kernel end abstract type SimpleKernel <: Kernel end -include("utils.jl") - abstract type AbstractBinaryOp end const BinaryOp = Union{Metric, AbstractBinaryOp} + +include("utils.jl") + include(joinpath("binary_op", "pairwise.jl")) include(joinpath("binary_op", "dotproduct.jl")) include(joinpath("binary_op", "delta.jl")) From 9423e7d5edf4321172c6fff095ae46eb3c65a1ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Wed, 25 Nov 2020 12:16:55 +0100 Subject: [PATCH 10/13] Relaxed types for pairwise and added methods for dotproduct --- src/binary_op/dotproduct.jl | 62 ++++++++++++++++++++++++++++++++----- src/binary_op/pairwise.jl | 17 +++++----- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/src/binary_op/dotproduct.jl b/src/binary_op/dotproduct.jl index d35ff4587..5a3ca7f44 100644 --- a/src/binary_op/dotproduct.jl +++ b/src/binary_op/dotproduct.jl @@ -1,15 +1,61 @@ struct DotProduct end # struct DotProduct <: Distances.UnionSemiMetric end -@inline function Distances._evaluate(::DotProduct, a::AbstractVector, b::AbstractVector) - @boundscheck if length(a) != length(b) - throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b)).")) + +(d::DotProduct)(a, b) = evaluate(d, a, b) +Distances.evaluate(::DotProduct, a, b) = dot(a, b) + +function Distances.pairwise(d::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a; dims=1) + dims in (1, 2) || throw(ArgumentError("dims should be 1 or 2 (got $dims)")) + m = size(a, dims) + n = size(b, dims) + P = Matrix{Distances.result_type(d, a, b)}(undef, m, n) + if dims == 1 + return _pairwise!(P, d, transpose(a), transpose(b)) + else + return _pairwise!(P, d, a, b) + end +end + +function Distances.pairwise!(P::AbstractMatrix, ::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a; dims=1) + dims = deprecated_dims(dims) + dims in (1, 2) || throw(ArgumentError("dims should be 1 or 2 (got $dims)")) + if dims == 1 + na, ma = size(a) + nb, mb = size(b) + ma == mb || throw(DimensionMismatch("The numbers of columns in a and b " * + "must match (got $ma and $mb).")) + else + ma, na = size(a) + mb, nb = size(b) + ma == mb || throw(DimensionMismatch("The numbers of rows in a and b " * + "must match (got $ma and $mb).")) + end + size(P) == (na, nb) || + throw(DimensionMismatch("Incorrect size of P (got $(size(P)), expected $((na, 
nb))).")) + if dims == 1 + _pairwise!(P, metric, transpose(a), transpose(b)) + else + _pairwise!(P, metric, a, b) end - return dot(a,b) end -Distances.result_type(::DotProduct, Ta::Type, Tb::Type) = promote_type(Ta, Tb) +function Distances._pairwise!(P::AbstractMatrix, ::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a) + for ij in CartesianIndices(P) + P[ij] = @views dot(a[:, ij[1]], b[:, ij[2]]) + end + return P +end + +# @inline function Distances._evaluate(::DotProduct, a::AbstractVector, b::AbstractVector) +# @boundscheck if length(a) != length(b) +# throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b)).")) +# end +# return dot(a,b) +# end + +# Distances.result_type(::DotProduct, Ta::Type, Tb::Type) = promote_type(Ta, Tb) -@inline Distances.eval_op(::DotProduct, a::Real, b::Real) = a * b -@inline (dist::DotProduct)(a::AbstractArray,b::AbstractArray) = Distances._evaluate(dist, a, b) -@inline (dist::DotProduct)(a::Number,b::Number) = a * b +# @inline Distances.eval_op(::DotProduct, a::Real, b::Real) = a * b +# @inline (dist::DotProduct)(a::AbstractArray,b::AbstractArray) = Distances._evaluate(dist, a, b) +# @inline (dist::DotProduct)(a::Number,b::Number) = a * b diff --git a/src/binary_op/pairwise.jl b/src/binary_op/pairwise.jl index 188e03299..f0a9ec072 100644 --- a/src/binary_op/pairwise.jl +++ b/src/binary_op/pairwise.jl @@ -1,41 +1,42 @@ # Add our own pairwise function to be able to apply it on vectors -pairwise(d::PreMetric, X::AbstractVector, Y::AbstractVector) = broadcast(d, X, permutedims(Y)) +pairwise(d::BinaryOp, X::AbstractVector, Y::AbstractVector) = broadcast(d, X, permutedims(Y)) -pairwise(d::PreMetric, X::AbstractVector) = pairwise(d, X, X) +pairwise(d::BinaryOp, X::AbstractVector) = pairwise(d, X, X) function pairwise!( out::AbstractMatrix, - d::PreMetric, + d::BinaryOp, X::AbstractVector, Y::AbstractVector, ) broadcast!(d, out, X, Y') end -pairwise!(out::AbstractMatrix, d::PreMetric, X::AbstractVector) = pairwise!(out, d, X, X) +pairwise!(out::AbstractMatrix, d::BinaryOp, X::AbstractVector) = pairwise!(out, d, X, X) -function pairwise(d::PreMetric, x::AbstractVector{<:Real}) +function pairwise(d::BinaryOp, x::AbstractVector{<:Real}) return Distances.pairwise(d, reshape(x, :, 1); dims = 1) end function pairwise( - d::PreMetric, + d::BinaryOp, x::AbstractVector{<:Real}, y::AbstractVector{<:Real}, ) return Distances.pairwise(d, reshape(x, :, 1), reshape(y, :, 1); dims = 1) end -function pairwise!(out::AbstractMatrix, d::PreMetric, x::AbstractVector{<:Real}) +function pairwise!(out::AbstractMatrix, d::BinaryOp, x::AbstractVector{<:Real}) return Distances.pairwise!(out, d, reshape(x, :, 1); dims = 1) end function pairwise!( out::AbstractMatrix, - d::PreMetric, + d::BinaryOp, x::AbstractVector{<:Real}, y::AbstractVector{<:Real}, ) return Distances.pairwise!(out, d, reshape(x, :, 1), reshape(y, :, 1); dims=1) end + From 546ec5eb511c98bb5069b4942d2b70b8acb185d1 Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Mon, 15 Feb 2021 11:21:52 +0100 Subject: [PATCH 11/13] Removed functions from pairwise --- src/binary_op/abstractbinaryop.jl | 80 +++++++++++++++++++++++++++++++ src/binary_op/dotproduct.jl | 44 ++--------------- src/binary_op/pairwise.jl | 42 ---------------- test/Project.toml | 2 +- 4 files changed, 84 insertions(+), 84 deletions(-) create mode 100644 src/binary_op/abstractbinaryop.jl delete mode 100644 src/binary_op/pairwise.jl diff --git a/src/binary_op/abstractbinaryop.jl 
b/src/binary_op/abstractbinaryop.jl new file mode 100644 index 000000000..d97e5c983 --- /dev/null +++ b/src/binary_op/abstractbinaryop.jl @@ -0,0 +1,80 @@ +## AbstractBinaryOp shadows the implementation of Distances.jl functions and types +## for types which are not metric by definition but benefit from all the +## pairwise machinery + +## pairwise functions for matrices +function Distances.pairwise(d::AbstractBinaryOp, a::AbstractMatrix, b::AbstractMatrix=a; dims=1) + dims in (1, 2) || throw(ArgumentError("dims should be 1 or 2 (got $dims)")) + m = size(a, dims) + n = size(b, dims) + P = Matrix{Distances.result_type(d, a, b)}(undef, m, n) + return mul!(P, a, b') + if dims == 1 + return _pairwise!(P, d, transpose(a), transpose(b)) + else + return _pairwise!(P, d, a, b) + end +end + +function Distances.pairwise!(P::AbstractMatrix, ::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a; dims=1) + dims = deprecated_dims(dims) + dims in (1, 2) || throw(ArgumentError("dims should be 1 or 2 (got $dims)")) + if dims == 1 + na, ma = size(a) + nb, mb = size(b) + ma == mb || throw(DimensionMismatch("The numbers of columns in a and b " * + "must match (got $ma and $mb).")) + else + ma, na = size(a) + mb, nb = size(b) + ma == mb || throw(DimensionMismatch("The numbers of rows in a and b " * + "must match (got $ma and $mb).")) + end + size(P) == (na, nb) || + throw(DimensionMismatch("Incorrect size of P (got $(size(P)), expected $((na, nb))).")) + if dims == 1 + _pairwise!(P, metric, transpose(a), transpose(b)) + else + _pairwise!(P, metric, a, b) + end +end + +## pairwise function for vectors +function pairwise(d::AbstractBinaryOp, X::AbstractVector, Y::AbstractVector=X) + return broadcast(d, X, permutedims(Y)) +end + +function pairwise!( + out::AbstractMatrix, + d::AbstractBinaryOp, + X::AbstractVector, + Y::AbstractVector=X, +) + broadcast!(d, out, X, permutedims(Y)) +end + +# function pairwise(d::BinaryOp, x::AbstractVector{<:Real}) +# return Distances.pairwise(d, reshape(x, :, 1); dims = 1) +# end + +# function pairwise( +# d::BinaryOp, +# x::AbstractVector{<:Real}, +# y::AbstractVector{<:Real}, +# ) +# return Distances.pairwise(d, reshape(x, :, 1), reshape(y, :, 1); dims = 1) +# end + +# function pairwise!(out::AbstractMatrix, d::BinaryOp, x::AbstractVector{<:Real}) +# return Distances.pairwise!(out, d, reshape(x, :, 1); dims = 1) +# end + +# function pairwise!( +# out::AbstractMatrix, +# d::BinaryOp, +# x::AbstractVector{<:Real}, +# y::AbstractVector{<:Real}, +# ) +# return Distances.pairwise!(out, d, reshape(x, :, 1), reshape(y, :, 1); dims=1) +# end + diff --git a/src/binary_op/dotproduct.jl b/src/binary_op/dotproduct.jl index 5a3ca7f44..370046d41 100644 --- a/src/binary_op/dotproduct.jl +++ b/src/binary_op/dotproduct.jl @@ -1,50 +1,12 @@ -struct DotProduct end +struct DotProduct <: AbstractBinaryOp end # struct DotProduct <: Distances.UnionSemiMetric end -(d::DotProduct)(a, b) = evaluate(d, a, b) +(d::DotProduct)(a, b) = Distances.evaluate(d, a, b) Distances.evaluate(::DotProduct, a, b) = dot(a, b) -function Distances.pairwise(d::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a; dims=1) - dims in (1, 2) || throw(ArgumentError("dims should be 1 or 2 (got $dims)")) - m = size(a, dims) - n = size(b, dims) - P = Matrix{Distances.result_type(d, a, b)}(undef, m, n) - if dims == 1 - return _pairwise!(P, d, transpose(a), transpose(b)) - else - return _pairwise!(P, d, a, b) - end -end - -function Distances.pairwise!(P::AbstractMatrix, ::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a; dims=1) - 
dims = deprecated_dims(dims) - dims in (1, 2) || throw(ArgumentError("dims should be 1 or 2 (got $dims)")) - if dims == 1 - na, ma = size(a) - nb, mb = size(b) - ma == mb || throw(DimensionMismatch("The numbers of columns in a and b " * - "must match (got $ma and $mb).")) - else - ma, na = size(a) - mb, nb = size(b) - ma == mb || throw(DimensionMismatch("The numbers of rows in a and b " * - "must match (got $ma and $mb).")) - end - size(P) == (na, nb) || - throw(DimensionMismatch("Incorrect size of P (got $(size(P)), expected $((na, nb))).")) - if dims == 1 - _pairwise!(P, metric, transpose(a), transpose(b)) - else - _pairwise!(P, metric, a, b) - end -end - function Distances._pairwise!(P::AbstractMatrix, ::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a) - for ij in CartesianIndices(P) - P[ij] = @views dot(a[:, ij[1]], b[:, ij[2]]) - end - return P + return mul!(P, a, transpose(b)) end # @inline function Distances._evaluate(::DotProduct, a::AbstractVector, b::AbstractVector) diff --git a/src/binary_op/pairwise.jl b/src/binary_op/pairwise.jl deleted file mode 100644 index f0a9ec072..000000000 --- a/src/binary_op/pairwise.jl +++ /dev/null @@ -1,42 +0,0 @@ -# Add our own pairwise function to be able to apply it on vectors - -pairwise(d::BinaryOp, X::AbstractVector, Y::AbstractVector) = broadcast(d, X, permutedims(Y)) - -pairwise(d::BinaryOp, X::AbstractVector) = pairwise(d, X, X) - -function pairwise!( - out::AbstractMatrix, - d::BinaryOp, - X::AbstractVector, - Y::AbstractVector, -) - broadcast!(d, out, X, Y') -end - -pairwise!(out::AbstractMatrix, d::BinaryOp, X::AbstractVector) = pairwise!(out, d, X, X) - -function pairwise(d::BinaryOp, x::AbstractVector{<:Real}) - return Distances.pairwise(d, reshape(x, :, 1); dims = 1) -end - -function pairwise( - d::BinaryOp, - x::AbstractVector{<:Real}, - y::AbstractVector{<:Real}, -) - return Distances.pairwise(d, reshape(x, :, 1), reshape(y, :, 1); dims = 1) -end - -function pairwise!(out::AbstractMatrix, d::BinaryOp, x::AbstractVector{<:Real}) - return Distances.pairwise!(out, d, reshape(x, :, 1); dims = 1) -end - -function pairwise!( - out::AbstractMatrix, - d::BinaryOp, - x::AbstractVector{<:Real}, - y::AbstractVector{<:Real}, -) - return Distances.pairwise!(out, d, reshape(x, :, 1), reshape(y, :, 1); dims=1) -end - diff --git a/test/Project.toml b/test/Project.toml index 8c057e260..f9e8a5cfb 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -15,7 +15,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] AxisArrays = "0.4.3" -Distances = "0.9, 0.10" +Distances = "0.10" FiniteDifferences = "0.10.8, 0.11" Flux = "0.10, 0.11" ForwardDiff = "0.10" From 470faf9ba00037ae41db71efa977b2df809131a0 Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Mon, 15 Feb 2021 12:28:45 +0100 Subject: [PATCH 12/13] Passing tests --- src/KernelFunctions.jl | 4 +-- src/binary_op/abstractbinaryop.jl | 25 ++++++++++----- src/binary_op/dotproduct.jl | 4 +-- test/binary_op/abstractbinaryop.jl | 34 +++++++++++++++++++++ test/{distances => binary_op}/delta.jl | 0 test/{distances => binary_op}/dotproduct.jl | 2 +- test/{distances => binary_op}/sinus.jl | 0 test/distances/pairwise.jl | 29 ------------------ test/runtests.jl | 12 ++++---- test/utils.jl | 16 +++++----- 10 files changed, 70 insertions(+), 56 deletions(-) create mode 100644 test/binary_op/abstractbinaryop.jl rename test/{distances => binary_op}/delta.jl (100%) rename test/{distances => binary_op}/dotproduct.jl (68%) rename test/{distances => binary_op}/sinus.jl (100%) delete mode 100644 
test/distances/pairwise.jl diff --git a/src/KernelFunctions.jl b/src/KernelFunctions.jl index 2348689ec..eecb871fa 100644 --- a/src/KernelFunctions.jl +++ b/src/KernelFunctions.jl @@ -68,11 +68,11 @@ abstract type Kernel end abstract type SimpleKernel <: Kernel end abstract type AbstractBinaryOp end -const BinaryOp = Union{Metric, AbstractBinaryOp} +const BinaryOp = Union{Distances.Metric, AbstractBinaryOp} include("utils.jl") -include(joinpath("binary_op", "pairwise.jl")) +include(joinpath("binary_op", "abstractbinaryop.jl")) include(joinpath("binary_op", "dotproduct.jl")) include(joinpath("binary_op", "delta.jl")) include(joinpath("binary_op", "sinus.jl")) diff --git a/src/binary_op/abstractbinaryop.jl b/src/binary_op/abstractbinaryop.jl index d97e5c983..ca34037d4 100644 --- a/src/binary_op/abstractbinaryop.jl +++ b/src/binary_op/abstractbinaryop.jl @@ -8,15 +8,15 @@ function Distances.pairwise(d::AbstractBinaryOp, a::AbstractMatrix, b::AbstractM m = size(a, dims) n = size(b, dims) P = Matrix{Distances.result_type(d, a, b)}(undef, m, n) - return mul!(P, a, b') if dims == 1 - return _pairwise!(P, d, transpose(a), transpose(b)) + return Distances._pairwise!(P, d, transpose(a), transpose(b)) else - return _pairwise!(P, d, a, b) + return Distances._pairwise!(P, d, a, b) end + return P end -function Distances.pairwise!(P::AbstractMatrix, ::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a; dims=1) +function Distances.pairwise!(P::AbstractMatrix, d::AbstractBinaryOp, a::AbstractMatrix, b::AbstractMatrix=a; dims=1) dims = deprecated_dims(dims) dims in (1, 2) || throw(ArgumentError("dims should be 1 or 2 (got $dims)")) if dims == 1 @@ -33,18 +33,27 @@ function Distances.pairwise!(P::AbstractMatrix, ::DotProduct, a::AbstractMatrix, size(P) == (na, nb) || throw(DimensionMismatch("Incorrect size of P (got $(size(P)), expected $((na, nb))).")) if dims == 1 - _pairwise!(P, metric, transpose(a), transpose(b)) + _pairwise!(P, d, transpose(a), transpose(b)) else - _pairwise!(P, metric, a, b) + _pairwise!(P, d, a, b) end + return P end +function Distances._pairwise!(P::AbstractMatrix, d::AbstractBinaryOp, a::AbstractMatrix, b::AbstractMatrix) + for ij in CartesianIndices(P) + P[ij] = @views d(a[:, ij[1]], b[:, ij[2]]) + end + return P +end + + ## pairwise function for vectors -function pairwise(d::AbstractBinaryOp, X::AbstractVector, Y::AbstractVector=X) +function Distances.pairwise(d::AbstractBinaryOp, X::AbstractVector, Y::AbstractVector=X) return broadcast(d, X, permutedims(Y)) end -function pairwise!( +function Distances.pairwise!( out::AbstractMatrix, d::AbstractBinaryOp, X::AbstractVector, diff --git a/src/binary_op/dotproduct.jl b/src/binary_op/dotproduct.jl index 370046d41..ab2dc5234 100644 --- a/src/binary_op/dotproduct.jl +++ b/src/binary_op/dotproduct.jl @@ -5,8 +5,8 @@ struct DotProduct <: AbstractBinaryOp end (d::DotProduct)(a, b) = Distances.evaluate(d, a, b) Distances.evaluate(::DotProduct, a, b) = dot(a, b) -function Distances._pairwise!(P::AbstractMatrix, ::DotProduct, a::AbstractMatrix, b::AbstractMatrix=a) - return mul!(P, a, transpose(b)) +function Distances._pairwise!(P::AbstractMatrix, ::KernelFunctions.DotProduct, a::AbstractMatrix, b::AbstractMatrix=a) + return mul!(P, transpose(a), b) end # @inline function Distances._evaluate(::DotProduct, a::AbstractVector, b::AbstractVector) diff --git a/test/binary_op/abstractbinaryop.jl b/test/binary_op/abstractbinaryop.jl new file mode 100644 index 000000000..dce31dc68 --- /dev/null +++ b/test/binary_op/abstractbinaryop.jl @@ -0,0 +1,34 @@ 
+@testset "abstractbinaryop" begin + using KernelFunctions: AbstractBinaryOp + rng = MersenneTwister(123456) + d = SqEuclidean() + Ns = (4, 5) + D = 3 + x = [randn(rng, D) for _ in 1:Ns[1]] + y = [randn(rng, D) for _ in 1:Ns[2]] + X = hcat(x...) + Y = hcat(y...) + K = zeros(Ns) + + struct Max <: AbstractBinaryOp end + + (d::Max)(a, b) = Distances.evaluate(d, a, b) + Distances.evaluate(::Max, a, b) = maximum(abs, a - b) + @test pairwise(d, x, y) ≈ pairwise(d, X, Y; dims=2) + @test pairwise(d, x) ≈ pairwise(d, X; dims=2) + pairwise!(K, d, x, y) + @test K ≈ pairwise(d, X, Y; dims=2) + K = zeros(Ns[1], Ns[1]) + pairwise!(K, d, x) + @test K ≈ pairwise(d, X; dims=2) + + x = randn(rng, 10) + X = reshape(x, :, 1) + y = randn(rng, 11) + Y = reshape(y, :, 1) + K = zeros(10, 11) + @test pairwise(d, x, y) ≈ pairwise(d, X, Y; dims=1) + @test pairwise(d, x) ≈ pairwise(d, X; dims=1) + pairwise!(K, d, x, y) + @test K ≈ pairwise(d, X, Y; dims=1) +end diff --git a/test/distances/delta.jl b/test/binary_op/delta.jl similarity index 100% rename from test/distances/delta.jl rename to test/binary_op/delta.jl diff --git a/test/distances/dotproduct.jl b/test/binary_op/dotproduct.jl similarity index 68% rename from test/distances/dotproduct.jl rename to test/binary_op/dotproduct.jl index b6e3691ed..958aa448b 100644 --- a/test/distances/dotproduct.jl +++ b/test/binary_op/dotproduct.jl @@ -2,7 +2,7 @@ A = rand(10, 5) B = rand(20, 5) d = KernelFunctions.DotProduct() - @test diag(pairwise(d, A; dims=2)) == [dot(A[:, i], A[:, i]) for i in 1:size(A, 2)] + @test diag(pairwise(d, A; dims=1)) == [dot(A[i, :], A[i, :]) for i in 1:size(A, 1)] @test_throws DimensionMismatch d(rand(3), rand(4)) @test d(3.0, 2.0) == 6.0 end diff --git a/test/distances/sinus.jl b/test/binary_op/sinus.jl similarity index 100% rename from test/distances/sinus.jl rename to test/binary_op/sinus.jl diff --git a/test/distances/pairwise.jl b/test/distances/pairwise.jl deleted file mode 100644 index 486097f52..000000000 --- a/test/distances/pairwise.jl +++ /dev/null @@ -1,29 +0,0 @@ -@testset "pairwise" begin - rng = MersenneTwister(123456) - d = SqEuclidean() - Ns = (4, 5) - D = 3 - x = [randn(rng, D) for _ in 1:Ns[1]] - y = [randn(rng, D) for _ in 1:Ns[2]] - X = hcat(x...) - Y = hcat(y...) 
- K = zeros(Ns) - - @test KernelFunctions.pairwise(d, x, y) ≈ pairwise(d, X, Y; dims=2) - @test KernelFunctions.pairwise(d, x) ≈ pairwise(d, X; dims=2) - KernelFunctions.pairwise!(K, d, x, y) - @test K ≈ pairwise(d, X, Y; dims=2) - K = zeros(Ns[1], Ns[1]) - KernelFunctions.pairwise!(K, d, x) - @test K ≈ pairwise(d, X; dims=2) - - x = randn(rng, 10) - X = reshape(x, :, 1) - y = randn(rng, 11) - Y = reshape(y, :, 1) - K = zeros(10, 11) - @test KernelFunctions.pairwise(d, x, y) ≈ pairwise(d, X, Y; dims=1) - @test KernelFunctions.pairwise(d, x) ≈ pairwise(d, X; dims=1) - KernelFunctions.pairwise!(K, d, x, y) - @test K ≈ pairwise(d, X, Y; dims=1) -end diff --git a/test/runtests.jl b/test/runtests.jl index 908ef1d1f..ae24e1bab 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -55,13 +55,13 @@ include("test_utils.jl") @testset "KernelFunctions" begin include("utils.jl") - @testset "distances" begin - include(joinpath("distances", "pairwise.jl")) - include(joinpath("distances", "dotproduct.jl")) - include(joinpath("distances", "delta.jl")) - include(joinpath("distances", "sinus.jl")) + @testset "binary_op" begin + include(joinpath("binary_op", "abstractbinaryop.jl")) + include(joinpath("binary_op", "dotproduct.jl")) + include(joinpath("binary_op", "delta.jl")) + include(joinpath("binary_op", "sinus.jl")) end - @info "Ran tests on Distances" + @info "Ran tests on binary_op" @testset "transform" begin include(joinpath("transform", "transform.jl")) diff --git a/test/utils.jl b/test/utils.jl index 8bdd16330..fce5478f8 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -29,15 +29,15 @@ Y = randn(rng, D, N + 1) DY = ColVecs(Y) - @test KernelFunctions.pairwise(SqEuclidean(), DX) ≈ + @test pairwise(SqEuclidean(), DX) ≈ pairwise(SqEuclidean(), X; dims=2) - @test KernelFunctions.pairwise(SqEuclidean(), DX, DY) ≈ + @test pairwise(SqEuclidean(), DX, DY) ≈ pairwise(SqEuclidean(), X, Y; dims=2) K = zeros(N, N) - KernelFunctions.pairwise!(K, SqEuclidean(), DX) + pairwise!(K, SqEuclidean(), DX) @test K ≈ pairwise(SqEuclidean(), X; dims=2) K = zeros(N, N + 1) - KernelFunctions.pairwise!(K, SqEuclidean(), DX, DY) + pairwise!(K, SqEuclidean(), DX, DY) @test K ≈ pairwise(SqEuclidean(), X, Y; dims=2) let @@ -68,15 +68,15 @@ Y = randn(rng, D + 1, N) DY = RowVecs(Y) - @test KernelFunctions.pairwise(SqEuclidean(), DX) ≈ + @test pairwise(SqEuclidean(), DX) ≈ pairwise(SqEuclidean(), X; dims=1) - @test KernelFunctions.pairwise(SqEuclidean(), DX, DY) ≈ + @test pairwise(SqEuclidean(), DX, DY) ≈ pairwise(SqEuclidean(), X, Y; dims=1) K = zeros(D, D) - KernelFunctions.pairwise!(K, SqEuclidean(), DX) + pairwise!(K, SqEuclidean(), DX) @test K ≈ pairwise(SqEuclidean(), X; dims=1) K = zeros(D, D + 1) - KernelFunctions.pairwise!(K, SqEuclidean(), DX, DY) + pairwise!(K, SqEuclidean(), DX, DY) @test K ≈ pairwise(SqEuclidean(), X, Y; dims=1) let From 14a376276dcb2acea694cdd5ed2e98b4f9913bff Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Mon, 15 Feb 2021 13:41:49 +0100 Subject: [PATCH 13/13] Fixes and update docs --- docs/src/index.md | 2 +- docs/src/metrics.md | 27 +++++++++++++++++++-------- src/basekernels/fbm.jl | 8 ++++---- src/binary_op/abstractbinaryop.jl | 28 ++-------------------------- src/matrix/kernelmatrix.jl | 8 ++++---- src/utils.jl | 24 ++++++++++++------------ 6 files changed, 42 insertions(+), 55 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index 719e6bb79..bf0f002a2 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -11,7 +11,7 @@ The main goals of this package compared to its 
predecessors/concurrents in [MLKe The methodology of how kernels are computed is quite simple and is done in three phases : - A `Transform` object is applied sample-wise on every sample -- The pairwise matrix is computed using [Distances.jl](https://github.com/JuliaStats/Distances.jl) by using a `Metric` proper to each kernel +- The pairwise matrix is computed using [Distances.jl](https://github.com/JuliaStats/Distances.jl) by using a `BinaryOp` like a `Metric` proper to each kernel - The `Kernel` function is applied element-wise on the pairwise matrix For a quick introduction on how to use it go to [User guide](@ref) diff --git a/docs/src/metrics.md b/docs/src/metrics.md index be8869094..0bff836fc 100644 --- a/docs/src/metrics.md +++ b/docs/src/metrics.md @@ -1,4 +1,4 @@ -# Metrics +# Binary Operations `SimpleKernel` implementations rely on [Distances.jl](https://github.com/JuliaStats/Distances.jl) for efficiently computing the pairwise matrix. This requires a distance measure or metric, such as the commonly used `SqEuclidean` and `Euclidean`. @@ -11,21 +11,32 @@ KernelFunctions.binary_op(::CustomKernel) = SqEuclidean() However, there are kernels that can be implemented efficiently using "metrics" that do not respect all the definitions expected by Distances.jl. For this reason, KernelFunctions.jl provides additional "metrics" such as `DotProduct` ($\langle x, y \rangle$) and `Delta` ($\delta(x,y)$). -## Adding a new metric +## Adding a new binary operation -If you want to create a new "metric" just implement the following: +If you want to create a new binary operation you have the choice. +If your operation respects all [`PreMetric` conditions](https://en.wikipedia.org/wiki/Metric_(mathematics)#Generalized_metrics) you can just implement the following: ```julia -struct Delta <: Distances.PreMetric -end +struct Delta <: Distances.PreMetric end -@inline function Distances._evaluate(::Delta,a::AbstractVector{T},b::AbstractVector{T}) where {T} +@inline function Distances._evaluate( + ::Delta, + a::AbstractVector{T}, + b::AbstractVector{T} + ) where {T} @boundscheck if length(a) != length(b) throw(DimensionMismatch("first array has length $(length(a)) which does not match the length of the second, $(length(b)).")) end return a==b end -@inline (dist::Delta)(a::AbstractArray,b::AbstractArray) = Distances._evaluate(dist,a,b) -@inline (dist::Delta)(a::Number,b::Number) = a==b +@inline (dist::Delta)(a::AbstractArray, b::AbstractArray) = Distances._evaluate(dist,a,b) +@inline (dist::Delta)(a::Number, b::Number) = a==b ``` + +However if it somehow does not respect some of the conditions (for instance `d(x, y) ≥ 0` for the `DotProduct`), we have a similar backend: +```julia +struct DotProduct <: KernelFunctions.AbstractBinaryOp end +(d::DotProduct)(a, b) = evaluate(d, a,b) +Distances.evaluate(::DotProduct, a, b) = dot(a, b) +``` \ No newline at end of file diff --git a/src/basekernels/fbm.jl b/src/basekernels/fbm.jl index fad6d70f2..61d7485fb 100644 --- a/src/basekernels/fbm.jl +++ b/src/basekernels/fbm.jl @@ -46,26 +46,26 @@ _mod(x::RowVecs) = vec(sum(abs2, x.X; dims=2)) function kernelmatrix(κ::FBMKernel, x::AbstractVector) modx = _mod(x) - modxx = pairwise(SqEuclidean(), x) + modxx = Distances.pairwise(SqEuclidean(), x) return _fbm.(modx, modx', modxx, κ.h) end function kernelmatrix!(K::AbstractMatrix, κ::FBMKernel, x::AbstractVector) modx = _mod(x) - pairwise!(K, SqEuclidean(), x) + Distances.pairwise!(K, SqEuclidean(), x) K .= _fbm.(modx, modx', K, κ.h) return K end function 
kernelmatrix(κ::FBMKernel, x::AbstractVector, y::AbstractVector) - modxy = pairwise(SqEuclidean(), x, y) + modxy = Distances.pairwise(SqEuclidean(), x, y) return _fbm.(_mod(x), _mod(y)', modxy, κ.h) end function kernelmatrix!( K::AbstractMatrix, κ::FBMKernel, x::AbstractVector, y::AbstractVector ) - pairwise!(K, SqEuclidean(), x, y) + Distances.pairwise!(K, SqEuclidean(), x, y) K .= _fbm.(_mod(x), _mod(y)', K, κ.h) return K end diff --git a/src/binary_op/abstractbinaryop.jl b/src/binary_op/abstractbinaryop.jl index ca34037d4..0598e3ef0 100644 --- a/src/binary_op/abstractbinaryop.jl +++ b/src/binary_op/abstractbinaryop.jl @@ -47,7 +47,6 @@ function Distances._pairwise!(P::AbstractMatrix, d::AbstractBinaryOp, a::Abstrac return P end - ## pairwise function for vectors function Distances.pairwise(d::AbstractBinaryOp, X::AbstractVector, Y::AbstractVector=X) return broadcast(d, X, permutedims(Y)) @@ -62,28 +61,5 @@ function Distances.pairwise!( broadcast!(d, out, X, permutedims(Y)) end -# function pairwise(d::BinaryOp, x::AbstractVector{<:Real}) -# return Distances.pairwise(d, reshape(x, :, 1); dims = 1) -# end - -# function pairwise( -# d::BinaryOp, -# x::AbstractVector{<:Real}, -# y::AbstractVector{<:Real}, -# ) -# return Distances.pairwise(d, reshape(x, :, 1), reshape(y, :, 1); dims = 1) -# end - -# function pairwise!(out::AbstractMatrix, d::BinaryOp, x::AbstractVector{<:Real}) -# return Distances.pairwise!(out, d, reshape(x, :, 1); dims = 1) -# end - -# function pairwise!( -# out::AbstractMatrix, -# d::BinaryOp, -# x::AbstractVector{<:Real}, -# y::AbstractVector{<:Real}, -# ) -# return Distances.pairwise!(out, d, reshape(x, :, 1), reshape(y, :, 1); dims=1) -# end - +## Additional needed Helpers +Distances.result_type(::AbstractBinaryOp, Ta::Type, Tb::Type) = promote_type(Ta, Tb) \ No newline at end of file diff --git a/src/matrix/kernelmatrix.jl b/src/matrix/kernelmatrix.jl index 4c84b5007..65212ddea 100644 --- a/src/matrix/kernelmatrix.jl +++ b/src/matrix/kernelmatrix.jl @@ -79,7 +79,7 @@ kernelmatrix_diag(κ::Kernel, x::AbstractVector, y::AbstractVector) = map(κ, x, function kernelmatrix!(K::AbstractMatrix, κ::SimpleKernel, x::AbstractVector) validate_inplace_dims(K, x) - pairwise!(K, binary_op(κ), x) + Distances.pairwise!(K, binary_op(κ), x) return map!(d -> kappa(κ, d), K, K) end @@ -87,17 +87,17 @@ function kernelmatrix!( K::AbstractMatrix, κ::SimpleKernel, x::AbstractVector, y::AbstractVector ) validate_inplace_dims(K, x, y) - pairwise!(K, binary_op(κ), x, y) + Distances.pairwise!(K, binary_op(κ), x, y) return map!(d -> kappa(κ, d), K, K) end function kernelmatrix(κ::SimpleKernel, x::AbstractVector) - return map(d -> kappa(κ, d), pairwise(binary_op(κ), x)) + return map(d -> kappa(κ, d), Distances.pairwise(binary_op(κ), x)) end function kernelmatrix(κ::SimpleKernel, x::AbstractVector, y::AbstractVector) validate_inputs(x, y) - return map(d -> kappa(κ, d), pairwise(binary_op(κ), x, y)) + return map(d -> kappa(κ, d), Distances.pairwise(binary_op(κ), x, y)) end # diff --git a/src/utils.jl b/src/utils.jl index fc3d5f5c5..a868a08d1 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -52,18 +52,18 @@ Base.setindex!(D::ColVecs, v::AbstractVector, i) = setindex!(D.X, v, :, i) dim(x::ColVecs) = size(x.X, 1) -pairwise(d::BinaryOp, x::ColVecs) = Distances.pairwise(d, x.X; dims=2) -pairwise(d::BinaryOp, x::ColVecs, y::ColVecs) = Distances.pairwise(d, x.X, y.X; dims=2) -function pairwise(d::BinaryOp, x::AbstractVector, y::ColVecs) +Distances.pairwise(d::BinaryOp, x::ColVecs) = Distances.pairwise(d, x.X; 
dims=2) +Distances.pairwise(d::BinaryOp, x::ColVecs, y::ColVecs) = Distances.pairwise(d, x.X, y.X; dims=2) +function Distances.pairwise(d::BinaryOp, x::AbstractVector, y::ColVecs) return Distances.pairwise(d, reduce(hcat, x), y.X; dims=2) end -function pairwise(d::BinaryOp, x::ColVecs, y::AbstractVector) +function Distances.pairwise(d::BinaryOp, x::ColVecs, y::AbstractVector) return Distances.pairwise(d, x.X, reduce(hcat, y); dims=2) end -function pairwise!(out::AbstractMatrix, d::BinaryOp, x::ColVecs) +function Distances.pairwise!(out::AbstractMatrix, d::BinaryOp, x::ColVecs) return Distances.pairwise!(out, d, x.X; dims=2) end -function pairwise!(out::AbstractMatrix, d::BinaryOp, x::ColVecs, y::ColVecs) +function Distances.pairwise!(out::AbstractMatrix, d::BinaryOp, x::ColVecs, y::ColVecs) return Distances.pairwise!(out, d, x.X, y.X; dims=2) end @@ -91,18 +91,18 @@ Base.setindex!(D::RowVecs, v::AbstractVector, i) = setindex!(D.X, v, i, :) dim(x::RowVecs) = size(x.X, 2) -pairwise(d::BinaryOp, x::RowVecs) = Distances.pairwise(d, x.X; dims=1) -pairwise(d::BinaryOp, x::RowVecs, y::RowVecs) = Distances.pairwise(d, x.X, y.X; dims=1) -function pairwise(d::BinaryOp, x::AbstractVector, y::RowVecs) +Distances.pairwise(d::BinaryOp, x::RowVecs) = Distances.pairwise(d, x.X; dims=1) +Distances.pairwise(d::BinaryOp, x::RowVecs, y::RowVecs) = Distances.pairwise(d, x.X, y.X; dims=1) +function Distances.pairwise(d::BinaryOp, x::AbstractVector, y::RowVecs) return Distances.pairwise(d, permutedims(reduce(hcat, x)), y.X; dims=1) end -function pairwise(d::BinaryOp, x::RowVecs, y::AbstractVector) +function Distances.pairwise(d::BinaryOp, x::RowVecs, y::AbstractVector) return Distances.pairwise(d, x.X, permutedims(reduce(hcat, y)); dims=1) end -function pairwise!(out::AbstractMatrix, d::BinaryOp, x::RowVecs) +function Distances.pairwise!(out::AbstractMatrix, d::BinaryOp, x::RowVecs) return Distances.pairwise!(out, d, x.X; dims=1) end -function pairwise!(out::AbstractMatrix, d::BinaryOp, x::RowVecs, y::RowVecs) +function Distances.pairwise!(out::AbstractMatrix, d::BinaryOp, x::RowVecs, y::RowVecs) return Distances.pairwise!(out, d, x.X, y.X; dims=1) end
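
Taken together, the series leaves kernels computing their matrices through either a `Distances.Metric` or a custom `AbstractBinaryOp`. Below is a minimal end-to-end sketch of that final state; the `MaxDiff` operation and `MaxDiffKernel` are illustrative names only, not part of the patch.

```julia
using KernelFunctions, Distances
using KernelFunctions: AbstractBinaryOp, ColVecs

# A non-metric binary operation, following the same pattern as DotProduct.
struct MaxDiff <: AbstractBinaryOp end
(d::MaxDiff)(a, b) = Distances.evaluate(d, a, b)
Distances.evaluate(::MaxDiff, a, b) = maximum(abs, a - b)

# A SimpleKernel built on it: k(x, y) = exp(-maxᵢ |xᵢ - yᵢ|).
struct MaxDiffKernel <: KernelFunctions.SimpleKernel end
KernelFunctions.binary_op(::MaxDiffKernel) = MaxDiff()
KernelFunctions.kappa(::MaxDiffKernel, d) = exp(-d)

x = ColVecs(randn(3, 4))          # 4 samples of dimension 3
kernelmatrix(MaxDiffKernel(), x)  # 4×4 matrix via the generic AbstractBinaryOp pairwise path
```

The point of the `BinaryOp` union is that an operation like this, which does not satisfy the `PreMetric` axioms, can still reuse the same `kernelmatrix` and pairwise machinery as the built-in metrics.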