diff --git a/.github/workflows/test_GraphNeuralNetworks.yml b/.github/workflows/test_GraphNeuralNetworks.yml
index c92899661..065866e33 100644
--- a/.github/workflows/test_GraphNeuralNetworks.yml
+++ b/.github/workflows/test_GraphNeuralNetworks.yml
@@ -15,7 +15,7 @@ jobs:
       matrix:
         version:
           - '1.10' # Replace this with the minimum Julia version that your package supports.
-          # - '1' #  '1' will automatically expand to the latest stable 1.x release of Julia.
+          - '1' #  '1' will automatically expand to the latest stable 1.x release of Julia.
           # - 'pre'
         os:
           - ubuntu-latest
diff --git a/GNNGraphs/test/test_utils.jl b/GNNGraphs/test/test_utils.jl
index fe8f9a997..56e298311 100644
--- a/GNNGraphs/test/test_utils.jl
+++ b/GNNGraphs/test/test_utils.jl
@@ -5,224 +5,3 @@ function ngradient(f, x...)
     fdm = central_fdm(5, 1)
     return FiniteDifferences.grad(fdm, f, x...)
 end
-
-const rule_config = Zygote.ZygoteRuleConfig()
-
-# Using this until https://github.com/JuliaDiff/FiniteDifferences.jl/issues/188 is fixed
-function FiniteDifferences.to_vec(x::Integer)
-    Integer_from_vec(v) = x
-    return Int[x], Integer_from_vec
-end
-
-# Test that forward pass on cpu and gpu are the same. 
-# Tests also gradient on cpu and gpu comparing with
-# finite difference methods.
-# Test gradients with respects to layer weights and to input. 
-# If `g` has edge features, it is assumed that the layer can 
-# use them in the forward pass as `l(g, x, e)`.
-# Test also gradient with respect to `e`. 
-function test_layer(l, g::GNNGraph; atol = 1e-5, rtol = 1e-5,
-                    exclude_grad_fields = [],
-                    verbose = false,
-                    test_gpu = TEST_GPU,
-                    outsize = nothing,
-                    outtype = :node)
-
-    # TODO these give errors, probably some bugs in ChainRulesTestUtils
-    # test_rrule(rule_config, x -> l(g, x), x; rrule_f=rrule_via_ad, check_inferred=false)
-    # test_rrule(rule_config, l -> l(g, x), l; rrule_f=rrule_via_ad, check_inferred=false)
-
-    isnothing(node_features(g)) && error("Plese add node data to the input graph")
-    fdm = central_fdm(5, 1)
-
-    x = node_features(g)
-    e = edge_features(g)
-    use_edge_feat = !isnothing(e)
-
-    x64, e64, l64, g64 = to64.([x, e, l, g]) # needed for accurate FiniteDifferences' grad
-    xgpu, egpu, lgpu, ggpu = gpu.([x, e, l, g])
-
-    f(l, g::GNNGraph) = l(g)
-    f(l, g::GNNGraph, x, e) = use_edge_feat ? l(g, x, e) : l(g, x)
-
-    loss(l, g::GNNGraph) =
-        if outtype == :node
-            sum(node_features(f(l, g)))
-        elseif outtype == :edge
-            sum(edge_features(f(l, g)))
-        elseif outtype == :graph
-            sum(graph_features(f(l, g)))
-        elseif outtype == :node_edge
-            gnew = f(l, g)
-            sum(node_features(gnew)) + sum(edge_features(gnew))
-        end
-
-    function loss(l, g::GNNGraph, x, e)
-        y = f(l, g, x, e)
-        if outtype == :node_edge
-            return sum(y[1]) + sum(y[2])
-        else
-            return sum(y)
-        end
-    end
-
-    # TEST OUTPUT
-    y = f(l, g, x, e)
-    if outtype == :node_edge
-        @assert y isa Tuple
-        @test eltype(y[1]) == eltype(x)
-        @test eltype(y[2]) == eltype(e)
-        @test all(isfinite, y[1])
-        @test all(isfinite, y[2])
-        if !isnothing(outsize)
-            @test size(y[1]) == outsize[1]
-            @test size(y[2]) == outsize[2]
-        end
-    else
-        @test eltype(y) == eltype(x)
-        @test all(isfinite, y)
-        if !isnothing(outsize)
-            @test size(y) == outsize
-        end
-    end
-
-    # test same output on different graph formats
-    gcoo = GNNGraph(g, graph_type = :coo)
-    ycoo = f(l, gcoo, x, e)
-    if outtype == :node_edge
-        @test ycoo[1] ≈ y[1]
-        @test ycoo[2] ≈ y[2]
-    else
-        @test ycoo ≈ y
-    end
-
-    g′ = f(l, g)
-    if outtype == :node
-        @test g′.ndata.x ≈ y
-    elseif outtype == :edge
-        @test g′.edata.e ≈ y
-    elseif outtype == :graph
-        @test g′.gdata.u ≈ y
-    elseif outtype == :node_edge
-        @test g′.ndata.x ≈ y[1]
-        @test g′.edata.e ≈ y[2]
-    else
-        @error "wrong outtype $outtype"
-    end
-    if test_gpu
-        ygpu = f(lgpu, ggpu, xgpu, egpu)
-        if outtype == :node_edge
-            @test ygpu[1] isa CuArray
-            @test eltype(ygpu[1]) == eltype(xgpu)
-            @test Array(ygpu[1]) ≈ y[1]
-            @test ygpu[2] isa CuArray
-            @test eltype(ygpu[2]) == eltype(xgpu)
-            @test Array(ygpu[2]) ≈ y[2]
-        else
-            @test ygpu isa CuArray
-            @test eltype(ygpu) == eltype(xgpu)
-            @test Array(ygpu) ≈ y
-        end
-    end
-
-    # TEST x INPUT GRADIENT
-    x̄ = gradient(x -> loss(l, g, x, e), x)[1]
-    x̄_fd = FiniteDifferences.grad(fdm, x64 -> loss(l64, g64, x64, e64), x64)[1]
-    @test eltype(x̄) == eltype(x)
-    @test x̄≈x̄_fd atol=atol rtol=rtol
-
-    if test_gpu
-        x̄gpu = gradient(xgpu -> loss(lgpu, ggpu, xgpu, egpu), xgpu)[1]
-        @test x̄gpu isa CuArray
-        @test eltype(x̄gpu) == eltype(x)
-        @test Array(x̄gpu)≈x̄ atol=atol rtol=rtol
-    end
-
-    # TEST e INPUT GRADIENT
-    if e !== nothing
-        verbose && println("Test e gradient cpu")
-        ē = gradient(e -> loss(l, g, x, e), e)[1]
-        ē_fd = FiniteDifferences.grad(fdm, e64 -> loss(l64, g64, x64, e64), e64)[1]
-        @test eltype(ē) == eltype(e)
-        @test ē≈ē_fd atol=atol rtol=rtol
-
-        if test_gpu
-            verbose && println("Test e gradient gpu")
-            ēgpu = gradient(egpu -> loss(lgpu, ggpu, xgpu, egpu), egpu)[1]
-            @test ēgpu isa CuArray
-            @test eltype(ēgpu) == eltype(ē)
-            @test Array(ēgpu)≈ē atol=atol rtol=rtol
-        end
-    end
-
-    # TEST LAYER GRADIENT - l(g, x, e) 
-    l̄ = gradient(l -> loss(l, g, x, e), l)[1]
-    l̄_fd = FiniteDifferences.grad(fdm, l64 -> loss(l64, g64, x64, e64), l64)[1]
-    test_approx_structs(l, l̄, l̄_fd; atol, rtol, exclude_grad_fields, verbose)
-
-    if test_gpu
-        l̄gpu = gradient(lgpu -> loss(lgpu, ggpu, xgpu, egpu), lgpu)[1]
-        test_approx_structs(lgpu, l̄gpu, l̄; atol, rtol, exclude_grad_fields, verbose)
-    end
-
-    # TEST LAYER GRADIENT - l(g)
-    l̄ = gradient(l -> loss(l, g), l)[1]
-    test_approx_structs(l, l̄, l̄_fd; atol, rtol, exclude_grad_fields, verbose)
-
-    return true
-end
-
-function test_approx_structs(l, l̄, l̄fd; atol = 1e-5, rtol = 1e-5,
-                             exclude_grad_fields = [],
-                             verbose = false)
-    l̄ = l̄ isa Base.RefValue ? l̄[] : l̄           # Zygote wraps gradient of mutables in RefValue 
-    l̄fd = l̄fd isa Base.RefValue ? l̄fd[] : l̄fd           # Zygote wraps gradient of mutables in RefValue 
-
-    for f in fieldnames(typeof(l))
-        f ∈ exclude_grad_fields && continue
-        verbose && println("Test gradient of field $f...")
-        x, g, gfd = getfield(l, f), getfield(l̄, f), getfield(l̄fd, f)
-        test_approx_structs(x, g, gfd; atol, rtol, exclude_grad_fields, verbose)
-        verbose && println("... field $f done!")
-    end
-    return true
-end
-
-function test_approx_structs(x, g::Nothing, gfd; atol, rtol, kws...)
-    # finite diff gradients has to be zero if present
-    @test !(gfd isa AbstractArray) || isapprox(gfd, fill!(similar(gfd), 0); atol, rtol)
-end
-
-function test_approx_structs(x::Union{AbstractArray, Number},
-                             g::Union{AbstractArray, Number}, gfd; atol, rtol, kws...)
-    @test eltype(g) == eltype(x)
-    if x isa CuArray
-        @test g isa CuArray
-        g = Array(g)
-    end
-    @test g≈gfd atol=atol rtol=rtol
-end
-
-"""
-    to32(m)
-
-Convert the `eltype` of model's float parameters to `Float32`.
-Preserves integer arrays.
-"""
-to32(m) = _paramtype(Float32, m)
-
-"""
-    to64(m)
-
-Convert the `eltype` of model's float parameters to `Float64`.
-Preserves integer arrays.
-"""
-to64(m) = _paramtype(Float64, m)
-
-struct GNNEltypeAdaptor{T} end
-
-Adapt.adapt_storage(::GNNEltypeAdaptor{T}, x::AbstractArray{<:AbstractFloat}) where T = convert(AbstractArray{T}, x)
-Adapt.adapt_storage(::GNNEltypeAdaptor{T}, x::AbstractArray{<:Integer}) where T = x
-Adapt.adapt_storage(::GNNEltypeAdaptor{T}, x::AbstractArray{<:Number}) where T = convert(AbstractArray{T}, x)
-
-_paramtype(::Type{T}, m) where T = fmap(adapt(GNNEltypeAdaptor{T}()), m)
diff --git a/GNNlib/src/layers/conv.jl b/GNNlib/src/layers/conv.jl
index e310fa81c..8b378cbdc 100644
--- a/GNNlib/src/layers/conv.jl
+++ b/GNNlib/src/layers/conv.jl
@@ -253,7 +253,6 @@ function gin_conv(l, g::AbstractGNNGraph, x)
     xj, xi = expand_srcdst(g, x) 
  
     m = propagate(copy_xj, g, l.aggr, xj = xj)
-    
     return l.nn((1 .+ ofeltype(xi, l.ϵ)) .* xi .+ m)
 end
 
diff --git a/GraphNeuralNetworks/Project.toml b/GraphNeuralNetworks/Project.toml
index 29aaf3acd..dabc74ac5 100644
--- a/GraphNeuralNetworks/Project.toml
+++ b/GraphNeuralNetworks/Project.toml
@@ -33,28 +33,26 @@ LinearAlgebra = "1"
 MLUtils = "0.4"
 MacroTools = "0.5"
 NNlib = "0.9"
+Pkg = "1"
 Random = "1"
 Reexport = "1"
 Statistics = "1"
 TestItemRunner = "1.0.5"
-cuDNN = "1"
 julia = "1.10"
 
 [extras]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
-cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [targets]
-test = ["Test", "TestItemRunner", "MLDatasets", "Adapt", "DataFrames", "InlineStrings", 
-      "SparseArrays", "Graphs", "Zygote", "FiniteDifferences", "ChainRulesTestUtils", "CUDA", "cuDNN"]
+test = ["Test", "TestItemRunner", "Pkg", "MLDatasets", "Adapt", "DataFrames", "InlineStrings", "SparseArrays", "Graphs", "Zygote", "FiniteDifferences", "ChainRulesTestUtils"]
diff --git a/GraphNeuralNetworks/test/examples/node_classification_cora.jl b/GraphNeuralNetworks/test/examples/node_classification_cora.jl
index cfcdc2d1c..9a424a26d 100644
--- a/GraphNeuralNetworks/test/examples/node_classification_cora.jl
+++ b/GraphNeuralNetworks/test/examples/node_classification_cora.jl
@@ -1,14 +1,13 @@
-@testitem "Training Example" setup=[TestModule] begin
-    using .TestModule
+@testmodule TrainingExampleModule begin
     using Flux
     using Flux: onecold, onehotbatch
     using Flux.Losses: logitcrossentropy
     using GraphNeuralNetworks
     using MLDatasets: Cora
     using Statistics, Random
-    using CUDA
-    CUDA.allowscalar(false)
-
+    using Test
+    ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"
+    
     function eval_loss_accuracy(X, y, ids, model, g)
         ŷ = model(g, X)
         l = logitcrossentropy(ŷ[:, ids], y[:, ids])
@@ -21,7 +20,7 @@
         η = 5.0f-3            # learning rate
         epochs = 10         # number of epochs
         seed = 17           # set seed > 0 for reproducibility
-        usecuda = false     # if true use cuda (if available)
+        use_gpu = false     # if true use gpu (if available)
         nhidden = 64        # dimension of hidden features
     end
 
@@ -29,11 +28,11 @@
         args = Args(; kws...)
         args.seed > 0 && Random.seed!(args.seed)
 
-        if args.usecuda && CUDA.functional()
-            device = Flux.gpu
-            args.seed > 0 && CUDA.seed!(args.seed)
+        if args.use_gpu
+            device = gpu_device(force=true)
+            args.seed > 0 && Random.seed!(default_device_rng(device), args.seed)
         else
-            device = Flux.cpu
+            device = cpu_device()
         end
 
         # LOAD DATA
@@ -41,7 +40,7 @@
         classes = dataset.metadata["classes"]
         g = mldataset2gnngraph(dataset) |> device
         X = g.ndata.features
-        y = onehotbatch(g.ndata.targets |> cpu, classes) |> device # remove when https://github.com/FluxML/Flux.jl/pull/1959 tagged
+        y = onehotbatch(g.ndata.targets, classes)
         train_mask = g.ndata.train_mask
         test_mask = g.ndata.test_mask
         ytrain = y[:, train_mask]
@@ -78,7 +77,7 @@
         return train_res, test_res
     end
 
-    function train_many(; usecuda = false)
+    function train_many(; use_gpu = false)
         for (layer, Layer) in [
             ("GCNConv", (nin, nout) -> GCNConv(nin => nout, relu)),
             ("ResGatedGraphConv", (nin, nout) -> ResGatedGraphConv(nin => nout, relu)),
@@ -96,16 +95,21 @@
             ## ("EdgeConv",(nin, nout) -> EdgeConv(Dense(2nin, nout, relu))), # Fits the training set but does not generalize well
         ]
             @show layer
-            @time train_res, test_res = train(Layer; usecuda, verbose = false)
+            @time train_res, test_res = train(Layer; use_gpu, verbose = false)
             # @show train_res, test_res
             @test train_res.acc > 94
             @test test_res.acc > 69
         end
     end
+end # module
+
+@testitem "training example" setup=[TrainingExampleModule] begin
+    using .TrainingExampleModule
+    TrainingExampleModule.train_many()
+end
 
-    train_many(usecuda = false)
-    # #TODO
-    # if TEST_GPU
-    #     train_many(usecuda = true)
-    # end
+@testitem "training example GPU" setup=[TrainingExampleModule] tags=[:gpu] begin
+    using .TrainingExampleModule
+    TrainingExampleModule.train_many(use_gpu = true)
 end
+
diff --git a/GraphNeuralNetworks/test/layers/basic.jl b/GraphNeuralNetworks/test/layers/basic.jl
index caad9458a..bce86b75d 100644
--- a/GraphNeuralNetworks/test/layers/basic.jl
+++ b/GraphNeuralNetworks/test/layers/basic.jl
@@ -18,7 +18,7 @@
 
         Flux.testmode!(gnn)
 
-        test_layer(gnn, g, rtol = 1e-5, exclude_grad_fields = [:μ, :σ²])
+        test_gradients(gnn, g, x, rtol = 1e-5)
 
         @testset "constructor with names" begin
             m = GNNChain(GCNConv(din => d),
@@ -53,7 +53,7 @@
 
             Flux.trainmode!(gnn)
 
-            test_layer(gnn, g, rtol = 1e-4, atol=1e-4, exclude_grad_fields = [:μ, :σ²])
+            test_gradients(gnn, g, x, rtol = 1e-4, atol=1e-4)
         end
     end
 
diff --git a/GraphNeuralNetworks/test/layers/conv.jl b/GraphNeuralNetworks/test/layers/conv.jl
index 238315a4f..88c6282fb 100644
--- a/GraphNeuralNetworks/test/layers/conv.jl
+++ b/GraphNeuralNetworks/test/layers/conv.jl
@@ -6,18 +6,26 @@ end
 
 @testitem "GCNConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
-    l = GCNConv(D_IN => D_OUT)
-    for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
-    end
+    @testset "basic" begin
+        l = GCNConv(D_IN => D_OUT)
+        for g in TEST_GRAPHS
+            @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        end
 
-    l = GCNConv(D_IN => D_OUT, tanh, bias = false)
-    for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
-    end
+        l = GCNConv(D_IN => D_OUT, tanh, bias = false)
+        for g in TEST_GRAPHS
+            @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        end
 
-    l = GCNConv(D_IN => D_OUT, add_self_loops = false)
-    test_layer(l, TEST_GRAPHS[1], rtol = RTOL_HIGH, outsize = (D_OUT, TEST_GRAPHS[1].num_nodes))
+        l = GCNConv(D_IN => D_OUT, add_self_loops = false)
+        for g in TEST_GRAPHS
+            has_isolated_nodes(g) && continue
+            @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
+        end
+    end
 
     @testset "edge weights & custom normalization $GRAPH_T" for GRAPH_T in GRAPH_TYPES
         s = [2, 3, 1, 3, 1, 2]
@@ -40,20 +48,34 @@ end
         g = GNNGraph((s, t, w), ndata = x, graph_type = GRAPH_T, edata = w)
         l = GCNConv(1 => 1, add_self_loops = false, use_edge_weight = true)
         @test gradient(w -> sum(l(g, x, w)), w)[1] isa AbstractVector{Float32}   # redundant test but more explicit
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (1, g.num_nodes))
+        @test size(l(g, x, w)) == (1, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 
     @testset "conv_weight" begin
         l = GraphNeuralNetworks.GCNConv(D_IN => D_OUT)
         w = zeros(Float32, D_OUT, D_IN)
-        g1 = GNNGraph(TEST_GRAPHS[1], ndata = ones(Float32, D_IN, 4))
-        @test l(g1, g1.ndata.x, conv_weight = w) == zeros(Float32, D_OUT, 4)
-        a = rand(Float32, D_IN, 4)
-        g2 = GNNGraph(TEST_GRAPHS[1], ndata = a)
-        @test l(g2, g2.ndata.x, conv_weight = w) == w * a
+        
+        for g in TEST_GRAPHS
+            x = ones(Float32, D_IN, g.num_nodes)
+            @test l(g, x, conv_weight = w) == zeros(Float32, D_OUT, g.num_nodes)
+            x = rand(Float32, D_IN, g.num_nodes)
+            @test l(g, x, conv_weight = w) == w * x
+        end
     end
 end
 
+
+@testitem "GCNConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    l = GCNConv(D_IN => D_OUT)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "ChebConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     k = 2
@@ -63,7 +85,8 @@ end
     @test l.k == k
     for g in TEST_GRAPHS
         g = add_self_loops(g)
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_LOW)
     end
 
     @testset "bias=false" begin
@@ -72,16 +95,30 @@ end
     end
 end
 
+
+@testitem "ChebConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    k = 2
+    l = ChebConv(D_IN => D_OUT, k)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_LOW, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "GraphConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     l = GraphConv(D_IN => D_OUT)
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 
     l = GraphConv(D_IN => D_OUT, tanh, bias = false, aggr = mean)
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 
     @testset "bias=false" begin
@@ -90,15 +127,25 @@ end
     end
 end
 
+
+@testitem "GraphConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    l = GraphConv(D_IN => D_OUT)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
+
 @testitem "GATConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     for heads in (1, 2), concat in (true, false)
         l = GATConv(D_IN => D_OUT; heads, concat, dropout=0)
         for g in TEST_GRAPHS
-            test_layer(l, g, rtol = RTOL_LOW,
-                        exclude_grad_fields = [:negative_slope, :dropout],
-                        outsize = (concat ? heads * D_OUT : D_OUT,
-                                    g.num_nodes))
+            @test size(l(g, g.x)) == (concat ? heads * D_OUT : D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_LOW)
         end
     end
 
@@ -106,9 +153,8 @@ end
         ein = 3
         l = GATConv((D_IN, ein) => D_OUT, add_self_loops = false, dropout=0)
         g = GNNGraph(TEST_GRAPHS[1], edata = rand(Float32, ein, TEST_GRAPHS[1].num_edges))
-        test_layer(l, g, rtol = RTOL_LOW,
-                    exclude_grad_fields = [:negative_slope, :dropout],
-                    outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW)
     end
 
     @testset "num params" begin
@@ -121,15 +167,25 @@ end
     end
 end
 
+@testitem "GATConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    for heads in (1, 2), concat in (true, false)
+        l = GATConv(D_IN => D_OUT; heads, concat, dropout=0)
+        for g in TEST_GRAPHS
+            g.graph isa AbstractSparseMatrix && continue
+            @test size(l(g, g.x)) == (concat ? heads * D_OUT : D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_LOW, test_gpu = true, compare_finite_diff = false)
+        end
+    end
+end
+
 @testitem "GATv2Conv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     for heads in (1, 2), concat in (true, false)
         l = GATv2Conv(D_IN => D_OUT, tanh; heads, concat, dropout=0)
         for g in TEST_GRAPHS
-            test_layer(l, g, rtol = RTOL_LOW, atol=ATOL_LOW,
-                        exclude_grad_fields = [:negative_slope, :dropout],
-                        outsize = (concat ? heads * D_OUT : D_OUT,
-                                    g.num_nodes))
+            @test size(l(g, g.x)) == (concat ? heads * D_OUT : D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_LOW, atol=ATOL_LOW)
         end
     end
 
@@ -137,9 +193,8 @@ end
         ein = 3
         l = GATv2Conv((D_IN, ein) => D_OUT, add_self_loops = false, dropout=0)
         g = GNNGraph(TEST_GRAPHS[1], edata = rand(Float32, ein, TEST_GRAPHS[1].num_edges))
-        test_layer(l, g, rtol = RTOL_LOW, atol=ATOL_LOW,
-                    exclude_grad_fields = [:negative_slope, :dropout],
-                    outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW, atol=ATOL_LOW)
     end
 
     @testset "num params" begin
@@ -152,6 +207,18 @@ end
     end
 end
 
+@testitem "GATv2Conv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    for heads in (1, 2), concat in (true, false)
+        l = GATv2Conv(D_IN => D_OUT, tanh; heads, concat, dropout=0)
+        for g in TEST_GRAPHS
+            g.graph isa AbstractSparseMatrix && continue
+            @test size(l(g, g.x)) == (concat ? heads * D_OUT : D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_LOW, atol=ATOL_LOW, test_gpu = true, compare_finite_diff = false)
+        end
+    end
+end
+
 @testitem "GatedGraphConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     num_layers = 3
@@ -159,30 +226,66 @@ end
     @test size(l.weight) == (D_OUT, D_OUT, num_layers)
 
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 end
 
+
+@testitem "GatedGraphConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    num_layers = 3
+    l = GatedGraphConv(D_OUT, num_layers)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "EdgeConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     l = EdgeConv(Dense(2 * D_IN, D_OUT), aggr = +)
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 end
 
+@testitem "EdgeConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    l = EdgeConv(Dense(2 * D_IN, D_OUT), aggr = +)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "GINConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     nn = Dense(D_IN, D_OUT)
 
-    l = GINConv(nn, 0.01f0, aggr = mean)
+    l = GINConv(nn, 0.01, aggr = mean)
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 
     @test !in(:eps, Flux.trainable(l))
 end
 
+@testitem "GINConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    nn = Dense(D_IN, D_OUT)
+    l = GINConv(nn, 0.01, aggr = mean)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "NNConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     edim = 10
@@ -191,10 +294,24 @@ end
     l = NNConv(D_IN => D_OUT, nn, tanh, bias = true, aggr = +)
     for g in TEST_GRAPHS
         g = GNNGraph(g, edata = rand(Float32, edim, g.num_edges))
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_HIGH)
     end
 end
 
+@testitem "NNConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    edim = 10
+    nn = Dense(edim, D_OUT * D_IN)
+    l = NNConv(D_IN => D_OUT, nn, tanh, bias = true, aggr = +)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        g = GNNGraph(g, edata = rand(Float32, edim, g.num_edges))
+        @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "SAGEConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     l = SAGEConv(D_IN => D_OUT)
@@ -202,18 +319,40 @@ end
 
     l = SAGEConv(D_IN => D_OUT, tanh, bias = false, aggr = +)
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 end
 
+@testitem "SAGEConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    l = SAGEConv(D_IN => D_OUT)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "ResGatedGraphConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     l = ResGatedGraphConv(D_IN => D_OUT, tanh, bias = true)
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 end
 
+@testitem "ResGatedGraphConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    l = ResGatedGraphConv(D_IN => D_OUT, tanh, bias = true)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "CGConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
 
@@ -221,7 +360,8 @@ end
     l = CGConv((D_IN, edim) => D_OUT, tanh, residual = false, bias = true)
     for g in TEST_GRAPHS
         g = GNNGraph(g, edata = rand(Float32, edim, g.num_edges))
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_HIGH)
     end
 
     # no edge features
@@ -231,6 +371,17 @@ end
     @test l1(g1, g1.ndata.x, nothing) == l1(g1).ndata.x
 end
 
+@testitem "CGConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    edim = 10
+    l = CGConv((D_IN, edim) => D_OUT, tanh, residual = false, bias = true)
+    for g in filter(g -> !(g.graph isa AbstractSparseMatrix), TEST_GRAPHS)
+        g = GNNGraph(g, edata = rand(Float32, edim, g.num_edges))  # layer reads g.e
+        @test size(l(g, g.x, g.e)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end
+end
+
 @testitem "AGNNConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     l = AGNNConv(trainable=false, add_self_loops=false)
@@ -245,21 +396,53 @@ end
     @test l.trainable == true 
     Flux.trainable(l) == (; β = [1f0])
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_IN, g.num_nodes))
+        @test size(l(g, g.x)) == (D_IN, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH)
     end
 end
 
+@testitem "AGNNConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    l = AGNNConv(trainable=false, add_self_loops=false)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_IN, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "MEGNetConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     l = MEGNetConv(D_IN => D_OUT, aggr = +)
     for g in TEST_GRAPHS
         g = GNNGraph(g, edata = rand(Float32, D_IN, g.num_edges))
-        test_layer(l, g, rtol = RTOL_LOW,
-                    outtype = :node_edge,
-                    outsize = ((D_OUT, g.num_nodes), (D_OUT, g.num_edges)))
+        y = l(g, g.x, g.e)
+        @test size(y[1]) == (D_OUT, g.num_nodes)
+        @test size(y[2]) == (D_OUT, g.num_edges)
+        function loss(l, g, x, e)
+            y = l(g, x, e)
+            return mean(y[1]) + sum(y[2])
+        end
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW; loss)
     end
 end
 
+@testitem "MEGNetConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    l = MEGNetConv(D_IN => D_OUT, aggr = +)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        # The layer consumes edge features, so attach them as the CPU test does.
+        g = GNNGraph(g, edata = rand(Float32, D_IN, g.num_edges))
+        y = l(g, g.x, g.e)
+        @test size(y[1]) == (D_OUT, g.num_nodes)
+        @test size(y[2]) == (D_OUT, g.num_edges)
+        # Scalar loss over both outputs; `out` avoids capturing the outer `y`.
+        loss(l, g, x, e) = (out = l(g, x, e); mean(out[1]) + sum(out[2]))
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW; loss, test_gpu = true, compare_finite_diff = false)
+    end
+end
+
 @testitem "GMMConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     ein_channel = 10
@@ -267,45 +450,85 @@ end
     l = GMMConv((D_IN, ein_channel) => D_OUT, K = K)
     for g in TEST_GRAPHS
         g = GNNGraph(g, edata = rand(Float32, ein_channel, g.num_edges))
-        test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+        y = l(g, g.x, g.e)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_HIGH)
     end
 end
 
+@testitem "GMMConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    ein_channel = 10
+    K = 5
+    l = GMMConv((D_IN, ein_channel) => D_OUT, K = K)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        y = l(g, g.x, g.e)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
+
 @testitem "SGConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     K = [1, 2, 3] # for different number of hops       
     for k in K
         l = SGConv(D_IN => D_OUT, k, add_self_loops = true)
         for g in TEST_GRAPHS
-            test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+            @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
         end
 
         l = SGConv(D_IN => D_OUT, k, add_self_loops = true)
         for g in TEST_GRAPHS
-            test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+            @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
         end
     end
 end
 
+@testitem "SGConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    k = 2
+    l = SGConv(D_IN => D_OUT, k, add_self_loops = true)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end
+end
+
 @testitem "TAGConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     K = [1, 2, 3]
     for k in K
         l = TAGConv(D_IN => D_OUT, k, add_self_loops = true)
         for g in TEST_GRAPHS
-            test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+            @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
         end
 
         l = TAGConv(D_IN => D_OUT, k, add_self_loops = true)
         for g in TEST_GRAPHS
-            test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+            @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
         end
     end
 end
 
+@testitem "TAGConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    k = 2
+    l = TAGConv(D_IN => D_OUT, k, add_self_loops = true)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end
+end
+
 @testitem "EGNNConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     #TODO test gradient
+    #TODO test gpu
     @testset "EGNNConv $GRAPH_T" for GRAPH_T in GRAPH_TYPES
         hin = 5
         hout = 5
@@ -331,37 +554,61 @@ end
     # batch_norm=false here for tests to pass; true in paper
     for g in TEST_GRAPHS
         g = GNNGraph(g, ndata = rand(Float32, D_IN * heads, g.num_nodes))
-        test_layer(l, g, rtol = RTOL_LOW,
-                    exclude_grad_fields = [:negative_slope],
-                    outsize = (D_IN * heads, g.num_nodes))
+        @test size(l(g, g.x)) == (D_IN * heads, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_LOW)
     end
     # used like in Shi et al., 2021 
     l = TransformerConv((D_IN, ein) => D_IN; heads, gating = true,
                         bias_qkv = true)
     for g in TEST_GRAPHS
         g = GNNGraph(g, edata = rand(Float32, ein, g.num_edges))
-        test_layer(l, g, rtol = RTOL_LOW,
-                    exclude_grad_fields = [:negative_slope],
-                    outsize = (D_IN * heads, g.num_nodes))
+        @test size(l(g, g.x, g.e)) == (D_IN * heads, g.num_nodes)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW)
     end
     # test averaging heads
     l = TransformerConv(D_IN => D_IN; heads, concat = false,
                         bias_root = false,
                         root_weight = false)
     for g in TEST_GRAPHS
-        test_layer(l, g, rtol = RTOL_LOW,
-                    exclude_grad_fields = [:negative_slope],
-                    outsize = (D_IN, g.num_nodes))
+        @test size(l(g, g.x)) == (D_IN, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_LOW)
     end
 end
 
+@testitem "TransformerConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    ein = 2
+    heads = 3
+
+    # used like in Shi et al., 2021 
+    l = TransformerConv((D_IN, ein) => D_IN; heads, gating = true,
+                        bias_qkv = true)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x, g.e)) == (D_IN * heads, g.num_nodes)
+        test_gradients(l, g, g.x, g.e, rtol = RTOL_LOW, test_gpu = true, compare_finite_diff = false)
+    end
+end
+
+
 @testitem "DConv" setup=[TolSnippet, TestModule] begin
     using .TestModule
     K = [1, 2, 3] # for different number of hops       
     for k in K
         l = DConv(D_IN => D_OUT, k)
         for g in TEST_GRAPHS
-            test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes))
+            @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+            test_gradients(l, g, g.x, rtol = RTOL_HIGH)
         end
     end
-end
\ No newline at end of file
+end
+
+@testitem "DConv GPU" setup=[TolSnippet, TestModule] tags=[:gpu] begin
+    using .TestModule
+    l = DConv(D_IN => D_OUT, 2)
+    for g in TEST_GRAPHS
+        g.graph isa AbstractSparseMatrix && continue
+        @test size(l(g, g.x)) == (D_OUT, g.num_nodes)
+        test_gradients(l, g, g.x, rtol = RTOL_HIGH, test_gpu = true, compare_finite_diff = false)
+    end   
+end
diff --git a/GraphNeuralNetworks/test/layers/heteroconv.jl b/GraphNeuralNetworks/test/layers/heteroconv.jl
index 7584378cf..4a6d40ca3 100644
--- a/GraphNeuralNetworks/test/layers/heteroconv.jl
+++ b/GraphNeuralNetworks/test/layers/heteroconv.jl
@@ -17,17 +17,17 @@
         y = model(g, x)
 
         grad, dx = gradient((model, x) -> sum(model(g, x)[1]) + sum(model(g, x)[2].^2), model, x)
-        ngrad, ndx = ngradient((model, x) -> sum(model(g, x)[1]) + sum(model(g, x)[2].^2), model, x)
+        yhat, (ngrad, ndx) = finitediff_withgradient((model, x) -> sum(model(g, x)[1]) + sum(model(g, x)[2].^2), model, x)
 
-        @test grad.layers[1].weight1 ≈ ngrad.layers[1].weight1  rtol=1e-4
-        @test grad.layers[1].weight2 ≈ ngrad.layers[1].weight2  rtol=1e-4
-        @test grad.layers[1].bias ≈ ngrad.layers[1].bias        rtol=1e-4
-        @test grad.layers[2].weight1 ≈ ngrad.layers[2].weight1  rtol=1e-4
-        @test grad.layers[2].weight2 ≈ ngrad.layers[2].weight2  rtol=1e-4
-        @test grad.layers[2].bias ≈ ngrad.layers[2].bias        rtol=1e-4
+        @test grad.layers[1].weight1 ≈ ngrad.layers[1].weight1  rtol=1e-3
+        @test grad.layers[1].weight2 ≈ ngrad.layers[1].weight2  rtol=1e-3
+        @test grad.layers[1].bias ≈ ngrad.layers[1].bias        rtol=1e-3
+        @test grad.layers[2].weight1 ≈ ngrad.layers[2].weight1  rtol=1e-3
+        @test grad.layers[2].weight2 ≈ ngrad.layers[2].weight2  rtol=1e-3
+        @test grad.layers[2].bias ≈ ngrad.layers[2].bias        rtol=1e-3
 
-        @test dx[:A] ≈ ndx[:A]        rtol=1e-4
-        @test dx[:B] ≈ ndx[:B]        rtol=1e-4
+        @test dx[:A] ≈ ndx[:A]        rtol=1e-3
+        @test dx[:B] ≈ ndx[:B]        rtol=1e-3
     end
 
     @testset "Constructor from pairs" begin
diff --git a/GraphNeuralNetworks/test/layers/pool.jl b/GraphNeuralNetworks/test/layers/pool.jl
index 60789f2e5..382a728ea 100644
--- a/GraphNeuralNetworks/test/layers/pool.jl
+++ b/GraphNeuralNetworks/test/layers/pool.jl
@@ -19,7 +19,7 @@
         @test u[:, [1]] ≈ sum(g.ndata.x[:, 1:n], dims = 2)
         @test p(g).gdata.u == u
 
-        test_layer(p, g, rtol = 1e-5, exclude_grad_fields = [:aggr], outtype = :graph)
+        test_gradients(p, g, g.x, rtol = 1e-5)
     end
 end
 
@@ -41,7 +41,8 @@ end
                                     graph_type = GRAPH_T)
                         for i in 1:ng])
 
-        test_layer(p, g, rtol = 1e-5, outtype = :graph, outsize = (chout, ng))
+        @test size(p(g, g.x)) == (chout, ng)
+        test_gradients(p, g, g.x, rtol = 1e-5)
     end
 end
 
@@ -83,7 +84,6 @@ end
         @test size(y) == (2 * n_in, g.num_graphs)
         
         ## TODO the numerical gradient seems to be 3 times smaller than zygote one
-        # test_layer(l, g, rtol = 1e-4, atol=1e-4, outtype = :graph, outsize = (2 * n_in, g.num_graphs), 
-        #         verbose=true, exclude_grad_fields = [:state0, :state])
+        # test_gradients(l, g, g.x, rtol = 1e-4, atol=1e-4)
     end
 end
diff --git a/GraphNeuralNetworks/test/runtests.jl b/GraphNeuralNetworks/test/runtests.jl
index b9e874db1..e3ca04f88 100644
--- a/GraphNeuralNetworks/test/runtests.jl
+++ b/GraphNeuralNetworks/test/runtests.jl
@@ -1,3 +1,25 @@
 using TestItemRunner
 
-@run_package_tests
+## See https://www.julia-vscode.org/docs/stable/userguide/testitems/
+## for how to run the tests within VS Code.
+## See test_module.jl for the test infrastructure.
+
+## Uncomment below to change the default test settings
+# ENV["GNN_TEST_CPU"] = "false"
+# ENV["GNN_TEST_CUDA"] = "true"
+# ENV["GNN_TEST_AMDGPU"] = "true"
+# ENV["GNN_TEST_Metal"] = "true"
+
+if get(ENV, "GNN_TEST_CPU", "true") == "true"
+    @run_package_tests filter = ti -> :gpu ∉ ti.tags
+end
+if get(ENV, "GNN_TEST_CUDA", "false") == "true"
+    @run_package_tests filter = ti -> :gpu ∈ ti.tags
+end
+if get(ENV, "GNN_TEST_AMDGPU", "false") == "true"
+    @run_package_tests filter = ti -> :gpu ∈ ti.tags
+end
+if get(ENV, "GNN_TEST_Metal", "false") == "true"
+    @run_package_tests filter = ti -> :gpu ∈ ti.tags
+end
+
diff --git a/GraphNeuralNetworks/test/test_module.jl b/GraphNeuralNetworks/test/test_module.jl
index 7c5b9a77a..abd54b84c 100644
--- a/GraphNeuralNetworks/test/test_module.jl
+++ b/GraphNeuralNetworks/test/test_module.jl
@@ -5,244 +5,144 @@ using Test
 using Statistics, Random
 using Flux, Functors
 using Graphs
-using ChainRulesTestUtils, FiniteDifferences, Zygote, Adapt, CUDA
-CUDA.allowscalar(false)
+using ChainRulesTestUtils, FiniteDifferences, Zygote, Adapt
+using SparseArrays
+using Pkg
+
+## Uncomment below to change the default test settings
+# ENV["GNN_TEST_CPU"] = "false"
+# ENV["GNN_TEST_CUDA"] = "true"
+# ENV["GNN_TEST_AMDGPU"] = "true"
+# ENV["GNN_TEST_Metal"] = "true"
+
+if get(ENV, "GNN_TEST_CUDA", "false") == "true"
+    Pkg.add(["CUDA", "cuDNN"])
+    using CUDA
+    CUDA.allowscalar(false)
+end
+if get(ENV, "GNN_TEST_AMDGPU", "false") == "true"
+    Pkg.add("AMDGPU")
+    using AMDGPU
+    AMDGPU.allowscalar(false)
+end
+if get(ENV, "GNN_TEST_Metal", "false") == "true"
+    Pkg.add("Metal")
+    using Metal
+    Metal.allowscalar(false)
+end
+
+# from Base and the standard libraries
+export mean, randn, SparseArrays, AbstractSparseMatrix
 
 # from other packages
 export Flux, gradient, Dense, Chain, relu, random_regular_graph, erdos_renyi,
        BatchNorm, LayerNorm, Dropout, Parallel
-export mean, randn
+
 # from this module
-export D_IN, D_OUT, test_layer, ngradient, GRAPH_TYPES, TEST_GRAPHS
+export D_IN, D_OUT, GRAPH_TYPES, TEST_GRAPHS,
+       test_gradients, finitediff_withgradient, 
+       check_equal_leaves
+
 
 const D_IN = 3
 const D_OUT = 5
 
-function ngradient(f, x...)
-    fdm = central_fdm(5, 1)
-    return FiniteDifferences.grad(fdm, f, x...)
+function finitediff_withgradient(f, x...)
+    y = f(x...)
+    # Cap the step range (`max_range`) so perturbed inputs do not cause domain errors
+    fdm = FiniteDifferences.central_fdm(5, 1, max_range=1e-2)
+    return y, FiniteDifferences.grad(fdm, f, x...)
 end
 
-const rule_config = Zygote.ZygoteRuleConfig()
-
-# Using this until https://github.com/JuliaDiff/FiniteDifferences.jl/issues/188 is fixed
-function FiniteDifferences.to_vec(x::Integer)
-    Integer_from_vec(v) = x
-    return Int[x], Integer_from_vec
-end
-
-# Test that forward pass on cpu and gpu are the same. 
-# Tests also gradient on cpu and gpu comparing with
-# finite difference methods.
-# Test gradients with respects to layer weights and to input. 
-# If `g` has edge features, it is assumed that the layer can 
-# use them in the forward pass as `l(g, x, e)`.
-# Test also gradient with respect to `e`. 
-function test_layer(l, g::GNNGraph; atol = 1e-5, rtol = 1e-5,
-                    exclude_grad_fields = [],
-                    verbose = false,
-                    test_gpu = false,
-                    outsize = nothing,
-                    outtype = :node)
-
-    # TODO these give errors, probably some bugs in ChainRulesTestUtils
-    # test_rrule(rule_config, x -> l(g, x), x; rrule_f=rrule_via_ad, check_inferred=false)
-    # test_rrule(rule_config, l -> l(g, x), l; rrule_f=rrule_via_ad, check_inferred=false)
-
-    isnothing(node_features(g)) && error("Plese add node data to the input graph")
-    fdm = central_fdm(5, 1)
-
-    x = node_features(g)
-    e = edge_features(g)
-    use_edge_feat = !isnothing(e)
-
-    x64, e64, l64, g64 = to64.([x, e, l, g]) # needed for accurate FiniteDifferences' grad
-    xgpu, egpu, lgpu, ggpu = gpu.([x, e, l, g])
-
-    f(l, g::GNNGraph) = l(g)
-    f(l, g::GNNGraph, x, e) = use_edge_feat ? l(g, x, e) : l(g, x)
-
-    loss(l, g::GNNGraph) =
-        if outtype == :node
-            sum(node_features(f(l, g)))
-        elseif outtype == :edge
-            sum(edge_features(f(l, g)))
-        elseif outtype == :graph
-            sum(graph_features(f(l, g)))
-        elseif outtype == :node_edge
-            gnew = f(l, g)
-            sum(node_features(gnew)) + sum(edge_features(gnew))
-        end
-
-    function loss(l, g::GNNGraph, x, e)
-        y = f(l, g, x, e)
-        if outtype == :node_edge
-            return sum(y[1]) + sum(y[2])
-        else
-            return sum(y)
-        end
-    end
-
-    # TEST OUTPUT
-    y = f(l, g, x, e)
-    if outtype == :node_edge
-        @assert y isa Tuple
-        @test eltype(y[1]) == eltype(x)
-        @test eltype(y[2]) == eltype(e)
-        @test all(isfinite, y[1])
-        @test all(isfinite, y[2])
-        if !isnothing(outsize)
-            @test size(y[1]) == outsize[1]
-            @test size(y[2]) == outsize[2]
-        end
-    else
-        @test eltype(y) == eltype(x)
-        @test all(isfinite, y)
-        if !isnothing(outsize)
-            @test size(y) == outsize
+function check_equal_leaves(a, b; rtol=1e-4, atol=1e-4)
+    fmapstructure_with_path(a, b) do kp, x, y
+        if x isa AbstractArray
+            # @show kp
+            @test x ≈ y rtol=rtol atol=atol
+        # elseif x isa Number
+        #     @show kp
+        #     @test x ≈ y rtol=rtol atol=atol
         end
     end
+end
 
-    # test same output on different graph formats
-    gcoo = GNNGraph(g, graph_type = :coo)
-    ycoo = f(l, gcoo, x, e)
-    if outtype == :node_edge
-        @test ycoo[1] ≈ y[1]
-        @test ycoo[2] ≈ y[2]
-    else
-        @test ycoo ≈ y
+function test_gradients(
+            f,
+            graph::GNNGraph, 
+            xs...;
+            rtol=1e-5, atol=1e-5,
+            test_gpu = false,
+            test_grad_f = true,
+            test_grad_x = true,
+            compare_finite_diff = true,
+            loss = (f, g, xs...) -> mean(f(g, xs...)),
+            )
+
+    if !test_gpu && !compare_finite_diff
+        error("You should either compare finite diff vs CPU AD \
+               or CPU AD vs GPU AD.")
     end
 
-    g′ = f(l, g)
-    if outtype == :node
-        @test g′.ndata.x ≈ y
-    elseif outtype == :edge
-        @test g′.edata.e ≈ y
-    elseif outtype == :graph
-        @test g′.gdata.u ≈ y
-    elseif outtype == :node_edge
-        @test g′.ndata.x ≈ y[1]
-        @test g′.edata.e ≈ y[2]
-    else
-        @error "wrong outtype $outtype"
-    end
+    ## Let's make sure first that the forward pass works.
+    l = loss(f, graph, xs...)
+    @test l isa Number
     if test_gpu
-        ygpu = f(lgpu, ggpu, xgpu, egpu)
-        if outtype == :node_edge
-            @test ygpu[1] isa CuArray
-            @test eltype(ygpu[1]) == eltype(xgpu)
-            @test Array(ygpu[1]) ≈ y[1]
-            @test ygpu[2] isa CuArray
-            @test eltype(ygpu[2]) == eltype(xgpu)
-            @test Array(ygpu[2]) ≈ y[2]
-        else
-            @test ygpu isa CuArray
-            @test eltype(ygpu) == eltype(xgpu)
-            @test Array(ygpu) ≈ y
-        end
+        gpu_dev = gpu_device(force=true)
+        cpu_dev = cpu_device()
+        graph_gpu = graph |> gpu_dev
+        xs_gpu = xs |> gpu_dev
+        f_gpu = f |> gpu_dev
+        l_gpu = loss(f_gpu, graph_gpu, xs_gpu...)
+        @test l_gpu isa Number
     end
 
-    # TEST x INPUT GRADIENT
-    x̄ = gradient(x -> loss(l, g, x, e), x)[1]
-    x̄_fd = FiniteDifferences.grad(fdm, x64 -> loss(l64, g64, x64, e64), x64)[1]
-    @test eltype(x̄) == eltype(x)
-    @test x̄≈x̄_fd atol=atol rtol=rtol
-
-    if test_gpu
-        x̄gpu = gradient(xgpu -> loss(lgpu, ggpu, xgpu, egpu), xgpu)[1]
-        @test x̄gpu isa CuArray
-        @test eltype(x̄gpu) == eltype(x)
-        @test Array(x̄gpu)≈x̄ atol=atol rtol=rtol
-    end
-
-    # TEST e INPUT GRADIENT
-    if e !== nothing
-        verbose && println("Test e gradient cpu")
-        ē = gradient(e -> loss(l, g, x, e), e)[1]
-        ē_fd = FiniteDifferences.grad(fdm, e64 -> loss(l64, g64, x64, e64), e64)[1]
-        @test eltype(ē) == eltype(e)
-        @test ē≈ē_fd atol=atol rtol=rtol
+    if test_grad_x
+        # Zygote gradient with respect to input.
+        y, g = Zygote.withgradient((xs...) -> loss(f, graph, xs...), xs...)
+        
+        if compare_finite_diff
+            # Cast to Float64 to avoid precision issues.
+            f64 = f |> Flux.f64
+            xs64 = xs .|> Flux.f64
+            y_fd, g_fd = finitediff_withgradient((xs...) -> loss(f64, graph, xs...), xs64...)
+            @test y ≈ y_fd rtol=rtol atol=atol
+            check_equal_leaves(g, g_fd; rtol, atol)
+        end
 
         if test_gpu
-            verbose && println("Test e gradient gpu")
-            ēgpu = gradient(egpu -> loss(lgpu, ggpu, xgpu, egpu), egpu)[1]
-            @test ēgpu isa CuArray
-            @test eltype(ēgpu) == eltype(ē)
-            @test Array(ēgpu)≈ē atol=atol rtol=rtol
+            # Zygote gradient with respect to input on GPU.
+            y_gpu, g_gpu = Zygote.withgradient((xs...) -> loss(f_gpu, graph_gpu, xs...), xs_gpu...)
+            @test get_device(g_gpu) == get_device(xs_gpu)
+            @test y_gpu ≈ y rtol=rtol atol=atol
+            check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
         end
     end
 
-    # TEST LAYER GRADIENT - l(g, x, e) 
-    l̄ = gradient(l -> loss(l, g, x, e), l)[1]
-    l̄_fd = FiniteDifferences.grad(fdm, l64 -> loss(l64, g64, x64, e64), l64)[1]
-    test_approx_structs(l, l̄, l̄_fd; atol, rtol, exclude_grad_fields, verbose)
-
-    if test_gpu
-        l̄gpu = gradient(lgpu -> loss(lgpu, ggpu, xgpu, egpu), lgpu)[1]
-        test_approx_structs(lgpu, l̄gpu, l̄; atol, rtol, exclude_grad_fields, verbose)
-    end
-
-    # TEST LAYER GRADIENT - l(g)
-    l̄ = gradient(l -> loss(l, g), l)[1]
-    test_approx_structs(l, l̄, l̄_fd; atol, rtol, exclude_grad_fields, verbose)
-
-    return true
-end
-
-function test_approx_structs(l, l̄, l̄fd; atol = 1e-5, rtol = 1e-5,
-                             exclude_grad_fields = [],
-                             verbose = false)
-    l̄ = l̄ isa Base.RefValue ? l̄[] : l̄           # Zygote wraps gradient of mutables in RefValue 
-    l̄fd = l̄fd isa Base.RefValue ? l̄fd[] : l̄fd           # Zygote wraps gradient of mutables in RefValue 
+    if test_grad_f
+        # Zygote gradient with respect to f.
+        y, g = Zygote.withgradient(f -> loss(f, graph, xs...), f)
+
+        if compare_finite_diff
+            # Cast to Float64 to avoid precision issues.
+            f64 = f |> Flux.f64
+            ps, re = Flux.destructure(f64)
+            y_fd, g_fd = finitediff_withgradient(ps -> loss(re(ps),graph, xs...), ps)
+            g_fd = (re(g_fd[1]),)
+            @test y ≈ y_fd rtol=rtol atol=atol
+            check_equal_leaves(g, g_fd; rtol, atol)
+        end
 
-    for f in fieldnames(typeof(l))
-        f ∈ exclude_grad_fields && continue
-        verbose && println("Test gradient of field $f...")
-        x, g, gfd = getfield(l, f), getfield(l̄, f), getfield(l̄fd, f)
-        test_approx_structs(x, g, gfd; atol, rtol, exclude_grad_fields, verbose)
-        verbose && println("... field $f done!")
+        if test_gpu
+            # Zygote gradient with respect to f on GPU.
+            y_gpu, g_gpu = Zygote.withgradient(f -> loss(f,graph_gpu, xs_gpu...), f_gpu)
+            # @test get_device(g_gpu) == get_device(xs_gpu)
+            @test y_gpu ≈ y rtol=rtol atol=atol
+            check_equal_leaves(g_gpu |> cpu_dev, g; rtol, atol)
+        end
     end
     return true
 end
 
-function test_approx_structs(x, g::Nothing, gfd; atol, rtol, kws...)
-    # finite diff gradients has to be zero if present
-    @test !(gfd isa AbstractArray) || isapprox(gfd, fill!(similar(gfd), 0); atol, rtol)
-end
-
-function test_approx_structs(x::Union{AbstractArray, Number},
-                             g::Union{AbstractArray, Number}, gfd; atol, rtol, kws...)
-    @test eltype(g) == eltype(x)
-    if x isa CuArray
-        @test g isa CuArray
-        g = Array(g)
-    end
-    @test g≈gfd atol=atol rtol=rtol
-end
-
-"""
-    to32(m)
-
-Convert the `eltype` of model's float parameters to `Float32`.
-Preserves integer arrays.
-"""
-to32(m) = _paramtype(Float32, m)
-
-"""
-    to64(m)
-
-Convert the `eltype` of model's float parameters to `Float64`.
-Preserves integer arrays.
-"""
-to64(m) = _paramtype(Float64, m)
-
-struct GNNEltypeAdaptor{T} end
-
-Adapt.adapt_storage(::GNNEltypeAdaptor{T}, x::AbstractArray{<:AbstractFloat}) where T = convert(AbstractArray{T}, x)
-Adapt.adapt_storage(::GNNEltypeAdaptor{T}, x::AbstractArray{<:Integer}) where T = x
-Adapt.adapt_storage(::GNNEltypeAdaptor{T}, x::AbstractArray{<:Number}) where T = convert(AbstractArray{T}, x)
-
-_paramtype(::Type{T}, m) where T = fmap(adapt(GNNEltypeAdaptor{T}()), m)
 
 function generate_test_graphs(graph_type)
     adj1 = [0 1 0 1