diff --git a/GraphNeuralNetworks/Project.toml b/GraphNeuralNetworks/Project.toml index eb9c44caf..29aaf3acd 100644 --- a/GraphNeuralNetworks/Project.toml +++ b/GraphNeuralNetworks/Project.toml @@ -21,17 +21,14 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -# [extensions] -# GraphNeuralNetworksCUDAExt = "CUDA" - [compat] CUDA = "4, 5" ChainRulesCore = "1" Flux = "0.14" Functors = "0.4.1" -Graphs = "1.12" GNNGraphs = "1.0" GNNlib = "0.2" +Graphs = "1.12" LinearAlgebra = "1" MLUtils = "0.4" MacroTools = "0.5" @@ -39,6 +36,7 @@ NNlib = "0.9" Random = "1" Reexport = "1" Statistics = "1" +TestItemRunner = "1.0.5" cuDNN = "1" julia = "1.10" @@ -53,8 +51,10 @@ InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [targets] -test = ["Test", "MLDatasets", "Adapt", "DataFrames", "InlineStrings", "SparseArrays", "Graphs", "Zygote", "FiniteDifferences", "ChainRulesTestUtils", "CUDA", "cuDNN"] +test = ["Test", "TestItemRunner", "MLDatasets", "Adapt", "DataFrames", "InlineStrings", + "SparseArrays", "Graphs", "Zygote", "FiniteDifferences", "ChainRulesTestUtils", "CUDA", "cuDNN"] diff --git a/GraphNeuralNetworks/test/examples/node_classification_cora.jl b/GraphNeuralNetworks/test/examples/node_classification_cora.jl index 33fc4c145..cfcdc2d1c 100644 --- a/GraphNeuralNetworks/test/examples/node_classification_cora.jl +++ b/GraphNeuralNetworks/test/examples/node_classification_cora.jl @@ -1,107 +1,111 @@ -using Flux -using Flux: onecold, onehotbatch -using Flux.Losses: logitcrossentropy -using GraphNeuralNetworks -using MLDatasets: Cora -using Statistics, Random -using CUDA -CUDA.allowscalar(false) +@testitem "Training Example" setup=[TestModule] begin + using .TestModule + using Flux + using Flux: onecold, onehotbatch + using Flux.Losses: logitcrossentropy + using GraphNeuralNetworks + using MLDatasets: Cora + using Statistics, Random + using CUDA + CUDA.allowscalar(false) -function eval_loss_accuracy(X, y, ids, model, g) - ŷ = model(g, X) - l = logitcrossentropy(ŷ[:, ids], y[:, ids]) - acc = mean(onecold(ŷ[:, ids]) .== onecold(y[:, ids])) - return (loss = round(l, digits = 4), acc = round(acc * 100, digits = 2)) -end + function eval_loss_accuracy(X, y, ids, model, g) + ŷ = model(g, X) + l = logitcrossentropy(ŷ[:, ids], y[:, ids]) + acc = mean(onecold(ŷ[:, ids]) .== onecold(y[:, ids])) + return (loss = round(l, digits = 4), acc = round(acc * 100, digits = 2)) + end -# arguments for the `train` function -Base.@kwdef mutable struct Args - η = 5.0f-3 # learning rate - epochs = 10 # number of epochs - seed = 17 # set seed > 0 for reproducibility - usecuda = false # if true use cuda (if available) - nhidden = 64 # dimension of hidden features -end + # arguments for the `train` function + Base.@kwdef mutable struct Args + η = 5.0f-3 # learning rate + epochs = 10 # number of epochs + seed = 17 # set seed > 0 for reproducibility + usecuda = false # if true use cuda (if available) + nhidden = 64 # dimension of hidden features + end -function train(Layer; verbose = false, kws...) - args = Args(; kws...) - args.seed > 0 && Random.seed!(args.seed) + function train(Layer; verbose = false, kws...) + args = Args(; kws...) 
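+        # keyword arguments override the `Args` defaults; a seed > 0 makes the run reproducible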
+ args.seed > 0 && Random.seed!(args.seed) - if args.usecuda && CUDA.functional() - device = Flux.gpu - args.seed > 0 && CUDA.seed!(args.seed) - else - device = Flux.cpu - end + if args.usecuda && CUDA.functional() + device = Flux.gpu + args.seed > 0 && CUDA.seed!(args.seed) + else + device = Flux.cpu + end - # LOAD DATA - dataset = Cora() - classes = dataset.metadata["classes"] - g = mldataset2gnngraph(dataset) |> device - X = g.ndata.features - y = onehotbatch(g.ndata.targets |> cpu, classes) |> device # remove when https://github.com/FluxML/Flux.jl/pull/1959 tagged - train_mask = g.ndata.train_mask - test_mask = g.ndata.test_mask - ytrain = y[:, train_mask] + # LOAD DATA + dataset = Cora() + classes = dataset.metadata["classes"] + g = mldataset2gnngraph(dataset) |> device + X = g.ndata.features + y = onehotbatch(g.ndata.targets |> cpu, classes) |> device # remove when https://github.com/FluxML/Flux.jl/pull/1959 tagged + train_mask = g.ndata.train_mask + test_mask = g.ndata.test_mask + ytrain = y[:, train_mask] - nin, nhidden, nout = size(X, 1), args.nhidden, length(classes) + nin, nhidden, nout = size(X, 1), args.nhidden, length(classes) - ## DEFINE MODEL - model = GNNChain(Layer(nin, nhidden), - # Dropout(0.5), - Layer(nhidden, nhidden), - Dense(nhidden, nout)) |> device + ## DEFINE MODEL + model = GNNChain(Layer(nin, nhidden), + # Dropout(0.5), + Layer(nhidden, nhidden), + Dense(nhidden, nout)) |> device - opt = Flux.setup(Adam(args.η), model) + opt = Flux.setup(Adam(args.η), model) - ## TRAINING - function report(epoch) - train = eval_loss_accuracy(X, y, train_mask, model, g) - test = eval_loss_accuracy(X, y, test_mask, model, g) - println("Epoch: $epoch Train: $(train) Test: $(test)") - end + ## TRAINING + function report(epoch) + train = eval_loss_accuracy(X, y, train_mask, model, g) + test = eval_loss_accuracy(X, y, test_mask, model, g) + println("Epoch: $epoch Train: $(train) Test: $(test)") + end - verbose && report(0) - @time for epoch in 1:(args.epochs) - grad = Flux.gradient(model) do model - ŷ = model(g, X) - logitcrossentropy(ŷ[:, train_mask], ytrain) + verbose && report(0) + @time for epoch in 1:(args.epochs) + grad = Flux.gradient(model) do model + ŷ = model(g, X) + logitcrossentropy(ŷ[:, train_mask], ytrain) + end + Flux.update!(opt, model, grad[1]) + verbose && report(epoch) end - Flux.update!(opt, model, grad[1]) - verbose && report(epoch) - end - train_res = eval_loss_accuracy(X, y, train_mask, model, g) - test_res = eval_loss_accuracy(X, y, test_mask, model, g) - return train_res, test_res -end + train_res = eval_loss_accuracy(X, y, train_mask, model, g) + test_res = eval_loss_accuracy(X, y, test_mask, model, g) + return train_res, test_res + end -function train_many(; usecuda = false) - for (layer, Layer) in [ - ("GCNConv", (nin, nout) -> GCNConv(nin => nout, relu)), - ("ResGatedGraphConv", (nin, nout) -> ResGatedGraphConv(nin => nout, relu)), - ("GraphConv", (nin, nout) -> GraphConv(nin => nout, relu, aggr = mean)), - ("SAGEConv", (nin, nout) -> SAGEConv(nin => nout, relu)), - ("GATConv", (nin, nout) -> GATConv(nin => nout, relu)), - ("GINConv", (nin, nout) -> GINConv(Dense(nin, nout, relu), 0.01, aggr = mean)), - ("TransformerConv", - (nin, nout) -> TransformerConv(nin => nout, concat = false, - add_self_loops = true, root_weight = false, - heads = 2)), - ## ("ChebConv", (nin, nout) -> ChebConv(nin => nout, 2)), # not working on gpu - ## ("NNConv", (nin, nout) -> NNConv(nin => nout)), # needs edge features - ## ("GatedGraphConv", (nin, nout) -> 
GatedGraphConv(nout, 2)), # needs nin = nout - ## ("EdgeConv",(nin, nout) -> EdgeConv(Dense(2nin, nout, relu))), # Fits the training set but does not generalize well - ] - @show layer - @time train_res, test_res = train(Layer; usecuda, verbose = false) - # @show train_res, test_res - @test train_res.acc > 94 - @test test_res.acc > 69 + function train_many(; usecuda = false) + for (layer, Layer) in [ + ("GCNConv", (nin, nout) -> GCNConv(nin => nout, relu)), + ("ResGatedGraphConv", (nin, nout) -> ResGatedGraphConv(nin => nout, relu)), + ("GraphConv", (nin, nout) -> GraphConv(nin => nout, relu, aggr = mean)), + ("SAGEConv", (nin, nout) -> SAGEConv(nin => nout, relu)), + ("GATConv", (nin, nout) -> GATConv(nin => nout, relu)), + ("GINConv", (nin, nout) -> GINConv(Dense(nin, nout, relu), 0.01, aggr = mean)), + ("TransformerConv", + (nin, nout) -> TransformerConv(nin => nout, concat = false, + add_self_loops = true, root_weight = false, + heads = 2)), + ## ("ChebConv", (nin, nout) -> ChebConv(nin => nout, 2)), # not working on gpu + ## ("NNConv", (nin, nout) -> NNConv(nin => nout)), # needs edge features + ## ("GatedGraphConv", (nin, nout) -> GatedGraphConv(nout, 2)), # needs nin = nout + ## ("EdgeConv",(nin, nout) -> EdgeConv(Dense(2nin, nout, relu))), # Fits the training set but does not generalize well + ] + @show layer + @time train_res, test_res = train(Layer; usecuda, verbose = false) + # @show train_res, test_res + @test train_res.acc > 94 + @test test_res.acc > 69 + end end -end -train_many(usecuda = false) -if TEST_GPU - train_many(usecuda = true) + train_many(usecuda = false) + # #TODO + # if TEST_GPU + # train_many(usecuda = true) + # end end diff --git a/GraphNeuralNetworks/test/layers/basic.jl b/GraphNeuralNetworks/test/layers/basic.jl index 2428865ae..caad9458a 100644 --- a/GraphNeuralNetworks/test/layers/basic.jl +++ b/GraphNeuralNetworks/test/layers/basic.jl @@ -1,57 +1,60 @@ -@testset "GNNChain" begin - n, din, d, dout = 10, 3, 4, 2 - deg = 4 - - g = GNNGraph(random_regular_graph(n, deg), - graph_type = GRAPH_T, - ndata = randn(Float32, din, n)) - x = g.ndata.x - - gnn = GNNChain(GCNConv(din => d), - LayerNorm(d), - x -> tanh.(x), - GraphConv(d => d, tanh), - Dropout(0.5), - Dense(d, dout)) - - testmode!(gnn) - - test_layer(gnn, g, rtol = 1e-5, exclude_grad_fields = [:μ, :σ²]) - - @testset "constructor with names" begin - m = GNNChain(GCNConv(din => d), +@testitem "GNNChain" setup=[TestModule] begin + using .TestModule + @testset "GNNChain $GRAPH_T" for GRAPH_T in GRAPH_TYPES + n, din, d, dout = 10, 3, 4, 2 + deg = 4 + + g = GNNGraph(random_regular_graph(n, deg), + graph_type = GRAPH_T, + ndata = randn(Float32, din, n)) + x = g.ndata.x + + gnn = GNNChain(GCNConv(din => d), LayerNorm(d), x -> tanh.(x), + GraphConv(d => d, tanh), + Dropout(0.5), Dense(d, dout)) - m2 = GNNChain(enc = m, - dec = DotDecoder()) + Flux.testmode!(gnn) - @test m2[:enc] === m - @test m2(g, x) == m2[:dec](g, m2[:enc](g, x)) - end + test_layer(gnn, g, rtol = 1e-5, exclude_grad_fields = [:μ, :σ²]) - @testset "constructor with vector" begin - m = GNNChain(GCNConv(din => d), - LayerNorm(d), - x -> tanh.(x), - Dense(d, dout)) - m2 = GNNChain([m.layers...]) - @test m2(g, x) == m(g, x) - end + @testset "constructor with names" begin + m = GNNChain(GCNConv(din => d), + LayerNorm(d), + x -> tanh.(x), + Dense(d, dout)) - @testset "Parallel" begin - AddResidual(l) = Parallel(+, identity, l) + m2 = GNNChain(enc = m, + dec = DotDecoder()) - gnn = GNNChain(GraphConv(din => d, tanh), - LayerNorm(d), - 
AddResidual(GraphConv(d => d, tanh)), - BatchNorm(d), - Dense(d, dout)) + @test m2[:enc] === m + @test m2(g, x) == m2[:dec](g, m2[:enc](g, x)) + end + + @testset "constructor with vector" begin + m = GNNChain(GCNConv(din => d), + LayerNorm(d), + x -> tanh.(x), + Dense(d, dout)) + m2 = GNNChain([m.layers...]) + @test m2(g, x) == m(g, x) + end + + @testset "Parallel" begin + AddResidual(l) = Parallel(+, identity, l) + + gnn = GNNChain(GraphConv(din => d, tanh), + LayerNorm(d), + AddResidual(GraphConv(d => d, tanh)), + BatchNorm(d), + Dense(d, dout)) - trainmode!(gnn) + Flux.trainmode!(gnn) - test_layer(gnn, g, rtol = 1e-4, atol=1e-4, exclude_grad_fields = [:μ, :σ²]) + test_layer(gnn, g, rtol = 1e-4, atol=1e-4, exclude_grad_fields = [:μ, :σ²]) + end end @testset "Only graph input" begin @@ -67,27 +70,29 @@ end end -@testset "WithGraph" begin +@testitem "WithGraph" setup=[TestModule] begin + using .TestModule x = rand(Float32, 2, 3) g = GNNGraph([1, 2, 3], [2, 3, 1], ndata = x) model = SAGEConv(2 => 3) wg = WithGraph(model, g) # No need to feed the graph to `wg` @test wg(x) == model(g, x) - @test Flux.params(wg) == Flux.params(model) + @test Flux.trainables(wg) == Flux.trainables(model) g2 = GNNGraph([1, 1, 2, 3], [2, 4, 1, 1]) x2 = rand(Float32, 2, 4) # WithGraph will ignore the internal graph if fed with a new one. @test wg(g2, x2) == model(g2, x2) wg = WithGraph(model, g, traingraph = false) - @test length(Flux.params(wg)) == length(Flux.params(model)) + @test length(Flux.trainables(wg)) == length(Flux.trainables(model)) wg = WithGraph(model, g, traingraph = true) - @test length(Flux.params(wg)) == length(Flux.params(model)) + length(Flux.params(g)) + @test length(Flux.trainables(wg)) == length(Flux.trainables(model)) + length(Flux.trainables(g)) end -@testset "Flux restructure" begin +@testitem "Flux.restructure" setup=[TestModule] begin + using .TestModule chain = GNNChain(GraphConv(2 => 2)) params, restructure = Flux.destructure(chain) @test restructure(params) isa GNNChain diff --git a/GraphNeuralNetworks/test/layers/conv.jl b/GraphNeuralNetworks/test/layers/conv.jl index b96baa880..238315a4f 100644 --- a/GraphNeuralNetworks/test/layers/conv.jl +++ b/GraphNeuralNetworks/test/layers/conv.jl @@ -1,51 +1,29 @@ -RTOL_LOW = 1e-2 -RTOL_HIGH = 1e-5 -ATOL_LOW = 1e-3 - -in_channel = 3 -out_channel = 5 -N = 4 -T = Float32 - -adj1 = [0 1 0 1 - 1 0 1 0 - 0 1 0 1 - 1 0 1 0] - -g1 = GNNGraph(adj1, - ndata = rand(T, in_channel, N), - graph_type = GRAPH_T) - -adj_single_vertex = [0 0 0 1 - 0 0 0 0 - 0 0 0 1 - 1 0 1 0] - -g_single_vertex = GNNGraph(adj_single_vertex, - ndata = rand(T, in_channel, N), - graph_type = GRAPH_T) - -test_graphs = [g1, g_single_vertex] +@testsnippet TolSnippet begin + RTOL_LOW = 1e-2 + RTOL_HIGH = 1e-5 + ATOL_LOW = 1e-3 +end -@testset "GCNConv" begin - l = GCNConv(in_channel => out_channel) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) +@testitem "GCNConv" setup=[TolSnippet, TestModule] begin + using .TestModule + l = GCNConv(D_IN => D_OUT) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end - l = GCNConv(in_channel => out_channel, tanh, bias = false) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = GCNConv(D_IN => D_OUT, tanh, bias = false) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end - l = GCNConv(in_channel => out_channel, add_self_loops = false) - test_layer(l, g1, rtol = 
RTOL_HIGH, outsize = (out_channel, g1.num_nodes)) + l = GCNConv(D_IN => D_OUT, add_self_loops = false) + test_layer(l, TEST_GRAPHS[1], rtol = RTOL_HIGH, outsize = (D_OUT, TEST_GRAPHS[1].num_nodes)) - @testset "edge weights & custom normalization" begin + @testset "edge weights & custom normalization $GRAPH_T" for GRAPH_T in GRAPH_TYPES s = [2, 3, 1, 3, 1, 2] t = [1, 1, 2, 2, 3, 3] - w = T[1, 2, 3, 4, 5, 6] - g = GNNGraph((s, t, w), ndata = ones(T, 1, 3), graph_type = GRAPH_T) + w = Float32[1, 2, 3, 4, 5, 6] + g = GNNGraph((s, t, w), ndata = ones(Float32, 1, 3), graph_type = GRAPH_T) x = g.ndata.x custom_norm_fn(d) = 1 ./ sqrt.(d) l = GCNConv(1 => 1, add_self_loops = false, use_edge_weight = true) @@ -57,191 +35,204 @@ test_graphs = [g1, g_single_vertex] @test y ≈ l(g, x, w; norm_fn = custom_norm_fn) # checking without custom # test gradient with respect to edge weights - w = rand(T, 6) - x = rand(T, 1, 3) + w = rand(Float32, 6) + x = rand(Float32, 1, 3) g = GNNGraph((s, t, w), ndata = x, graph_type = GRAPH_T, edata = w) l = GCNConv(1 => 1, add_self_loops = false, use_edge_weight = true) - @test gradient(w -> sum(l(g, x, w)), w)[1] isa AbstractVector{T} # redundant test but more explicit - test_layer(l, g, rtol = RTOL_HIGH, outsize = (1, g.num_nodes), test_gpu = false) + @test gradient(w -> sum(l(g, x, w)), w)[1] isa AbstractVector{Float32} # redundant test but more explicit + test_layer(l, g, rtol = RTOL_HIGH, outsize = (1, g.num_nodes)) end @testset "conv_weight" begin - l = GraphNeuralNetworks.GCNConv(in_channel => out_channel) - w = zeros(T, out_channel, in_channel) - g1 = GNNGraph(adj1, ndata = ones(T, in_channel, N)) - @test l(g1, g1.ndata.x, conv_weight = w) == zeros(T, out_channel, N) - a = rand(T, in_channel, N) - g2 = GNNGraph(adj1, ndata = a) + l = GraphNeuralNetworks.GCNConv(D_IN => D_OUT) + w = zeros(Float32, D_OUT, D_IN) + g1 = GNNGraph(TEST_GRAPHS[1], ndata = ones(Float32, D_IN, 4)) + @test l(g1, g1.ndata.x, conv_weight = w) == zeros(Float32, D_OUT, 4) + a = rand(Float32, D_IN, 4) + g2 = GNNGraph(TEST_GRAPHS[1], ndata = a) @test l(g2, g2.ndata.x, conv_weight = w) == w * a end end -@testset "ChebConv" begin +@testitem "ChebConv" setup=[TolSnippet, TestModule] begin + using .TestModule k = 2 - l = ChebConv(in_channel => out_channel, k) - @test size(l.weight) == (out_channel, in_channel, k) - @test size(l.bias) == (out_channel,) + l = ChebConv(D_IN => D_OUT, k) + @test size(l.weight) == (D_OUT, D_IN, k) + @test size(l.bias) == (D_OUT,) @test l.k == k - for g in test_graphs + for g in TEST_GRAPHS g = add_self_loops(g) - test_layer(l, g, rtol = RTOL_HIGH, test_gpu = TEST_GPU, - outsize = (out_channel, g.num_nodes)) + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end @testset "bias=false" begin - @test length(Flux.params(ChebConv(2 => 3, 3))) == 2 - @test length(Flux.params(ChebConv(2 => 3, 3, bias = false))) == 1 + @test length(Flux.trainables(ChebConv(2 => 3, 3))) == 2 + @test length(Flux.trainables(ChebConv(2 => 3, 3, bias = false))) == 1 end end -@testset "GraphConv" begin - l = GraphConv(in_channel => out_channel) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) +@testitem "GraphConv" setup=[TolSnippet, TestModule] begin + using .TestModule + l = GraphConv(D_IN => D_OUT) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end - l = GraphConv(in_channel => out_channel, tanh, bias = false, aggr = mean) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = 
(out_channel, g.num_nodes)) + l = GraphConv(D_IN => D_OUT, tanh, bias = false, aggr = mean) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end @testset "bias=false" begin - @test length(Flux.params(GraphConv(2 => 3))) == 3 - @test length(Flux.params(GraphConv(2 => 3, bias = false))) == 2 + @test length(Flux.trainables(GraphConv(2 => 3))) == 3 + @test length(Flux.trainables(GraphConv(2 => 3, bias = false))) == 2 end end -@testset "GATConv" begin +@testitem "GATConv" setup=[TolSnippet, TestModule] begin + using .TestModule for heads in (1, 2), concat in (true, false) - l = GATConv(in_channel => out_channel; heads, concat, dropout=0) - for g in test_graphs + l = GATConv(D_IN => D_OUT; heads, concat, dropout=0) + for g in TEST_GRAPHS test_layer(l, g, rtol = RTOL_LOW, exclude_grad_fields = [:negative_slope, :dropout], - outsize = (concat ? heads * out_channel : out_channel, + outsize = (concat ? heads * D_OUT : D_OUT, g.num_nodes)) end end @testset "edge features" begin ein = 3 - l = GATConv((in_channel, ein) => out_channel, add_self_loops = false, dropout=0) - g = GNNGraph(g1, edata = rand(T, ein, g1.num_edges)) + l = GATConv((D_IN, ein) => D_OUT, add_self_loops = false, dropout=0) + g = GNNGraph(TEST_GRAPHS[1], edata = rand(Float32, ein, TEST_GRAPHS[1].num_edges)) test_layer(l, g, rtol = RTOL_LOW, exclude_grad_fields = [:negative_slope, :dropout], - outsize = (out_channel, g.num_nodes)) + outsize = (D_OUT, g.num_nodes)) end @testset "num params" begin l = GATConv(2 => 3, add_self_loops = false) - @test length(Flux.params(l)) == 3 + @test length(Flux.trainables(l)) == 3 l = GATConv((2, 4) => 3, add_self_loops = false) - @test length(Flux.params(l)) == 4 + @test length(Flux.trainables(l)) == 4 l = GATConv((2, 4) => 3, add_self_loops = false, bias = false) - @test length(Flux.params(l)) == 3 + @test length(Flux.trainables(l)) == 3 end end -@testset "GATv2Conv" begin +@testitem "GATv2Conv" setup=[TolSnippet, TestModule] begin + using .TestModule for heads in (1, 2), concat in (true, false) - l = GATv2Conv(in_channel => out_channel, tanh; heads, concat, dropout=0) - for g in test_graphs + l = GATv2Conv(D_IN => D_OUT, tanh; heads, concat, dropout=0) + for g in TEST_GRAPHS test_layer(l, g, rtol = RTOL_LOW, atol=ATOL_LOW, exclude_grad_fields = [:negative_slope, :dropout], - outsize = (concat ? heads * out_channel : out_channel, + outsize = (concat ? 
heads * D_OUT : D_OUT, g.num_nodes)) end end @testset "edge features" begin ein = 3 - l = GATv2Conv((in_channel, ein) => out_channel, add_self_loops = false, dropout=0) - g = GNNGraph(g1, edata = rand(T, ein, g1.num_edges)) + l = GATv2Conv((D_IN, ein) => D_OUT, add_self_loops = false, dropout=0) + g = GNNGraph(TEST_GRAPHS[1], edata = rand(Float32, ein, TEST_GRAPHS[1].num_edges)) test_layer(l, g, rtol = RTOL_LOW, atol=ATOL_LOW, exclude_grad_fields = [:negative_slope, :dropout], - outsize = (out_channel, g.num_nodes)) + outsize = (D_OUT, g.num_nodes)) end @testset "num params" begin l = GATv2Conv(2 => 3, add_self_loops = false) - @test length(Flux.params(l)) == 5 + @test length(Flux.trainables(l)) == 5 l = GATv2Conv((2, 4) => 3, add_self_loops = false) - @test length(Flux.params(l)) == 6 + @test length(Flux.trainables(l)) == 6 l = GATv2Conv((2, 4) => 3, add_self_loops = false, bias = false) - @test length(Flux.params(l)) == 4 + @test length(Flux.trainables(l)) == 4 end end -@testset "GatedGraphConv" begin +@testitem "GatedGraphConv" setup=[TolSnippet, TestModule] begin + using .TestModule num_layers = 3 - l = GatedGraphConv(out_channel, num_layers) - @test size(l.weight) == (out_channel, out_channel, num_layers) + l = GatedGraphConv(D_OUT, num_layers) + @test size(l.weight) == (D_OUT, D_OUT, num_layers) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end -@testset "EdgeConv" begin - l = EdgeConv(Dense(2 * in_channel, out_channel), aggr = +) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) +@testitem "EdgeConv" setup=[TolSnippet, TestModule] begin + using .TestModule + l = EdgeConv(Dense(2 * D_IN, D_OUT), aggr = +) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end -@testset "GINConv" begin - nn = Dense(in_channel, out_channel) +@testitem "GINConv" setup=[TolSnippet, TestModule] begin + using .TestModule + nn = Dense(D_IN, D_OUT) l = GINConv(nn, 0.01f0, aggr = mean) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end @test !in(:eps, Flux.trainable(l)) end -@testset "NNConv" begin +@testitem "NNConv" setup=[TolSnippet, TestModule] begin + using .TestModule edim = 10 - nn = Dense(edim, out_channel * in_channel) + nn = Dense(edim, D_OUT * D_IN) - l = NNConv(in_channel => out_channel, nn, tanh, bias = true, aggr = +) - for g in test_graphs - g = GNNGraph(g, edata = rand(T, edim, g.num_edges)) - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = NNConv(D_IN => D_OUT, nn, tanh, bias = true, aggr = +) + for g in TEST_GRAPHS + g = GNNGraph(g, edata = rand(Float32, edim, g.num_edges)) + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end -@testset "SAGEConv" begin - l = SAGEConv(in_channel => out_channel) +@testitem "SAGEConv" setup=[TolSnippet, TestModule] begin + using .TestModule + l = SAGEConv(D_IN => D_OUT) @test l.aggr == mean - l = SAGEConv(in_channel => out_channel, tanh, bias = false, aggr = +) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = SAGEConv(D_IN => D_OUT, tanh, bias = false, aggr = +) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end -@testset 
"ResGatedGraphConv" begin - l = ResGatedGraphConv(in_channel => out_channel, tanh, bias = true) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) +@testitem "ResGatedGraphConv" setup=[TolSnippet, TestModule] begin + using .TestModule + l = ResGatedGraphConv(D_IN => D_OUT, tanh, bias = true) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end -@testset "CGConv" begin +@testitem "CGConv" setup=[TolSnippet, TestModule] begin + using .TestModule + edim = 10 - l = CGConv((in_channel, edim) => out_channel, tanh, residual = false, bias = true) - for g in test_graphs - g = GNNGraph(g, edata = rand(T, edim, g.num_edges)) - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = CGConv((D_IN, edim) => D_OUT, tanh, residual = false, bias = true) + for g in TEST_GRAPHS + g = GNNGraph(g, edata = rand(Float32, edim, g.num_edges)) + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end # no edge features - l1 = CGConv(in_channel => out_channel, tanh, residual = false, bias = true) + l1 = CGConv(D_IN => D_OUT, tanh, residual = false, bias = true) + g1 = TEST_GRAPHS[1] @test l1(g1, g1.ndata.x) == l1(g1).ndata.x @test l1(g1, g1.ndata.x, nothing) == l1(g1).ndata.x end -@testset "AGNNConv" begin +@testitem "AGNNConv" setup=[TolSnippet, TestModule] begin + using .TestModule l = AGNNConv(trainable=false, add_self_loops=false) @test l.β == [1.0f0] @test l.add_self_loops == false @@ -253,115 +244,124 @@ end @test l.add_self_loops == true @test l.trainable == true Flux.trainable(l) == (; β = [1f0]) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (in_channel, g.num_nodes)) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_IN, g.num_nodes)) end end -@testset "MEGNetConv" begin - l = MEGNetConv(in_channel => out_channel, aggr = +) - for g in test_graphs - g = GNNGraph(g, edata = rand(T, in_channel, g.num_edges)) +@testitem "MEGNetConv" setup=[TolSnippet, TestModule] begin + using .TestModule + l = MEGNetConv(D_IN => D_OUT, aggr = +) + for g in TEST_GRAPHS + g = GNNGraph(g, edata = rand(Float32, D_IN, g.num_edges)) test_layer(l, g, rtol = RTOL_LOW, outtype = :node_edge, - outsize = ((out_channel, g.num_nodes), (out_channel, g.num_edges))) + outsize = ((D_OUT, g.num_nodes), (D_OUT, g.num_edges))) end end -@testset "GMMConv" begin +@testitem "GMMConv" setup=[TolSnippet, TestModule] begin + using .TestModule ein_channel = 10 K = 5 - l = GMMConv((in_channel, ein_channel) => out_channel, K = K) - for g in test_graphs + l = GMMConv((D_IN, ein_channel) => D_OUT, K = K) + for g in TEST_GRAPHS g = GNNGraph(g, edata = rand(Float32, ein_channel, g.num_edges)) - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end -@testset "SGConv" begin +@testitem "SGConv" setup=[TolSnippet, TestModule] begin + using .TestModule K = [1, 2, 3] # for different number of hops for k in K - l = SGConv(in_channel => out_channel, k, add_self_loops = true) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = SGConv(D_IN => D_OUT, k, add_self_loops = true) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end - l = SGConv(in_channel => out_channel, k, add_self_loops = true) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = SGConv(D_IN 
=> D_OUT, k, add_self_loops = true) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end end -@testset "TAGConv" begin +@testitem "TAGConv" setup=[TolSnippet, TestModule] begin + using .TestModule K = [1, 2, 3] for k in K - l = TAGConv(in_channel => out_channel, k, add_self_loops = true) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = TAGConv(D_IN => D_OUT, k, add_self_loops = true) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end - l = TAGConv(in_channel => out_channel, k, add_self_loops = true) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = TAGConv(D_IN => D_OUT, k, add_self_loops = true) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end end -@testset "EGNNConv" begin - hin = 5 - hout = 5 - hidden = 5 - l = EGNNConv(hin => hout, hidden) - g = rand_graph(10, 20, graph_type = GRAPH_T) - x = rand(T, in_channel, g.num_nodes) - h = randn(T, hin, g.num_nodes) - hnew, xnew = l(g, h, x) - @test size(hnew) == (hout, g.num_nodes) - @test size(xnew) == (in_channel, g.num_nodes) +@testitem "EGNNConv" setup=[TolSnippet, TestModule] begin + using .TestModule + #TODO test gradient + @testset "EGNNConv $GRAPH_T" for GRAPH_T in GRAPH_TYPES + hin = 5 + hout = 5 + hidden = 5 + l = EGNNConv(hin => hout, hidden) + g = rand_graph(10, 20, graph_type = GRAPH_T) + x = rand(Float32, D_IN, g.num_nodes) + h = randn(Float32, hin, g.num_nodes) + hnew, xnew = l(g, h, x) + @test size(hnew) == (hout, g.num_nodes) + @test size(xnew) == (D_IN, g.num_nodes) + end end -@testset "TransformerConv" begin +@testitem "TransformerConv" setup=[TolSnippet, TestModule] begin + using .TestModule ein = 2 heads = 3 # used like in Kool et al., 2019 - l = TransformerConv(in_channel * heads => in_channel; heads, add_self_loops = true, + l = TransformerConv(D_IN * heads => D_IN; heads, add_self_loops = true, root_weight = false, ff_channels = 10, skip_connection = true, batch_norm = false) # batch_norm=false here for tests to pass; true in paper - for adj in [adj1, adj_single_vertex] - g = GNNGraph(adj, ndata = rand(T, in_channel * heads, size(adj, 1)), - graph_type = GRAPH_T) + for g in TEST_GRAPHS + g = GNNGraph(g, ndata = rand(Float32, D_IN * heads, g.num_nodes)) test_layer(l, g, rtol = RTOL_LOW, exclude_grad_fields = [:negative_slope], - outsize = (in_channel * heads, g.num_nodes)) + outsize = (D_IN * heads, g.num_nodes)) end # used like in Shi et al., 2021 - l = TransformerConv((in_channel, ein) => in_channel; heads, gating = true, + l = TransformerConv((D_IN, ein) => D_IN; heads, gating = true, bias_qkv = true) - for g in test_graphs - g = GNNGraph(g, edata = rand(T, ein, g.num_edges)) + for g in TEST_GRAPHS + g = GNNGraph(g, edata = rand(Float32, ein, g.num_edges)) test_layer(l, g, rtol = RTOL_LOW, exclude_grad_fields = [:negative_slope], - outsize = (in_channel * heads, g.num_nodes)) + outsize = (D_IN * heads, g.num_nodes)) end # test averaging heads - l = TransformerConv(in_channel => in_channel; heads, concat = false, + l = TransformerConv(D_IN => D_IN; heads, concat = false, bias_root = false, root_weight = false) - for g in test_graphs + for g in TEST_GRAPHS test_layer(l, g, rtol = RTOL_LOW, exclude_grad_fields = [:negative_slope], - outsize = (in_channel, g.num_nodes)) + outsize = (D_IN, g.num_nodes)) end end -@testset "DConv" begin +@testitem "DConv" setup=[TolSnippet, 
TestModule] begin + using .TestModule K = [1, 2, 3] # for different number of hops for k in K - l = DConv(in_channel => out_channel, k) - for g in test_graphs - test_layer(l, g, rtol = RTOL_HIGH, outsize = (out_channel, g.num_nodes)) + l = DConv(D_IN => D_OUT, k) + for g in TEST_GRAPHS + test_layer(l, g, rtol = RTOL_HIGH, outsize = (D_OUT, g.num_nodes)) end end end \ No newline at end of file diff --git a/GraphNeuralNetworks/test/layers/heteroconv.jl b/GraphNeuralNetworks/test/layers/heteroconv.jl index d9eaf0c7f..7584378cf 100644 --- a/GraphNeuralNetworks/test/layers/heteroconv.jl +++ b/GraphNeuralNetworks/test/layers/heteroconv.jl @@ -1,4 +1,5 @@ -@testset "HeteroGraphConv" begin +@testitem "HeteroGraphConv" setup=[TestModule] begin + using .TestModule d, n = 3, 5 g = rand_bipartite_heterograph((n, 2*n), 15) hg = rand_bipartite_heterograph((2,3), 6) diff --git a/GraphNeuralNetworks/test/layers/pool.jl b/GraphNeuralNetworks/test/layers/pool.jl index 24f5d66bf..60789f2e5 100644 --- a/GraphNeuralNetworks/test/layers/pool.jl +++ b/GraphNeuralNetworks/test/layers/pool.jl @@ -1,45 +1,52 @@ -@testset "GlobalPool" begin - p = GlobalPool(+) - n = 10 - chin = 6 - X = rand(Float32, 6, n) - g = GNNGraph(random_regular_graph(n, 4), ndata = X, graph_type = GRAPH_T) - u = p(g, X) - @test u ≈ sum(X, dims = 2) +@testitem "GlobalPool" setup=[TestModule] begin + using .TestModule + @testset "GlobalPool $GRAPH_T" for GRAPH_T in GRAPH_TYPES + p = GlobalPool(+) + n = 10 + chin = 6 + X = rand(Float32, 6, n) + g = GNNGraph(random_regular_graph(n, 4), ndata = X, graph_type = GRAPH_T) + u = p(g, X) + @test u ≈ sum(X, dims = 2) - ng = 3 - g = Flux.batch([GNNGraph(random_regular_graph(n, 4), - ndata = rand(Float32, chin, n), - graph_type = GRAPH_T) - for i in 1:ng]) - u = p(g, g.ndata.x) - @test size(u) == (chin, ng) - @test u[:, [1]] ≈ sum(g.ndata.x[:, 1:n], dims = 2) - @test p(g).gdata.u == u + ng = 3 + g = Flux.batch([GNNGraph(random_regular_graph(n, 4), + ndata = rand(Float32, chin, n), + graph_type = GRAPH_T) + for i in 1:ng]) + u = p(g, g.ndata.x) + @test size(u) == (chin, ng) + @test u[:, [1]] ≈ sum(g.ndata.x[:, 1:n], dims = 2) + @test p(g).gdata.u == u - test_layer(p, g, rtol = 1e-5, exclude_grad_fields = [:aggr], outtype = :graph) + test_layer(p, g, rtol = 1e-5, exclude_grad_fields = [:aggr], outtype = :graph) + end end -@testset "GlobalAttentionPool" begin - n = 10 - chin = 6 - chout = 5 - ng = 3 +@testitem "GlobalAttentionPool" setup=[TestModule] begin + using .TestModule + @testset "GlobalAttentionPool $GRAPH_T" for GRAPH_T in GRAPH_TYPES + n = 10 + chin = 6 + chout = 5 + ng = 3 - fgate = Dense(chin, 1) - ffeat = Dense(chin, chout) - p = GlobalAttentionPool(fgate, ffeat) - @test length(Flux.params(p)) == 4 + fgate = Dense(chin, 1) + ffeat = Dense(chin, chout) + p = GlobalAttentionPool(fgate, ffeat) + @test length(Flux.trainables(p)) == 4 - g = Flux.batch([GNNGraph(random_regular_graph(n, 4), - ndata = rand(Float32, chin, n), - graph_type = GRAPH_T) - for i in 1:ng]) + g = Flux.batch([GNNGraph(random_regular_graph(n, 4), + ndata = rand(Float32, chin, n), + graph_type = GRAPH_T) + for i in 1:ng]) - test_layer(p, g, rtol = 1e-5, outtype = :graph, outsize = (chout, ng)) + test_layer(p, g, rtol = 1e-5, outtype = :graph, outsize = (chout, ng)) + end end -@testset "TopKPool" begin +@testitem "TopKPool" setup=[TestModule] begin + using .TestModule N = 10 k, in_channel = 4, 7 X = rand(in_channel, N) @@ -55,23 +62,28 @@ end end end -@testset "topk_index" begin + +@testitem "topk_index" begin X = [8, 7, 6, 5, 4, 
3, 2, 1] @test topk_index(X, 4) == [1, 2, 3, 4] @test topk_index(X', 4) == [1, 2, 3, 4] end -@testset "Set2Set" begin - n_in = 3 - n_iters = 2 - n_layers = 1 - g = batch([rand_graph(10, 40, graph_type = GRAPH_T) for _ in 1:5]) - g = GNNGraph(g, ndata = rand(Float32, n_in, g.num_nodes)) - l = Set2Set(n_in, n_iters, n_layers) - y = l(g, node_features(g)) - @test size(y) == (2 * n_in, g.num_graphs) - - ## TODO the numerical gradient seems to be 3 times smaller than zygote one - # test_layer(l, g, rtol = 1e-4, atol=1e-4, outtype = :graph, outsize = (2 * n_in, g.num_graphs), - # verbose=true, exclude_grad_fields = [:state0, :state]) -end \ No newline at end of file +@testitem "Set2Set" setup=[TestModule] begin + using .TestModule + @testset "Set2Set $GRAPH_T" for GRAPH_T in GRAPH_TYPES + + n_in = 3 + n_iters = 2 + n_layers = 1 + g = batch([rand_graph(10, 40, graph_type = GRAPH_T) for _ in 1:5]) + g = GNNGraph(g, ndata = rand(Float32, n_in, g.num_nodes)) + l = Set2Set(n_in, n_iters, n_layers) + y = l(g, node_features(g)) + @test size(y) == (2 * n_in, g.num_graphs) + + ## TODO the numerical gradient seems to be 3 times smaller than zygote one + # test_layer(l, g, rtol = 1e-4, atol=1e-4, outtype = :graph, outsize = (2 * n_in, g.num_graphs), + # verbose=true, exclude_grad_fields = [:state0, :state]) + end +end diff --git a/GraphNeuralNetworks/test/layers/temporalconv.jl b/GraphNeuralNetworks/test/layers/temporalconv.jl index bdf44b45f..ab7d28a3a 100644 --- a/GraphNeuralNetworks/test/layers/temporalconv.jl +++ b/GraphNeuralNetworks/test/layers/temporalconv.jl @@ -1,16 +1,21 @@ -in_channel = 3 -out_channel = 5 -N = 4 -S = 5 -T = Float32 +@testmodule TemporalConvTestModule begin + using GraphNeuralNetworks + export in_channel, out_channel, N, S, T, g1, tg + in_channel = 3 + out_channel = 5 + N = 4 + S = 5 + T = Float32 -g1 = GNNGraph(rand_graph(N,8), - ndata = rand(T, in_channel, N), - graph_type = :sparse) + g1 = GNNGraph(rand_graph(N,8), + ndata = rand(T, in_channel, N), + graph_type = :sparse) -tg = TemporalSnapshotsGNNGraph([g1 for _ in 1:S]) + tg = TemporalSnapshotsGNNGraph([g1 for _ in 1:S]) +end -@testset "TGCNCell" begin +@testitem "TGCNCell" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule tgcn = GraphNeuralNetworks.TGCNCell(in_channel => out_channel) h, x̃ = tgcn(tgcn.state0, g1, g1.ndata.x) @test size(h) == (out_channel, N) @@ -18,7 +23,8 @@ tg = TemporalSnapshotsGNNGraph([g1 for _ in 1:S]) @test h == x̃ end -@testset "TGCN" begin +@testitem "TGCN" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule tgcn = TGCN(in_channel => out_channel) @test size(Flux.gradient(x -> sum(tgcn(g1, x)), g1.ndata.x)[1]) == (in_channel, N) model = GNNChain(TGCN(in_channel => out_channel), Dense(out_channel, 1)) @@ -26,7 +32,8 @@ end @test model(g1) isa GNNGraph end -@testset "A3TGCN" begin +@testitem "A3TGCN" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule a3tgcn = A3TGCN(in_channel => out_channel) @test size(Flux.gradient(x -> sum(a3tgcn(g1, x)), g1.ndata.x)[1]) == (in_channel, N) model = GNNChain(A3TGCN(in_channel => out_channel), Dense(out_channel, 1)) @@ -34,26 +41,30 @@ end @test model(g1) isa GNNGraph end -@testset "GConvLSTMCell" begin +@testitem "GConvLSTMCell" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule gconvlstm = GraphNeuralNetworks.GConvLSTMCell(in_channel => out_channel, 2, g1.num_nodes) (h, c), h = 
gconvlstm(gconvlstm.state0, g1, g1.ndata.x) @test size(h) == (out_channel, N) @test size(c) == (out_channel, N) end -@testset "GConvLSTM" begin +@testitem "GConvLSTM" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule gconvlstm = GConvLSTM(in_channel => out_channel, 2, g1.num_nodes) @test size(Flux.gradient(x -> sum(gconvlstm(g1, x)), g1.ndata.x)[1]) == (in_channel, N) model = GNNChain(GConvLSTM(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) end -@testset "GConvGRUCell" begin +@testitem "GConvGRUCell" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule gconvlstm = GraphNeuralNetworks.GConvGRUCell(in_channel => out_channel, 2, g1.num_nodes) h, h = gconvlstm(gconvlstm.state0, g1, g1.ndata.x) @test size(h) == (out_channel, N) end -@testset "GConvGRU" begin +@testitem "GConvGRU" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule gconvlstm = GConvGRU(in_channel => out_channel, 2, g1.num_nodes) @test size(Flux.gradient(x -> sum(gconvlstm(g1, x)), g1.ndata.x)[1]) == (in_channel, N) model = GNNChain(GConvGRU(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) @@ -61,7 +72,8 @@ end @test model(g1) isa GNNGraph end -@testset "DCGRU" begin +@testitem "DCGRU" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule dcgru = DCGRU(in_channel => out_channel, 2, g1.num_nodes) @test size(Flux.gradient(x -> sum(dcgru(g1, x)), g1.ndata.x)[1]) == (in_channel, N) model = GNNChain(DCGRU(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) @@ -69,90 +81,23 @@ end @test model(g1) isa GNNGraph end -@testset "EvolveGCNO" begin +@testitem "EvolveGCNO" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule evolvegcno = EvolveGCNO(in_channel => out_channel) @test length(Flux.gradient(x -> sum(sum(evolvegcno(tg, x))), tg.ndata.x)[1]) == S @test size(evolvegcno(tg, tg.ndata.x)[1]) == (out_channel, N) end -@testset "GINConv" begin +@testitem "GINConv" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule ginconv = GINConv(Dense(in_channel => out_channel),0.3) @test length(ginconv(tg, tg.ndata.x)) == S @test size(ginconv(tg, tg.ndata.x)[1]) == (out_channel, N) @test length(Flux.gradient(x ->sum(sum(ginconv(tg, x))), tg.ndata.x)[1]) == S end -@testset "ChebConv" begin - chebconv = ChebConv(in_channel => out_channel, 5) - @test length(chebconv(tg, tg.ndata.x)) == S - @test size(chebconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(chebconv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "GATConv" begin - gatconv = GATConv(in_channel => out_channel) - @test length(gatconv(tg, tg.ndata.x)) == S - @test size(gatconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(gatconv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "GATv2Conv" begin - gatv2conv = GATv2Conv(in_channel => out_channel) - @test length(gatv2conv(tg, tg.ndata.x)) == S - @test size(gatv2conv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(gatv2conv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "GatedGraphConv" begin - gatedgraphconv = GatedGraphConv(5, 5) - @test length(gatedgraphconv(tg, tg.ndata.x)) == S - @test size(gatedgraphconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(gatedgraphconv(tg, x))), tg.ndata.x)[1]) == S -end - 
-@testset "CGConv" begin - cgconv = CGConv(in_channel => out_channel) - @test length(cgconv(tg, tg.ndata.x)) == S - @test size(cgconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(cgconv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "SGConv" begin - sgconv = SGConv(in_channel => out_channel) - @test length(sgconv(tg, tg.ndata.x)) == S - @test size(sgconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(sgconv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "TransformerConv" begin - transformerconv = TransformerConv(in_channel => out_channel) - @test length(transformerconv(tg, tg.ndata.x)) == S - @test size(transformerconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(transformerconv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "GCNConv" begin - gcnconv = GCNConv(in_channel => out_channel) - @test length(gcnconv(tg, tg.ndata.x)) == S - @test size(gcnconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(gcnconv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "ResGatedGraphConv" begin - resgatedconv = ResGatedGraphConv(in_channel => out_channel, tanh) - @test length(resgatedconv(tg, tg.ndata.x)) == S - @test size(resgatedconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(resgatedconv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "SAGEConv" begin - sageconv = SAGEConv(in_channel => out_channel) - @test length(sageconv(tg, tg.ndata.x)) == S - @test size(sageconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(sageconv(tg, x))), tg.ndata.x)[1]) == S -end - -@testset "GraphConv" begin +@testitem "GraphConv" setup=[TemporalConvTestModule, TestModule] begin + using .TemporalConvTestModule, .TestModule graphconv = GraphConv(in_channel => out_channel, tanh) @test length(graphconv(tg, tg.ndata.x)) == S @test size(graphconv(tg, tg.ndata.x)[1]) == (out_channel, N) diff --git a/GraphNeuralNetworks/test/runtests.jl b/GraphNeuralNetworks/test/runtests.jl index f796651bb..b9e874db1 100644 --- a/GraphNeuralNetworks/test/runtests.jl +++ b/GraphNeuralNetworks/test/runtests.jl @@ -1,49 +1,3 @@ -using CUDA -using GraphNeuralNetworks -using GNNGraphs: sort_edge_index -using GNNGraphs: getn, getdata -using Functors -using Flux -using Flux: gpu -using LinearAlgebra, Statistics, Random -using NNlib -import MLUtils -using SparseArrays -using Graphs -using Zygote -using Test -using MLDatasets -using InlineStrings # not used but with the import we test #98 and #104 +using TestItemRunner -CUDA.allowscalar(false) - -const ACUMatrix{T} = Union{CuMatrix{T}, CUDA.CUSPARSE.CuSparseMatrix{T}} - -ENV["DATADEPS_ALWAYS_ACCEPT"] = true # for MLDatasets - -include("test_utils.jl") - -tests = [ - "layers/basic", - "layers/conv", - "layers/heteroconv", - "layers/temporalconv", - "layers/pool", - "examples/node_classification_cora", - "samplers" -] - -!CUDA.functional() && @warn("CUDA unavailable, not testing GPU support") - -# @testset "GraphNeuralNetworks: graph format $graph_type" for graph_type in (:coo, :dense, :sparse) -for graph_type in (:coo, :dense, :sparse) - - @info "Testing graph format :$graph_type" - global GRAPH_T = graph_type - global TEST_GPU = CUDA.functional() && (GRAPH_T != :sparse) - - @testset "$t" for t in tests - startswith(t, "examples") && GRAPH_T == :dense && continue # not testing :dense since causes OutOfMememory on github's CI - include("$t.jl") - end -end +@run_package_tests diff --git 
a/GraphNeuralNetworks/test/samplers.jl b/GraphNeuralNetworks/test/samplers.jl index 546291717..649cea70a 100644 --- a/GraphNeuralNetworks/test/samplers.jl +++ b/GraphNeuralNetworks/test/samplers.jl @@ -1,125 +1,126 @@ -# Helper function to create a simple graph with node features using GNNGraph -function create_test_graph() - source = [1, 2, 3, 4] # Define source nodes of edges - target = [2, 3, 4, 5] # Define target nodes of edges - node_features = rand(Float32, 5, 5) # Create random node features (5 features for 5 nodes) +#TODO reactivate test +# @testitem "NeighborLoader" setup=[TestModule] begin +# using .TestModule +# # Helper function to create a simple graph with node features using GNNGraph +# function create_test_graph() +# source = [1, 2, 3, 4] # Define source nodes of edges +# target = [2, 3, 4, 5] # Define target nodes of edges +# node_features = rand(Float32, 5, 5) # Create random node features (5 features for 5 nodes) - return GNNGraph(source, target, ndata = node_features) # Create a GNNGraph with edges and features -end +# return GNNGraph(source, target, ndata = node_features) # Create a GNNGraph with edges and features +# end -# Tests for NeighborLoader structure and its functionalities -@testset "NeighborLoader tests" begin - # 1. Basic functionality: Check neighbor sampling and subgraph creation - @testset "Basic functionality" begin - g = create_test_graph() +# # 1. Basic functionality: Check neighbor sampling and subgraph creation +# @testset "Basic functionality" begin +# g = create_test_graph() - # Define NeighborLoader with 2 neighbors per layer, 2 layers, batch size 2 - loader = NeighborLoader(g; num_neighbors=[2, 2], input_nodes=[1, 2], num_layers=2, batch_size=2) +# # Define NeighborLoader with 2 neighbors per layer, 2 layers, batch size 2 +# loader = NeighborLoader(g; num_neighbors=[2, 2], input_nodes=[1, 2], num_layers=2, batch_size=2) - mini_batch_gnn, next_state = iterate(loader) +# mini_batch_gnn, next_state = iterate(loader) - # Test if the mini-batch graph is not empty - @test !isempty(mini_batch_gnn.graph) +# # Test if the mini-batch graph is not empty +# @test !isempty(mini_batch_gnn.graph) - num_sampled_nodes = mini_batch_gnn.num_nodes - println("Number of nodes in mini-batch: ", num_sampled_nodes) +# num_sampled_nodes = mini_batch_gnn.num_nodes +# println("Number of nodes in mini-batch: ", num_sampled_nodes) - @test num_sampled_nodes == 2 +# @test num_sampled_nodes == 2 - # Test if there are edges in the subgraph - @test mini_batch_gnn.num_edges > 0 - end +# # Test if there are edges in the subgraph +# @test mini_batch_gnn.num_edges > 0 +# end - # 2. Edge case: Single node with no neighbors - @testset "Single node with no neighbors" begin - g = SimpleDiGraph(1) # A graph with a single node and no edges - node_features = rand(Float32, 5, 1) - graph = GNNGraph(g, ndata = node_features) +# # 2. 
Edge case: Single node with no neighbors +# @testset "Single node with no neighbors" begin +# g = SimpleDiGraph(1) # A graph with a single node and no edges +# node_features = rand(Float32, 5, 1) +# graph = GNNGraph(g, ndata = node_features) - loader = NeighborLoader(graph; num_neighbors=[2], input_nodes=[1], num_layers=1) +# loader = NeighborLoader(graph; num_neighbors=[2], input_nodes=[1], num_layers=1) - mini_batch_gnn, next_state = iterate(loader) +# mini_batch_gnn, next_state = iterate(loader) - # Test if the mini-batch graph contains only one node - @test size(mini_batch_gnn.x, 2) == 1 - end +# # Test if the mini-batch graph contains only one node +# @test size(mini_batch_gnn.x, 2) == 1 +# end - # 3. Edge case: A node with no outgoing edges (isolated node) - @testset "Node with no outgoing edges" begin - g = SimpleDiGraph(2) # Graph with 2 nodes, no edges - node_features = rand(Float32, 5, 2) - graph = GNNGraph(g, ndata = node_features) +# # 3. Edge case: A node with no outgoing edges (isolated node) +# @testset "Node with no outgoing edges" begin +# g = SimpleDiGraph(2) # Graph with 2 nodes, no edges +# node_features = rand(Float32, 5, 2) +# graph = GNNGraph(g, ndata = node_features) - loader = NeighborLoader(graph; num_neighbors=[1], input_nodes=[1, 2], num_layers=1) +# loader = NeighborLoader(graph; num_neighbors=[1], input_nodes=[1, 2], num_layers=1) - mini_batch_gnn, next_state = iterate(loader) +# mini_batch_gnn, next_state = iterate(loader) - # Test if the mini-batch graph contains the input nodes only (as no neighbors can be sampled) - @test size(mini_batch_gnn.x, 2) == 2 # Only two isolated nodes - end +# # Test if the mini-batch graph contains the input nodes only (as no neighbors can be sampled) +# @test size(mini_batch_gnn.x, 2) == 2 # Only two isolated nodes +# end - # 4. Edge case: A fully connected graph - @testset "Fully connected graph" begin - g = SimpleDiGraph(3) - add_edge!(g, 1, 2) - add_edge!(g, 2, 3) - add_edge!(g, 3, 1) - node_features = rand(Float32, 5, 3) - graph = GNNGraph(g, ndata = node_features) +# # 4. Edge case: A fully connected graph +# @testset "Fully connected graph" begin +# g = SimpleDiGraph(3) +# add_edge!(g, 1, 2) +# add_edge!(g, 2, 3) +# add_edge!(g, 3, 1) +# node_features = rand(Float32, 5, 3) +# graph = GNNGraph(g, ndata = node_features) - loader = NeighborLoader(graph; num_neighbors=[2, 2], input_nodes=[1], num_layers=2) +# loader = NeighborLoader(graph; num_neighbors=[2, 2], input_nodes=[1], num_layers=2) - mini_batch_gnn, next_state = iterate(loader) +# mini_batch_gnn, next_state = iterate(loader) - # Test if all nodes are included in the mini-batch since it's fully connected - @test size(mini_batch_gnn.x, 2) == 3 # All nodes should be included - end +# # Test if all nodes are included in the mini-batch since it's fully connected +# @test size(mini_batch_gnn.x, 2) == 3 # All nodes should be included +# end - # 5. Edge case: More layers than the number of neighbors - @testset "More layers than available neighbors" begin - g = SimpleDiGraph(3) - add_edge!(g, 1, 2) - add_edge!(g, 2, 3) - node_features = rand(Float32, 5, 3) - graph = GNNGraph(g, ndata = node_features) +# # 5. 
Edge case: More layers than the number of neighbors +# @testset "More layers than available neighbors" begin +# g = SimpleDiGraph(3) +# add_edge!(g, 1, 2) +# add_edge!(g, 2, 3) +# node_features = rand(Float32, 5, 3) +# graph = GNNGraph(g, ndata = node_features) - # Test with 3 layers but only enough connections for 2 layers - loader = NeighborLoader(graph; num_neighbors=[1, 1, 1], input_nodes=[1], num_layers=3) +# # Test with 3 layers but only enough connections for 2 layers +# loader = NeighborLoader(graph; num_neighbors=[1, 1, 1], input_nodes=[1], num_layers=3) - mini_batch_gnn, next_state = iterate(loader) +# mini_batch_gnn, next_state = iterate(loader) - # Test if the mini-batch graph contains all available nodes - @test size(mini_batch_gnn.x, 2) == 1 - end +# # Test if the mini-batch graph contains all available nodes +# @test size(mini_batch_gnn.x, 2) == 1 +# end - # 6. Edge case: Large batch size greater than the number of input nodes - @testset "Large batch size" begin - g = create_test_graph() +# # 6. Edge case: Large batch size greater than the number of input nodes +# @testset "Large batch size" begin +# g = create_test_graph() - # Define NeighborLoader with a larger batch size than input nodes - loader = NeighborLoader(g; num_neighbors=[2], input_nodes=[1, 2], num_layers=1, batch_size=10) +# # Define NeighborLoader with a larger batch size than input nodes +# loader = NeighborLoader(g; num_neighbors=[2], input_nodes=[1, 2], num_layers=1, batch_size=10) - mini_batch_gnn, next_state = iterate(loader) +# mini_batch_gnn, next_state = iterate(loader) - # Test if the mini-batch graph is not empty - @test !isempty(mini_batch_gnn.graph) +# # Test if the mini-batch graph is not empty +# @test !isempty(mini_batch_gnn.graph) - # Test if the correct number of nodes are sampled - @test size(mini_batch_gnn.x, 2) == length(unique([1, 2])) # Nodes [1, 2] are expected - end +# # Test if the correct number of nodes are sampled +# @test size(mini_batch_gnn.x, 2) == length(unique([1, 2])) # Nodes [1, 2] are expected +# end - # 7. Edge case: No neighbors sampled (num_neighbors = [0]) and 1 layer - @testset "No neighbors sampled" begin - g = create_test_graph() +# # 7. 
Edge case: No neighbors sampled (num_neighbors = [0]) and 1 layer +# @testset "No neighbors sampled" begin +# g = create_test_graph() - # Define NeighborLoader with 0 neighbors per layer, 1 layer, batch size 2 - loader = NeighborLoader(g; num_neighbors=[0], input_nodes=[1, 2], num_layers=1, batch_size=2) +# # Define NeighborLoader with 0 neighbors per layer, 1 layer, batch size 2 +# loader = NeighborLoader(g; num_neighbors=[0], input_nodes=[1, 2], num_layers=1, batch_size=2) - mini_batch_gnn, next_state = iterate(loader) +# mini_batch_gnn, next_state = iterate(loader) - # Test if the mini-batch graph contains only the input nodes - @test size(mini_batch_gnn.x, 2) == 2 # No neighbors should be sampled, only nodes 1 and 2 should be in the graph - end +# # Test if the mini-batch graph contains only the input nodes +# @test size(mini_batch_gnn.x, 2) == 2 # No neighbors should be sampled, only nodes 1 and 2 should be in the graph +# end -end \ No newline at end of file +# end \ No newline at end of file diff --git a/GraphNeuralNetworks/test/test_utils.jl b/GraphNeuralNetworks/test/test_module.jl similarity index 86% rename from GraphNeuralNetworks/test/test_utils.jl rename to GraphNeuralNetworks/test/test_module.jl index fe8f9a997..7c5b9a77a 100644 --- a/GraphNeuralNetworks/test/test_utils.jl +++ b/GraphNeuralNetworks/test/test_module.jl @@ -1,6 +1,23 @@ +@testmodule TestModule begin + +using GraphNeuralNetworks +using Test +using Statistics, Random +using Flux, Functors +using Graphs using ChainRulesTestUtils, FiniteDifferences, Zygote, Adapt, CUDA CUDA.allowscalar(false) +# from other packages +export Flux, gradient, Dense, Chain, relu, random_regular_graph, erdos_renyi, + BatchNorm, LayerNorm, Dropout, Parallel +export mean, randn +# from this module +export D_IN, D_OUT, test_layer, ngradient, GRAPH_TYPES, TEST_GRAPHS + +const D_IN = 3 +const D_OUT = 5 + function ngradient(f, x...) fdm = central_fdm(5, 1) return FiniteDifferences.grad(fdm, f, x...) @@ -24,7 +41,7 @@ end function test_layer(l, g::GNNGraph; atol = 1e-5, rtol = 1e-5, exclude_grad_fields = [], verbose = false, - test_gpu = TEST_GPU, + test_gpu = false, outsize = nothing, outtype = :node) @@ -226,3 +243,33 @@ Adapt.adapt_storage(::GNNEltypeAdaptor{T}, x::AbstractArray{<:Integer}) where T Adapt.adapt_storage(::GNNEltypeAdaptor{T}, x::AbstractArray{<:Number}) where T = convert(AbstractArray{T}, x) _paramtype(::Type{T}, m) where T = fmap(adapt(GNNEltypeAdaptor{T}()), m) + +function generate_test_graphs(graph_type) + adj1 = [0 1 0 1 + 1 0 1 0 + 0 1 0 1 + 1 0 1 0] + + g1 = GNNGraph(adj1, + ndata = rand(Float32, D_IN, 4); + graph_type) + + adj_single_vertex = [0 0 0 1 + 0 0 0 0 + 0 0 0 1 + 1 0 1 0] + + g_single_vertex = GNNGraph(adj_single_vertex, + ndata = rand(Float32, D_IN, 4); + graph_type) + + return (g1, g_single_vertex) +end + +GRAPH_TYPES = [:coo, :dense, :sparse] +TEST_GRAPHS = [generate_test_graphs(:coo)..., + generate_test_graphs(:dense)..., + generate_test_graphs(:sparse)...] + +end # testmodule +
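
A note on running the migrated suite: each `@testitem` above is self-contained, pulling its shared fixtures in through `setup=[TestModule]`, and `runtests.jl` now discovers every item via `@run_package_tests`. As a minimal sketch of how a subset can be run while iterating on a single layer, assuming TestItemRunner's documented `filter` keyword and the `ti.name` field (the `"GCNConv"` pattern is only an illustration):

    using TestItemRunner
    # run only the test items whose name mentions GCNConv
    @run_package_tests filter = ti -> occursin("GCNConv", ti.name)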