diff --git a/GNNlib/ext/GNNlibCUDAExt.jl b/GNNlib/ext/GNNlibCUDAExt.jl
index 7cf5adceb..afe22c3f0 100644
--- a/GNNlib/ext/GNNlibCUDAExt.jl
+++ b/GNNlib/ext/GNNlibCUDAExt.jl
@@ -10,7 +10,7 @@ using GNNGraphs: GNNGraph, COO_T, SPARSE_T
 ## COPY_XJ
 
 ## avoid the fast path on gpu until we have better cuda support
-function GNNlib.propagate(::typeof(copy_xj), g::GNNGraph{<:Union{COO_T, SPARSE_T}}, ::typeof(+),
+function GNNlib.propagate(::typeof(copy_xj), g::GNNGraph{COO_T}, ::typeof(+),
                           xi, xj::AnyCuMatrix, e)
     propagate((xi, xj, e) -> copy_xj(xi, xj, e), g, +, xi, xj, e)
 end
diff --git a/GNNlib/src/msgpass.jl b/GNNlib/src/msgpass.jl
index 7bbe2ab58..7b7685e1b 100644
--- a/GNNlib/src/msgpass.jl
+++ b/GNNlib/src/msgpass.jl
@@ -213,7 +213,7 @@ end
 ## COPY_XJ
 
 function propagate(::typeof(copy_xj), g::GNNGraph, ::typeof(+),
                    xi, xj::AbstractMatrix, e)
-    A = adjacency_matrix(g, weighted = false)
+    A = adjacency_matrix(g, eltype(xj); weighted = false)
     return xj * A
 end
diff --git a/GraphNeuralNetworks/perf/Project.toml b/GraphNeuralNetworks/perf/Project.toml
index ddbb1be6e..c09a51049 100644
--- a/GraphNeuralNetworks/perf/Project.toml
+++ b/GraphNeuralNetworks/perf/Project.toml
@@ -1,6 +1,10 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+GNNGraphs = "aed8fd31-079b-4b5a-b342-a13352159b8c"
+GNNlib = "a6a84749-d869-43f8-aacc-be26a1996e48"
 GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694"
+Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
diff --git a/GraphNeuralNetworks/perf/sparse_propagate_cuda.jl b/GraphNeuralNetworks/perf/sparse_propagate_cuda.jl
new file mode 100644
index 000000000..fee5372d6
--- /dev/null
+++ b/GraphNeuralNetworks/perf/sparse_propagate_cuda.jl
@@ -0,0 +1,48 @@
+# # Activate the perf environment
+# using Pkg
+# Pkg.activate(@__DIR__)
+# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNGraphs"))
+# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNlib"))
+# Pkg.develop(path=joinpath(@__DIR__, ".."))
+# Pkg.instantiate()
+using SparseArrays
+using GraphNeuralNetworks
+using BenchmarkTools
+import Random: seed!
+using LinearAlgebra
+using Flux, CUDA
+
+# ENV["JULIA_DEBUG"] = "GraphNeuralNetworks,GNNlib,GNNlibCUDAExt,GNNGraphs,GNNGraphsCUDAExt,CUDA" # packages with debug logging enabled; don't put whitespace between the package names
+
+function prop_copy_xj(graph_type, sp_p, n, feat_size)
+    A = sprand(n, n, sp_p)
+    b = rand(1, n)
+    B = rand(feat_size, n)
+    g = GNNGraph(A,
+                 ndata = (; b = b, B = B),
+                 edata = (; A = reshape(A.nzval, 1, :)),
+                 graph_type = graph_type) |> dev
+    printstyled("propagate copy_xj for graph type: $graph_type", "\n", color=:yellow)
+    CUDA.@sync propagate(copy_xj, g, +; xj = g.ndata.B) # run once to compile before benchmarking
+    # @profview for _ in 1:1000
+    #     propagate(copy_xj, g, +; xj = g.ndata.B)
+    # end
+    @btime CUDA.@sync propagate($copy_xj, $g, +; xj = $g.ndata.B) # using spmm for :sparse
+    printstyled("gather/scatter propagate copy_xj for graph type: $graph_type", "\n", color=:yellow)
+    CUDA.@sync propagate((xi, xj, e) -> xj, g, +; xj = g.ndata.B) # run once to compile before benchmarking
+    @btime CUDA.@sync propagate((xi, xj, e) -> xj, $g, +; xj = $g.ndata.B) # using gather/scatter
+    return nothing
+end
+
+seed!(0)
+dev = gpu_device()
+println("Device: ", dev)
+feat_size = 128
+# test for :sparse graph_type
+for n in (32, 128, 1024)
+    for sp_p in (0.01, 0.1, 0.9)
+        printstyled("n = $n, feat_size = $feat_size, sparsity = $sp_p\n", color=:blue)
+        prop_copy_xj(:sparse, sp_p, n, feat_size)
+        println()
+    end
+end
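
Note (not part of the patch): the msgpass.jl change matters because the copy_xj/+ fast path computes the aggregation as a single sparse matrix product xj * A, and passing eltype(xj) builds the adjacency matrix in the features' element type, so Float32 features are not promoted (e.g. to Float64) by the multiplication. Below is a minimal CPU sketch of the equivalence the benchmark script measures; the graph and names (g, xj) are illustrative, not from the patch:

using GraphNeuralNetworks

g = rand_graph(10, 40)                                # small random graph: 10 nodes, 40 edges
xj = rand(Float32, 8, 10)                             # 8 features per node, one column per node

# fast path: one sparse-dense matrix product, in the feature eltype
A = adjacency_matrix(g, eltype(xj); weighted = false)
y_spmm = xj * A
@assert eltype(y_spmm) == Float32                     # no accidental promotion

# generic path: gather messages along edges, scatter-add onto target nodes
y_gs = propagate((xi, xj, e) -> xj, g, +; xj = xj)
@assert y_spmm ≈ y_gs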