diff --git a/GNNGraphs/ext/GNNGraphsCUDAExt.jl b/GNNGraphs/ext/GNNGraphsCUDAExt.jl
index af9e9f820..2b38ee739 100644
--- a/GNNGraphs/ext/GNNGraphsCUDAExt.jl
+++ b/GNNGraphs/ext/GNNGraphsCUDAExt.jl
@@ -5,8 +5,10 @@ using Random, Statistics, LinearAlgebra
 using GNNGraphs
 using GNNGraphs: COO_T, ADJMAT_T, SPARSE_T 
 using SparseArrays
+using Graphs
 
 const CUMAT_T = Union{CUDA.AnyCuMatrix, CUDA.CUSPARSE.CuSparseMatrix}
+const CUDA_COO_T = Tuple{T, T, V} where {T <: AnyCuArray{<:Integer}, V <: Union{Nothing, AnyCuArray}}
 
 # Query 
 
@@ -35,5 +37,31 @@ function sort_edge_index(u::AnyCuArray, v::AnyCuArray)
     sort_edge_index(u, v) |> dev
 end
 
+# Convert
+
+function GNNGraphs.to_sparse(coo::CUDA_COO_T, T = nothing; dir = :out, num_nodes = nothing,
+                   weighted = true, is_coalesced = false)
+    # Build a sparse matrix from a COO triple `(s, t, eweight)` of CUDA arrays and return
+    # `(A, num_nodes, num_edges)`. The `dir` keyword is accepted but not used in this method.
+    s, t, eweight = coo
+    if T === nothing
+        # default eltype: that of the edge weights when present, otherwise that of the indices
+        T = eweight === nothing ? eltype(s) : eltype(eweight)
+    end
+    if !weighted || eweight === nothing
+        eweight = fill!(similar(s, T), 1) # every edge gets weight one
+    end
+    num_nodes::Int = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes
+    # a coalesced edge list is wrapped directly in a COO matrix, otherwise `sparse` builds it
+    A = if is_coalesced
+        CUDA.CUSPARSE.CuSparseMatrixCOO{T,eltype(s)}(s, t, eweight, (num_nodes, num_nodes))
+    else
+        sparse(s, t, eweight, num_nodes, num_nodes)
+    end
+    num_edges::Int = nnz(A)
+    # convert the values whenever the matrix element type differs from the requested `T`
+    A = eltype(A) == T ? A : T.(A)
+    return A, num_nodes, num_edges
+end
 
 end #module
diff --git a/GNNGraphs/src/gnngraph.jl b/GNNGraphs/src/gnngraph.jl
index b90cfc032..e59c8f1e0 100644
--- a/GNNGraphs/src/gnngraph.jl
+++ b/GNNGraphs/src/gnngraph.jl
@@ -113,7 +113,7 @@ struct GNNGraph{T <: Union{COO_T, ADJMAT_T}} <: AbstractGNNGraph{T}
     ndata::DataStore
     edata::DataStore
     gdata::DataStore
-    is_coalesced::Bool # only for :coo, true if the graph is coalesced, i.e., indices ordered by row and no multi edges
+    is_coalesced::Bool # only for :coo, true if the graph is coalesced, i.e., no multi edges and indices ordered by target, then source
 end
 
 # GNNGraph constructor setting the is_coalesced field to false
diff --git a/GNNGraphs/src/query.jl b/GNNGraphs/src/query.jl
index e2e5f9bdb..482502b4a 100644
--- a/GNNGraphs/src/query.jl
+++ b/GNNGraphs/src/query.jl
@@ -231,13 +231,7 @@ If `weighted=true`, the `A` will contain the edge weights if any, otherwise the
 """
 function Graphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType = eltype(g); dir = :out,
                                  weighted = true)
-    if iscuarray(g.graph[1])
-        # Revisit after 
-        # https://github.com/JuliaGPU/CUDA.jl/issues/1113
-        A, n, m = to_dense(g.graph, T; num_nodes = g.num_nodes, weighted)
-    else
-        A, n, m = to_sparse(g.graph, T; num_nodes = g.num_nodes, weighted)
-    end
+    A, n, m = to_sparse(g.graph, T; num_nodes = g.num_nodes, weighted)
     @assert size(A) == (n, n)
     return dir == :out ? A : A'
 end
diff --git a/GNNGraphs/src/transform.jl b/GNNGraphs/src/transform.jl
index ce8d90b6a..1c8cffaa9 100644
--- a/GNNGraphs/src/transform.jl
+++ b/GNNGraphs/src/transform.jl
@@ -148,7 +148,7 @@ end
 """
     coalesce(g::GNNGraph; aggr=+)
 
-Return a new GNNGraph where all multiple edges between the same pair of nodes are merged (using aggr for edge weights and features), and the edge indices are sorted lexicographically (by source, then target).
+Return a new GNNGraph where all multiple edges between the same pair of nodes are merged (using aggr for edge weights and features), and the edge indices are sorted lexicographically (by target, then by source).
 This method is only applicable to graphs of type `:coo`.
 
 `aggr` can take value `+`,`min`, `max` or `mean`.
@@ -158,7 +158,8 @@ function Base.coalesce(g::GNNGraph{<:COO_T}; aggr = +)
     w = get_edge_weight(g)
     edata = g.edata
     num_edges = g.num_edges
-    idxs, idxmax = edge_encoding(s, t, g.num_nodes)
+    # Order by target first, then by source, as a workaround for CUDA.jl issue https://github.com/JuliaGPU/CUDA.jl/issues/2820
+    idxs, idxmax = edge_encoding(t, s, g.num_nodes)
 
     perm = sortperm(idxs)
     idxs = idxs[perm]
diff --git a/GNNGraphs/test/gnngraph.jl b/GNNGraphs/test/gnngraph.jl
index 2b18fe7b7..1d6d27b77 100644
--- a/GNNGraphs/test/gnngraph.jl
+++ b/GNNGraphs/test/gnngraph.jl
@@ -99,13 +99,14 @@ end
                 mat_gpu = adjacency_matrix(g_gpu)
                 @test mat_gpu isa AbstractMatrix{Int}
                 @test get_device(mat_gpu) isa AbstractGPUDevice
-                @test Array(mat_gpu) == adj_mat
+                # Convert to Float32 first, because CUSPARSE has poor Int support and throws an error otherwise
+                @test Array(Float32.(mat_gpu)) == Float32.(adj_mat)
             end
         end
 
         @testset "normalized_laplacian" begin
             mat = normalized_laplacian(g)
-            if TEST_GPU && !(dev isa MetalDevice) && GRAPH_T != :sparse
+            if TEST_GPU && !(dev isa MetalDevice) && GRAPH_T != :sparse && GRAPH_T != :coo
                 mat_gpu = normalized_laplacian(g_gpu)
                 @test mat_gpu isa AbstractMatrix{Float32}
                 @test get_device(mat_gpu)isa AbstractGPUDevice
@@ -114,7 +115,7 @@ end
         end
 
         @testset "scaled_laplacian" begin 
-            if TEST_GPU && !(dev isa MetalDevice) && GRAPH_T != :sparse
+            if TEST_GPU && !(dev isa MetalDevice) && GRAPH_T != :sparse && GRAPH_T != :coo
                 mat = scaled_laplacian(g)
                 mat_gpu = scaled_laplacian(g_gpu)
                 @test mat_gpu isa AbstractMatrix{Float32}
diff --git a/GNNGraphs/test/transform.jl b/GNNGraphs/test/transform.jl
index 256d851bf..fb7e95bf4 100644
--- a/GNNGraphs/test/transform.jl
+++ b/GNNGraphs/test/transform.jl
@@ -456,8 +456,10 @@ end
 
             s2, t2 = edge_index(g2)
             w2 = get_edge_weight(g2)
-            @test s2 == [1, 2, 2, 3, 3, 4, 4]
-            @test t2 == [2, 1, 3, 2, 4, 3, 4]
+            # `coalesce` sorts the edges by target first and then by source,
+            # so the expected edge order is lexicographic in (t, s) rather than in (s, t).
+            @test s2 == [2, 1, 3, 2, 4, 3, 4]
+            @test t2 == [1, 2, 2, 3, 3, 4, 4]
             @test w2 == [1, 1, 2, 2, 3.5, 3.5, 5]
             @test g2.edata.e == [10.0, 10.0, 20.0, 20.0, 35.0, 35.0, 50.0]
         end 
diff --git a/GNNlib/ext/GNNlibCUDAExt.jl b/GNNlib/ext/GNNlibCUDAExt.jl
index 56a6738e9..f745d51da 100644
--- a/GNNlib/ext/GNNlibCUDAExt.jl
+++ b/GNNlib/ext/GNNlibCUDAExt.jl
@@ -3,7 +3,10 @@ module GNNlibCUDAExt
 using CUDA
 using Random, Statistics, LinearAlgebra
 using GNNlib: GNNlib, propagate, copy_xj, e_mul_xj, w_mul_xj
-using GNNGraphs: GNNGraph, COO_T, SPARSE_T
+using GNNGraphs: GNNGraph, COO_T, SPARSE_T, to_dense, to_sparse
+using ChainRulesCore: @non_differentiable
+
+const CUDA_COO_T = Tuple{T, T, V} where {T <: AnyCuArray{<:Integer}, V <: Union{Nothing, AnyCuArray}}
 
 ###### PROPAGATE SPECIALIZATIONS ####################
 
@@ -12,7 +15,9 @@ using GNNGraphs: GNNGraph, COO_T, SPARSE_T
 ## avoid the fast path on gpu until we have better cuda support
 function GNNlib.propagate(::typeof(copy_xj), g::GNNGraph{<:COO_T}, ::typeof(+),
         xi, xj::AnyCuMatrix, e)
-    propagate((xi, xj, e) -> copy_xj(xi, xj, e), g, +, xi, xj, e)
+    A = _adjacency_matrix(g, eltype(xj); weighted = false)
+    # copy_xj with + aggregation: return the product of xj with the unweighted adjacency matrix
+    return xj * A
 end
 
 ## E_MUL_XJ 
@@ -42,4 +47,21 @@ end
 
 # Flux.Zygote.@nograd compute_degree
 
+## CUSTOM ADJACENCY_MATRIX IMPLEMENTATION FOR CUDA COO GRAPHS: returns a dense matrix when the graph is not coalesced, which is more efficient
+
+function _adjacency_matrix(g::GNNGraph{<:CUDA_COO_T}, T::DataType = eltype(g); dir = :out,
+                                 weighted = true)
+    # Sparse matrix for coalesced graphs; dense fallback otherwise, since building the sparse
+    # matrix is slow in that case. Revisit after https://github.com/JuliaGPU/CUDA.jl/issues/1113
+    A, n, _ = if g.is_coalesced
+        to_sparse(g.graph, T; num_nodes = g.num_nodes, weighted, is_coalesced = true)
+    else
+        to_dense(g.graph, T; num_nodes = g.num_nodes, weighted)
+    end
+    @assert size(A) == (n, n)
+    return dir == :out ? A : A'
+end
+
+@non_differentiable _adjacency_matrix(x...)
+
 end #module
diff --git a/GNNlib/test/test_module.jl b/GNNlib/test/test_module.jl
index b6894cdfa..075881af8 100644
--- a/GNNlib/test/test_module.jl
+++ b/GNNlib/test/test_module.jl
@@ -150,7 +150,7 @@ function test_gradients(
     return true
 end
 
-function generate_test_graphs(graph_type)
+function generate_test_graphs(graph_type; do_coalesce=false)
     adj1 = [0 1 0 1
             1 0 1 0
             0 1 0 1
@@ -168,12 +168,18 @@ function generate_test_graphs(graph_type)
     g_single_vertex = GNNGraph(adj_single_vertex,
                                 ndata = rand(Float32, D_IN, 4);
                                 graph_type)
+    
+    if graph_type == :coo && do_coalesce
+        g1 = coalesce(g1)
+        g_single_vertex = coalesce(g_single_vertex)
+    end
 
     return (g1, g_single_vertex)
 end
 
 GRAPH_TYPES = [:coo, :dense, :sparse]
 TEST_GRAPHS = [generate_test_graphs(:coo)...,
+               generate_test_graphs(:coo, do_coalesce=true)...,
                generate_test_graphs(:dense)...,
                generate_test_graphs(:sparse)...]
 
diff --git a/GraphNeuralNetworks/test/layers/conv.jl b/GraphNeuralNetworks/test/layers/conv.jl
index 16e9b2fd5..97cc7a355 100644
--- a/GraphNeuralNetworks/test/layers/conv.jl
+++ b/GraphNeuralNetworks/test/layers/conv.jl
@@ -108,7 +108,7 @@ end
         
         if gpu_backend() == "AMDGPU"
             broken = true
-        elseif gpu_backend() == "CUDA" && get_graph_type(g) == :sparse
+        elseif gpu_backend() == "CUDA" && get_graph_type(g) in [:coo, :sparse]
             broken = true
         else
             broken = false
diff --git a/GraphNeuralNetworks/test/test_module.jl b/GraphNeuralNetworks/test/test_module.jl
index 8f7a0446b..74c25a555 100644
--- a/GraphNeuralNetworks/test/test_module.jl
+++ b/GraphNeuralNetworks/test/test_module.jl
@@ -157,7 +157,7 @@ function test_gradients(
 end
 
 
-function generate_test_graphs(graph_type)
+function generate_test_graphs(graph_type; do_coalesce=false)
     adj1 = [0 1 0 1
             1 0 1 0
             0 1 0 1
@@ -175,12 +175,18 @@ function generate_test_graphs(graph_type)
     g_single_vertex = GNNGraph(adj_single_vertex,
                                 ndata = rand(Float32, D_IN, 4);
                                 graph_type)
+    
+    if graph_type == :coo && do_coalesce
+        g1 = coalesce(g1)
+        g_single_vertex = coalesce(g_single_vertex)
+    end
 
     return (g1, g_single_vertex)
 end
 
 GRAPH_TYPES = [:coo, :dense, :sparse]
 TEST_GRAPHS = [generate_test_graphs(:coo)...,
+               generate_test_graphs(:coo, do_coalesce=true)...,
                generate_test_graphs(:dense)...,
                generate_test_graphs(:sparse)...]