
Commit 96df778

improve degree and various fixes
1 parent d409d90 commit 96df778

File tree

10 files changed, +157 -78 lines changed


Project.toml

Lines changed: 1 addition & 0 deletions

@@ -18,6 +18,7 @@ MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
+ProfileView = "c46f51b8-102a-5cf2-8d2c-8597cb0e0da7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

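The only change here is the new ProfileView dependency, presumably added to profile the layers while working on performance. A minimal usage sketch (illustrative only, not part of the diff):

using ProfileView
# Collect a profile of the call and open an interactive flame graph.
ProfileView.@profview [sum(rand(10^4)) for _ in 1:10^3]
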
perf/perf.jl

Lines changed: 17 additions & 5 deletions

@@ -15,15 +15,27 @@ function run_single_benchmark(N, c, D, CONV; gtype=:lg)
     g_gpu = g |> gpu
 
     m = CONV(D => D)
+    ps = Flux.params(m)
+
     m_gpu = m |> gpu
+    ps_gpu = Flux.params(m_gpu)
+
 
     res = Dict()
-    res["CPU"] = @benchmark $m($g)
+
+    res["CPU_FWD"] = @benchmark $m($g)
+    res["CPU_GRAD"] = @benchmark gradient(() -> sum($m($g).ndata.x), $ps)
 
-    try [GCNConv, GraphConv, GATConv]
-        res["GPU"] = @benchmark CUDA.@sync($m_gpu($g_gpu)) teardown=(GC.gc(); CUDA.reclaim())
+    try
+        res["GPU_FWD"] = @benchmark CUDA.@sync($m_gpu($g_gpu)) teardown=(GC.gc(); CUDA.reclaim())
+    catch
+        res["GPU_FWD"] = missing
+    end
+
+    try
+        res["GPU_GRAD"] = @benchmark CUDA.@sync(gradient(() -> sum($m_gpu($g_gpu).ndata.x), $ps_gpu)) teardown=(GC.gc(); CUDA.reclaim())
     catch
-        res["GPU"] = missing
+        res["GPU_GRAD"] = missing
    end
 
    return res
@@ -45,7 +57,7 @@ function run_benchmarks(;
     Ns = [10, 100, 1000, 10000],
     c = 6,
     D = 100,
-    layers = [GCNConv, GraphConv, GATConv],
+    layers = [GCNConv, GATConv],
     gtypes = [:coo, :sparse, :dense],
 )
 

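The benchmark now times the forward pass and the gradient separately, on CPU and GPU, using Flux's implicit parameters. A minimal sketch of the same pattern with a plain Dense layer standing in for CONV(D => D) (the model and input below are placeholders, assuming a recent Flux and BenchmarkTools):

using Flux, BenchmarkTools

m  = Dense(16 => 16)             # stand-in for CONV(D => D)
x  = randn(Float32, 16, 32)      # stand-in for the graph features
ps = Flux.params(m)              # implicit parameters, as in run_single_benchmark

fwd  = @benchmark $m($x)                            # forward only
grad = @benchmark gradient(() -> sum($m($x)), $ps)  # forward + backward
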
src/gnngraph.jl

Lines changed: 14 additions & 14 deletions

@@ -102,7 +102,7 @@ struct GNNGraph{T<:Union{COO_T,ADJMAT_T}}
     num_nodes::Int
     num_edges::Int
     num_graphs::Int
-    graph_indicator
+    graph_indicator # vector of ints or nothing
     ndata::NamedTuple
     edata::NamedTuple
     gdata::NamedTuple
@@ -216,10 +216,12 @@ s, t = edge_index(g)
 """
 edge_index(g::GNNGraph{<:COO_T}) = g.graph[1:2]
 
-edge_index(g::GNNGraph{<:ADJMAT_T}) = to_coo(g.graph)[1][1:2]
+edge_index(g::GNNGraph{<:ADJMAT_T}) = to_coo(g.graph, num_nodes=g.num_nodes)[1][1:2]
 
 edge_weight(g::GNNGraph{<:COO_T}) = g.graph[3]
 
+edge_weight(g::GNNGraph{<:ADJMAT_T}) = to_coo(g.graph, num_nodes=g.num_nodes)[1][3]
+
 LightGraphs.edges(g::GNNGraph) = zip(edge_index(g)...)
 
 LightGraphs.edgetype(g::GNNGraph) = Tuple{Int, Int}
@@ -278,6 +280,7 @@ end
 
 function LightGraphs.adjacency_matrix(g::GNNGraph{<:COO_T}, T::DataType=Int; dir=:out)
     if g.graph[1] isa CuVector
+        # TODO revisit after https://github.com/JuliaGPU/CUDA.jl/pull/1152
         A, n, m = to_dense(g.graph, T, num_nodes=g.num_nodes)
     else
         A, n, m = to_sparse(g.graph, T, num_nodes=g.num_nodes)
@@ -293,17 +296,18 @@ function LightGraphs.adjacency_matrix(g::GNNGraph{<:ADJMAT_T}, T::DataType=eltyp
     return dir == :out ? A : A'
 end
 
-function LightGraphs.degree(g::GNNGraph{<:COO_T}, T=Int; dir=:out)
+function LightGraphs.degree(g::GNNGraph{<:COO_T}, T=nothing; dir=:out)
     s, t = edge_index(g)
+    T = isnothing(T) ? eltype(s) : T
     degs = fill!(similar(s, T, g.num_nodes), 0)
-    o = fill!(similar(s, Int, g.num_edges), 1)
+    src = 1
     if dir ∈ [:out, :both]
-        NNlib.scatter!(+, degs, o, s)
+        NNlib.scatter!(+, degs, src, s)
     end
     if dir ∈ [:in, :both]
-        NNlib.scatter!(+, degs, o, t)
+        NNlib.scatter!(+, degs, src, t)
     end
-    return degs
+    return degs
 end
 
 function LightGraphs.degree(g::GNNGraph{<:ADJMAT_T}, T=Int; dir=:out)
@@ -318,6 +322,7 @@ function LightGraphs.laplacian_matrix(g::GNNGraph, T::DataType=Int; dir::Symbol=
     return D - A
 end
 
+
 """
     normalized_laplacian(g, T=Float32; add_self_loops=false, dir=:out)
 
@@ -406,14 +411,15 @@ end
 function add_self_loops(g::GNNGraph{<:ADJMAT_T})
     A = g.graph
     @assert g.edata === (;)
+    num_edges = g.num_edges + g.num_nodes
     A = A + I
-    num_edges = g.num_edges + g.num_nodes
     GNNGraph(A,
              g.num_nodes, num_edges, g.num_graphs,
              g.graph_indicator,
              g.ndata, g.edata, g.gdata)
 end
 
+
 function remove_self_loops(g::GNNGraph{<:COO_T})
     s, t = edge_index(g)
     # TODO remove these constraints
@@ -572,9 +578,3 @@ end
 @non_differentiable degree(x...)
 @non_differentiable add_self_loops(x...) # TODO this is wrong, since g carries feature arrays, needs rrule
 @non_differentiable remove_self_loops(x...) # TODO this is wrong, since g carries feature arrays, needs rrule
-
-# # delete when https://github.com/JuliaDiff/ChainRules.jl/pull/472 is merged
-# function ChainRulesCore.rrule(::typeof(copy), x)
-#     copy_pullback(ȳ) = (NoTangent(), ȳ)
-#     return copy(x), copy_pullback
-# end

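The degree rewrite drops the per-edge vector of ones and instead scatters the scalar 1 over the source (or target) indices, relying on the new scalar-src scatter! methods added in src/utils.jl below; the element type now also defaults to the eltype of the edge index rather than Int. The idea in isolation, as a minimal dependency-free sketch:

# Out-degree = how many times each node appears as an edge source.
s = [1, 1, 2, 3, 3, 3]      # source node of each edge
degs = zeros(Int, 4)        # one counter per node, 4 nodes in total
for i in s
    degs[i] += 1            # what NNlib.scatter!(+, degs, 1, s) does element-wise
end
@assert degs == [2, 1, 3, 0]
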
src/graph_conversions.jl

Lines changed: 11 additions & 0 deletions

@@ -12,6 +12,17 @@ function to_coo(coo::COO_T; dir=:out, num_nodes=nothing)
     return coo, num_nodes, num_edges
 end
 
+function to_coo(A::SPARSE_T; dir=:out, num_nodes=nothing)
+    s, t, v = findnz(A)
+    if dir == :in
+        s, t = t, s
+    end
+    num_nodes = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes
+    num_edges = length(s)
+
+    return (s, t, nothing), num_nodes, num_edges
+end
+
 function to_coo(A::ADJMAT_T; dir=:out, num_nodes=nothing)
     nz = findall(!=(0), A) # vec of cartesian indexes
     s, t = ntuple(i -> map(t->t[i], nz), 2)

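The new SPARSE_T method maps a sparse adjacency matrix straight to COO form: findnz returns the row indices, column indices, and stored values, which become sources, targets, and (discarded) weights. A quick sketch using only SparseArrays:

using SparseArrays

# Adjacency matrix of the directed cycle 1→2→3→1, rows as sources, columns as targets.
A = sparse([1, 2, 3], [2, 3, 1], [1, 1, 1], 3, 3)
s, t, v = findnz(A)    # row indices, column indices, stored values (in column-major order)
@assert sort(collect(zip(s, t))) == [(1, 2), (2, 3), (3, 1)]
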
src/msgpass.jl

Lines changed: 16 additions & 7 deletions

@@ -146,16 +146,25 @@ copyxj(xi, xj, e) = xj
 # ximulxj(xi, xj, e) = xi .* xj
 # xiaddxj(xi, xj, e) = xi .+ xj
 
-# function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(+), xi, xj::AbstractMatrix, e)
-#     A = adjacency_matrix(g)
-#     return xj * A
-# end
+
+function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(+), xi, xj::AbstractMatrix, e)
+    A = adjacency_matrix(g)
+    return xj * A
+end
+
+## avoid the fast path on gpu until we have better cuda support
+function propagate(::typeof(copyxj), g::GNNGraph{<:Union{COO_T,SPARSE_T}}, ::typeof(+), xi, xj::AnyCuMatrix, e)
+    propagate((xi,xj,e)->copyxj(xi,xj,e), g, +, xi, xj, e)
+end
 
 # function propagate(::typeof(copyxj), g::GNNGraph, ::typeof(mean), xi, xj::AbstractMatrix, e)
 #     A = adjacency_matrix(g)
-#     degs = vec(sum(A; dims=2))
-#     D = Diagonal(ofeltype(xj, 1) ./ degs)
-#     # A, D = _aa(g, xj)
+#     D = compute_degree(A)
 #     return xj * A * D
 # end
 
+# # Zygote bug. Error with sparse matrix without nograd
+# compute_degree(A) = Diagonal(1f0 ./ vec(sum(A; dims=2)))
+
+# Flux.Zygote.@nograd compute_degree
+

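The re-enabled fast path works because copying each neighbor's features and sum-aggregating them is exactly a matrix product: column i of xj * A is the sum of xj[:, j] over all j with A[j, i] = 1, i.e. over the in-neighbors of i. The second method routes CuArray inputs back to the generic gather/scatter implementation, as the comment in the diff notes. A small sketch of the identity, independent of the package:

# A[j, i] = 1 if there is an edge j → i (the dir = :out convention of adjacency_matrix).
X = [1.0 2.0 3.0;      # 2×3 feature matrix, one column per node
     4.0 5.0 6.0]
A = [0 1 0;            # edges 1→2, 2→3, 3→1
     0 0 1;
     1 0 0]
# Message into node i = sum of X[:, j] over the in-neighbors j of i.
agg = hcat([sum(X[:, j] for j in 1:3 if A[j, i] == 1) for i in 1:3]...)
@assert agg == X * A
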
src/utils.jl

Lines changed: 65 additions & 1 deletion

@@ -68,5 +68,69 @@ function normalize_graphdata(data::NamedTuple; default_name, n, duplicate_if_nee
     return data
 end
 
+ofeltype(x, y) = convert(float(eltype(x)), y)
 
-ofeltype(x, y) = convert(float(eltype(x)), y)
+# TODO move to flux. fix for https://github.com/FluxML/Flux.jl/issues/1720
+Flux._cpu_array(x::AbstractSparseArray) = Flux.adapt(SparseMatrixCSC, x)
+
+# TODO. FIX THIS HACK. CUDA.jl support for sparse matrices is very bad, convert to dense
+# Revisit after https://github.com/JuliaGPU/CUDA.jl/pull/1152
+Flux._gpu_array(x::AbstractSparseArray) = CuMatrix(x)
+
+
+# Considers the src a zero dimensional object.
+# Useful for implementing `StatsBase.counts`, `degree`, etc...
+# function NNlib.scatter!(op, dst::AbstractArray, src::Number, idx::AbstractArray)
+#     for k in CartesianIndices(idx)
+#         # dst_v = NNlib._view(dst, idx[k])
+#         # dst_v .= (op).(dst_v, src)
+#         dst[idx[k]] .= (op).(dst[idx[k]], src)
+#     end
+#     dst
+# end
+
+# 10 times faster than the generic version above.
+# All the speedup comes from not broadcasting `op`, I don't know why.
+function NNlib.scatter!(op, dst::AbstractVector, src::Number, idx::AbstractVector{<:Integer})
+    for i in idx
+        dst[i] = op(dst[i], src)
+    end
+end
+
+# NNlib._view(X, k) = view(X, k...)
+# NNlib._view(X, k::Union{Integer, CartesianIndex}) = view(X, k)
+
+# Considers src as a zero dimensional object to be scattered
+# function NNlib.scatter(op,
+#                        src::Tsrc,
+#                        idx::AbstractArray{Tidx,Nidx};
+#                        init = nothing, dstsize = nothing) where {Tsrc<:Number,Tidx,Nidx}
+
+#     dstsz = isnothing(dstsize) ? maximum_dims(idx) : dstsize
+#     dst = similar(src, Tsrc, dstsz)
+#     xinit = isnothing(init) ? scatter_empty(op, Tsrc) : init
+#     fill!(dst, xinit)
+#     scatter!(op, dst, src, idx)
+# end
+
+
+function scatter_scalar_kernel!(op, dst, src, idx)
+    index = threadIdx().x + (blockIdx().x - 1) * blockDim().x
+
+    @inbounds if index <= length(idx)
+        CUDA.@atomic dst[idx[index]...] = op(dst[idx[index]...], src)
+    end
+    return nothing
+end
+
+function NNlib.scatter!(op, dst::AnyCuArray, src::Number, idx::AnyCuArray)
+    max_idx = length(idx)
+    args = op, dst, src, idx
+
+    kernel = @cuda launch=false scatter_scalar_kernel!(args...)
+    config = launch_configuration(kernel.fun; max_threads=256)
+    threads = min(max_idx, config.threads)
+    blocks = cld(max_idx, threads)
+    kernel(args...; threads=threads, blocks=blocks)
+    return dst
+end

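The CUDA method follows the standard CUDA.jl occupancy pattern: compile with @cuda launch=false, ask launch_configuration for a thread count, then launch with threads/blocks derived from the problem size. A hedged, self-contained sketch of the same pattern on a toy kernel (all names below are illustrative and assume a functional GPU):

using CUDA

# Atomically add `val` into dst at every position listed in idx.
function add_at_kernel!(dst, val, idx)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    @inbounds if i <= length(idx)
        CUDA.@atomic dst[idx[i]] += val
    end
    return nothing
end

dst = CUDA.zeros(Int32, 4)
idx = cu(Int32[1, 1, 2, 4, 4, 4])
kernel  = @cuda launch=false add_at_kernel!(dst, Int32(1), idx)
config  = launch_configuration(kernel.fun)   # occupancy-based suggestion
threads = min(length(idx), config.threads)
blocks  = cld(length(idx), threads)
kernel(dst, Int32(1), idx; threads, blocks)
@assert Array(dst) == [2, 1, 0, 3]           # counts of 1, 2, 3, 4 in idx
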
test/cuda/gnngraph.jl

Lines changed: 7 additions & 0 deletions

@@ -32,6 +32,13 @@ const ACUMatrix{T} = Union{CuMatrix{T}, CUDA.CUSPARSE.CuSparseMatrix{T}}
         @test Array(mat_gpu) == mat
     end
 
+    @testset "degree" begin
+        d = degree(g)
+        d_gpu = degree(g_gpu)
+        @test d_gpu isa CuVector
+        @test Array(d_gpu) == d
+    end
+
     @testset "scaled_laplacian" begin
         @test_broken begin
             mat = scaled_laplacian(g)

test/examples/node_classification_cora.jl

Lines changed: 25 additions & 17 deletions

@@ -65,7 +65,7 @@ function train(Layer; verbose=false, kws...)
     end
 
     verbose && report(0)
-    for epoch in 1:args.epochs
+    @time for epoch in 1:args.epochs
         gs = Flux.gradient(ps) do
             ŷ = model(g, X)
             logitcrossentropy(ŷ[:,train_ids], ytrain)
@@ -79,21 +79,29 @@ function train(Layer; verbose=false, kws...)
     return train_res, test_res
 end
 
-for (layer, Layer) in [
-        ("GCNConv", (nin, nout) -> GCNConv(nin => nout, relu)),
-        ("GraphConv", (nin, nout) -> GraphConv(nin => nout, relu, aggr=mean)),
-        ("SAGEConv", (nin, nout) -> SAGEConv(nin => nout, relu)),
-        ("GATConv", (nin, nout) -> GATConv(nin => nout, relu)),
-        ("GINConv", (nin, nout) -> GINConv(Dense(nin, nout, relu), 0.01, aggr=mean)),
-        ("ChebConv", (nin, nout) -> ChebConv(nin => nout, 2)),
-        ("ResGatedGraphConv", (nin, nout) -> ResGatedGraphConv(nin => nout, relu)),
-        # (nin, nout) -> NNConv(nin => nout), # needs edge features
-        # (nin, nout) -> GatedGraphConv(nout, 2), # needs nin = nout
-        # (nin, nout) -> EdgeConv(Dense(2nin, nout, relu)), # Fits the training set but does not generalize well
-    ]
+function train_many(; usecuda=false)
+    for (layer, Layer) in [
+            ("GCNConv", (nin, nout) -> GCNConv(nin => nout, relu)),
+            ("ResGatedGraphConv", (nin, nout) -> ResGatedGraphConv(nin => nout, relu)),
+            ("GraphConv", (nin, nout) -> GraphConv(nin => nout, relu, aggr=mean)),
+            ("SAGEConv", (nin, nout) -> SAGEConv(nin => nout, relu)),
+            ("GATConv", (nin, nout) -> GATConv(nin => nout, relu)),
+            ("GINConv", (nin, nout) -> GINConv(Dense(nin, nout, relu), 0.01, aggr=mean)),
+            ## ("ChebConv", (nin, nout) -> ChebConv(nin => nout, 2)), # not working on gpu
+            ## ("NNConv", (nin, nout) -> NNConv(nin => nout)), # needs edge features
+            ## ("GatedGraphConv", (nin, nout) -> GatedGraphConv(nout, 2)), # needs nin = nout
+            ## ("EdgeConv",(nin, nout) -> EdgeConv(Dense(2nin, nout, relu))), # Fits the training set but does not generalize well
+        ]
 
-    @show layer
-    @time train_res, test_res = train(Layer, verbose=false)
-    @test train_res.acc > 95
-    @test test_res.acc > 70
+        @show layer
+        @time train_res, test_res = train(Layer; usecuda, verbose=false)
+        @test train_res.acc > 94
+        @test test_res.acc > 70
+    end
+end
+
+## if GRAPH_T != :dense # some erratic errors with :dense
+train_many(usecuda=false)
+if TEST_GPU
+    train_many(usecuda=true)
 end

test/msgpass.jl

Lines changed: 0 additions & 33 deletions

@@ -57,37 +57,4 @@
 
     @test m.a == ones(out_channel, num_E)
 end
-
-
-# @testset "NamedTuples" begin
-#     struct NewLayerNT{G}
-#         W
-#     end
-
-#     NewLayerNT(in, out) = NewLayerNT{GRAPH_T}(randn(T, out, in))
-
-#     function GraphNeuralNetworks.compute_message(l::NewLayerNT{GRAPH_T}, di, dj, dij)
-#         a = l.W * (di.x .+ dj.x .+ dij.e)
-#         b = l.W * di.x
-#         return (; a, b)
-#     end
-#     function GraphNeuralNetworks.update_node(l::NewLayerNT{GRAPH_T}, m, d)
-#         return (α=l.W * d.x + m.a + m.b, β=m)
-#     end
-#     function GraphNeuralNetworks.update_edge(l::NewLayerNT{GRAPH_T}, m, e)
-#         return m.a
-#     end
-
-#     function (::NewLayerNT{GRAPH_T})(g, x, e)
-#         x, e = propagate(l, g, mean, (; x), (; e))
-#         return x.α .+ x.β.a, e
-#     end
-
-#     l = NewLayerNT(in_channel, out_channel)
-#     g = GNNGraph(adj, graph_type=GRAPH_T)
-#     X′, E′ = l(g, X, E)
-
-#     @test size(X′) == (out_channel, num_V)
-#     @test size(E′) == (out_channel, num_E)
-# end
 end

test/runtests.jl

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ tests = [
 @testset "GraphNeuralNetworks: graph format $graph_type" for graph_type in (:coo,:sparse,:dense)
 
     global GRAPH_T = graph_type
-    global TEST_GPU = CUDA.functional() && GRAPH_T != :sparse
+    global TEST_GPU = CUDA.functional()# && GRAPH_T != :sparse
 
     for t in tests
         include("$t.jl")
