
Commit 0273b25

add Parallel support in GNNChain
1 parent 3585cca commit 0273b25

File tree

10 files changed

+260 -140 lines changed


Project.toml

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ authors = ["Carlo Lucibello and contributors"]
 version = "0.1.1"
 
 [deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"

docs/src/models.md

Lines changed: 17 additions & 2 deletions
@@ -78,11 +78,26 @@ X = randn(Float32, din, 10)
 model = GNNChain(GCNConv(din => d),
                  BatchNorm(d),
                  x -> relu.(x),
-                 GraphConv(d => d, relu),
+                 GCNConv(d => d, relu),
                  Dropout(0.5),
                  Dense(d, dout))
 
-y = model(g, X)
+y = model(g, X)  # output size: (dout, g.num_nodes)
 ```
 
 The `GNNChain` only propagates the graph and the node features. More complex scenarios, e.g. when edge features are also updated, have to be handled using the explicit definition of the forward pass.
+
+A `GNNChain` conveniently propagates the graph into the branches created by the `Flux.Parallel` layer:
+
+```julia
+AddResidual(l) = Parallel(+, identity, l)
+
+model = GNNChain(AddResidual(ResGatedGraphConv(din => d, relu)),
+                 BatchNorm(d),
+                 AddResidual(ResGatedGraphConv(d => d, relu)),
+                 BatchNorm(d),
+                 GlobalPooling(mean),
+                 Dense(d, dout))
+
+y = model(g, X)  # output size: (dout, g.num_graphs)
+```
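For orientation, here is a self-contained sketch of running the residual model from the new docs end to end. It is only a sketch under stated assumptions: the graph construction via `random_regular_graph` (LightGraphs), the dimensions `din, d, dout, n`, and the single-graph setup are illustrative and not part of the commit; `GlobalPooling(mean)` is taken verbatim from the docs snippet above.

```julia
# Sketch, assuming GraphNeuralNetworks.jl at this commit, Flux, LightGraphs, Statistics.
using GraphNeuralNetworks, Flux, LightGraphs, Statistics

din, d, dout, n = 3, 4, 2, 10
g = GNNGraph(random_regular_graph(n, 4))    # a single graph, so g.num_graphs == 1
X = randn(Float32, din, n)                  # node features, one column per node

AddResidual(l) = Parallel(+, identity, l)   # skip connection around a GNN layer

model = GNNChain(AddResidual(ResGatedGraphConv(din => d, relu)),
                 BatchNorm(d),
                 AddResidual(ResGatedGraphConv(d => d, relu)),
                 BatchNorm(d),
                 GlobalPooling(mean),        # graph-level readout, as in the docs
                 Dense(d, dout))

y = model(g, X)
@assert size(y) == (dout, g.num_graphs)
```

Since every `Parallel` branch receives the same `(g, x)` pair, `AddResidual(l)` computes `x .+ l(g, x)`, i.e. a residual connection around the graph layer.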

src/GraphNeuralNetworks.jl

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ export
     GINConv,
     GraphConv,
     NNConv,
+    ResGatedGraphConv,
     SAGEConv,
 
     # layers/pool

src/layers/basic.jl

Lines changed: 9 additions & 0 deletions
@@ -63,6 +63,15 @@ Flux.functor(::Type{<:GNNChain}, c) = c.layers, ls -> GNNChain(ls...)
 applylayer(l, g::GNNGraph, x) = l(x)
 applylayer(l::GNNLayer, g::GNNGraph, x) = l(g, x)
 
+# Handle Flux.Parallel
+applylayer(l::Parallel, g::GNNGraph, x::AbstractArray) = mapreduce(f -> applylayer(f, g, x), l.connection, l.layers)
+applylayer(l::Parallel, g::GNNGraph, xs::Vararg{<:AbstractArray}) = mapreduce((f, x) -> applylayer(f, g, x), l.connection, l.layers, xs)
+applylayer(l::Parallel, g::GNNGraph, xs::Tuple) = applylayer(l, g, xs...)
+
 applychain(::Tuple{}, g::GNNGraph, x) = x
 applychain(fs::Tuple, g::GNNGraph, x) = applychain(tail(fs), g, applylayer(first(fs), g, x))
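The dispatch added above forwards the same graph `g` to every branch of a `Flux.Parallel` layer and folds the branch outputs with `l.connection`. A rough standalone illustration of that `mapreduce` pattern (plain Flux, no graphs involved; `ToyGraph` and `apply_branch` are made-up stand-ins for `GNNGraph` and `applylayer`):

```julia
using Flux

struct ToyGraph end                     # stand-in for GNNGraph
apply_branch(f, g, x) = f(x)            # stand-in for applylayer(f, g, x)

p = Parallel(+, x -> 2 .* x, x -> x .+ 1)
g, x = ToyGraph(), Float32[1, 2, 3]

# the same fold the new method performs: apply each branch, combine with p.connection
y = mapreduce(f -> apply_branch(f, g, x), p.connection, p.layers)
@assert y == (2 .* x) .+ (x .+ 1)
```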

src/layers/conv.jl

Lines changed: 75 additions & 1 deletion
@@ -224,7 +224,7 @@ with ``z_i`` a normalization factor.
 
 - `in`: The dimension of input features.
 - `out`: The dimension of output features.
-- `bias::Bool`: Keyword argument, whether to learn the additive bias.
+- `bias`: Learn the additive bias if true.
 - `heads`: Number of attention heads.
 - `concat`: Concatenate layer output or not. If not, layer output is averaged over the heads.
 - `negative_slope`: The parameter of LeakyReLU.
@@ -572,3 +572,77 @@ function Base.show(io::IO, l::SAGEConv)
     print(io, ", aggr=", l.aggr)
     print(io, ")")
 end
+
+
+@doc raw"""
+    ResGatedGraphConv(in => out, act=identity; init=glorot_uniform, bias=true)
+
+The residual gated graph convolutional operator from the [Residual Gated Graph ConvNets](https://arxiv.org/abs/1711.07553) paper.
+
+The layer's forward pass is given by
+
+```math
+\mathbf{x}_i' = act\big(U\mathbf{x}_i + \sum_{j \in N(i)} \eta_{ij} V \mathbf{x}_j\big),
+```
+where the edge gates ``\eta_{ij}`` are given by
+
+```math
+\eta_{ij} = sigmoid(A\mathbf{x}_i + B\mathbf{x}_j).
+```
+
+# Arguments
+
+- `in`: The dimension of input features.
+- `out`: The dimension of output features.
+- `act`: Activation function.
+- `init`: Weight matrices' initializing function.
+- `bias`: Learn an additive bias if true.
+"""
+struct ResGatedGraphConv <: GNNLayer
+    A
+    B
+    U
+    V
+    bias
+    σ
+end
+
+@functor ResGatedGraphConv
+
+function ResGatedGraphConv(ch::Pair{Int,Int}, σ=identity;
+                           init=glorot_uniform, bias::Bool=true)
+    in, out = ch
+    A = init(out, in)
+    B = init(out, in)
+    U = init(out, in)
+    V = init(out, in)
+    b = bias ? Flux.create_bias(A, true, out) : false
+    return ResGatedGraphConv(A, B, U, V, b, σ)
+end
+
+function compute_message(l::ResGatedGraphConv, di, dj)
+    η = sigmoid.(di.Ax .+ dj.Bx)
+    return η .* dj.Vx
+end
+
+update_node(l::ResGatedGraphConv, m, x) = m
+
+function (l::ResGatedGraphConv)(g::GNNGraph, x::AbstractMatrix)
+    check_num_nodes(g, x)
+
+    Ax = l.A * x
+    Bx = l.B * x
+    Vx = l.V * x
+
+    m, _ = propagate(l, g, +, (; Ax, Bx, Vx))
+
+    return l.σ.(l.U*x .+ m .+ l.bias)
+end
+
+
+function Base.show(io::IO, l::ResGatedGraphConv)
+    out_channel, in_channel = size(l.A)
+    print(io, "ResGatedGraphConv(", in_channel, "=>", out_channel)
+    l.σ == identity || print(io, ", ", l.σ)
+    print(io, ")")
+end
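A brief usage sketch for the new layer; the graph construction (`random_regular_graph` from LightGraphs) and the dimensions are illustrative assumptions, not part of the commit:

```julia
using GraphNeuralNetworks, Flux, LightGraphs

n, din, dout = 10, 3, 4
g = GNNGraph(random_regular_graph(n, 4))
x = randn(Float32, din, n)

l = ResGatedGraphConv(din => dout, relu)
y = l(g, x)                     # gated message passing with a residual-style update
@assert size(y) == (dout, n)

# gradients flow through the four projections A, B, U, V and the bias
grads = gradient(() -> sum(l(g, x)), Flux.params(l))
```

Note that `A`, `B`, `U`, and `V` are four separate `out × in` matrices, so the layer has roughly four times the parameters of a plain graph convolution with the same dimensions.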

test/examples/node_classification_cora.jl

Lines changed: 3 additions & 2 deletions
@@ -70,8 +70,8 @@ function train(Layer; verbose=false, kws...)
             ŷ = model(g, X)
             logitcrossentropy(ŷ[:,train_ids], ytrain)
         end
-        verbose && report(epoch)
         Flux.Optimise.update!(opt, ps, gs)
+        verbose && report(epoch)
     end
 
     train_res = eval_loss_accuracy(X, y, train_ids, model, g)
@@ -87,11 +87,12 @@ for Layer in [
         (nin, nout) -> GATConv(nin => nout÷2, relu, heads=2),
         (nin, nout) -> GINConv(Dense(nin, nout, relu)),
         (nin, nout) -> ChebConv(nin => nout, 3),
+        (nin, nout) -> ResGatedGraphConv(nin => nout, relu),
         # (nin, nout) -> NNConv(nin => nout), # needs edge features
         # (nin, nout) -> GatedGraphConv(nout, 2), # needs nin = nout
         # (nin, nout) -> EdgeConv(Dense(2nin, nout, relu)), # Fits the training set but does not generalize well
     ]
-    train_res, test_res = train(Layer, verbose=true)
+    train_res, test_res = train(Layer, verbose=false)
     # @show Layer(2,2) train_res, test_res
     @test train_res.acc > 95
     @test test_res.acc > 70

test/layers/basic.jl

Lines changed: 20 additions & 17 deletions
@@ -2,32 +2,35 @@
 @testset "GNNChain" begin
     n, din, d, dout = 10, 3, 4, 2
 
-    g = GNNGraph(random_regular_graph(n, 4), graph_type=GRAPH_T)
+    g = GNNGraph(random_regular_graph(n, 4),
+                 graph_type=GRAPH_T,
+                 ndata=randn(Float32, din, n))
 
     gnn = GNNChain(GCNConv(din => d),
                    BatchNorm(d),
-                   x -> relu.(x),
-                   GraphConv(d => d, relu),
+                   x -> tanh.(x),
+                   GraphConv(d => d, tanh),
                    Dropout(0.5),
                    Dense(d, dout))
+
+    testmode!(gnn)
 
-    X = randn(Float32, din, n)
+    test_layer(gnn, g, rtol=1e-5) # exclude BN buffers
 
-    y = gnn(g, X)
-
-    @test y isa Matrix{Float32}
-    @test size(y) == (dout, n)
 
-    @test length(params(gnn)) == 9
-
-    gs = gradient(x -> sum(gnn(g, x)), X)[1]
-    @test gs isa Matrix{Float32}
-    @test size(gs) == size(X)
+    @testset "Parallel" begin
+        AddResidual(l) = Parallel(+, identity, l)
+
+        gnn = GNNChain(AddResidual(ResGatedGraphConv(din => d, tanh)),
+                       BatchNorm(d),
+                       AddResidual(ResGatedGraphConv(d => d, tanh)),
+                       BatchNorm(d),
+                       Dense(d, dout))
 
-    gs = gradient(() -> sum(gnn(g, X)), Flux.params(gnn))
-    for p in Flux.params(gnn)
-        @test eltype(gs[p]) == Float32
-        @test size(gs[p]) == size(p)
+        testmode!(gnn)
+
+        test_layer(gnn, g, rtol=1e-5, verbose=true,
+                   exclude_grad_fields=[:μ, :σ², :ϵ]) # exclude BN buffers
     end
 end
 end
