Commit b4b9020

add NNConv tests
1 parent 761113a commit b4b9020

6 files changed: 113 additions & 65 deletions

Project.toml

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ authors = ["Carlo Lucibello and contributors"]
 version = "0.1.0"

 [deps]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"

src/GraphNeuralNetworks.jl

Lines changed: 4 additions & 3 deletions
@@ -38,13 +38,14 @@ export
   GNNChain,

   # layers/conv
-  GCNConv,
   ChebConv,
-  GraphConv,
+  EdgeConv,
   GATConv,
   GatedGraphConv,
-  EdgeConv,
+  GCNConv,
   GINConv,
+  GraphConv,
+  NNConv,

   # layers/pool
   GlobalPool,

src/layers/basic.jl

Lines changed: 2 additions & 1 deletion
@@ -7,7 +7,8 @@ See also [`GNNChain`](@ref).
 """
 abstract type GNNLayer end

-#TODO extend to store also edge and global features
+# Forward pass with graph-only input.
+# To be specialized by layers also needing edge features as input (e.g. NNConv).
 (l::GNNLayer)(g::GNNGraph) = GNNGraph(g, ndata=l(g, node_features(g)))

 """

src/layers/conv.jl

Lines changed: 20 additions & 11 deletions
@@ -196,7 +196,7 @@ end


 @doc raw"""
-    GATConv(in => out, , σ=identity;
+    GATConv(in => out, σ=identity;
             heads=1,
             concat=true,
             init=glorot_uniform
@@ -224,7 +224,7 @@ with ``z_i`` a normalization factor.
 - `concat`: Concatenate layer output or not. If not, layer output is averaged over the heads.
 - `negative_slope::Real`: Keyword argument, the parameter of LeakyReLU.
 """
-struct GATConv{T, A<:AbstractMatrix{T}, B} <: GNNLayer
+struct GATConv{T, A<:AbstractMatrix, B} <: GNNLayer
     weight::A
     bias::B
     a::A
@@ -239,12 +239,13 @@ end
 Flux.trainable(l::GATConv) = (l.weight, l.bias, l.a)

 function GATConv(ch::Pair{Int,Int}, σ=identity;
-                 heads::Int=1, concat::Bool=true, negative_slope=0.2f0,
+                 heads::Int=1, concat::Bool=true, negative_slope=0.2,
                  init=glorot_uniform, bias::Bool=true)
     in, out = ch
     W = init(out*heads, in)
     b = Flux.create_bias(W, bias, out*heads)
     a = init(2*out, heads)
+    negative_slope = convert(eltype(W), negative_slope)
     GATConv(W, b, a, σ, negative_slope, ch, heads, concat)
 end

@@ -437,7 +438,7 @@ end


 @doc raw"""
-    NNConv(in => out, σ=identity; aggr=+, bias=true, init=glorot_uniform)
+    NNConv(in => out, f, σ=identity; aggr=+, bias=true, init=glorot_uniform)

 The continuous kernel-based convolutional operator from the
 [Neural Message Passing for Quantum Chemistry](https://arxiv.org/abs/1704.01212) paper.
@@ -447,7 +448,7 @@ This convolution is also known as the edge-conditioned convolution from the
 Performs the operation

 ```math
-\mathbf{x}_i' = W x_i + \square_{j \in N(i)} f_\Theta(\mathbf{e}_{j\to i})\,\mathbf{x}_j
+\mathbf{x}_i' = W \mathbf{x}_i + \square_{j \in N(i)} f_\Theta(\mathbf{e}_{j\to i})\,\mathbf{x}_j
 ```

 where ``f_\Theta`` denotes a learnable function (e.g. a linear layer or a multi-layer perceptron).
@@ -459,6 +460,7 @@ For convenience, also functions returning a single `(out*in, num_edges)` matrix

 - `in`: The dimension of input features.
 - `out`: The dimension of output features.
+- ``f``: A (possibly learnable) function acting on edge features.
 - `aggr`: Aggregation operator for the incoming messages (e.g. `+`, `*`, `max`, `min`, and `mean`).
 - `σ`: Activation function.
 - `bias`: Add learnable bias.
@@ -468,32 +470,39 @@ struct NNConv <: GNNLayer
     weight
     bias
     nn
+    σ
     aggr
 end

 @functor NNConv

-function NNConv(ch::Pair{Int,Int}, σ=identity; aggr=+, bias=true, init=glorot_uniform)
+function NNConv(ch::Pair{Int,Int}, nn, σ=identity; aggr=+, bias=true, init=glorot_uniform)
     in, out = ch
     W = init(out, in)
     b = Flux.create_bias(W, bias, out)
-    return NNConv(W, b, nn, aggr)
+    return NNConv(W, b, nn, σ, aggr)
 end

 function compute_message(l::NNConv, x_i, x_j, e_ij)
     nin, nedges = size(x_i)
     W = reshape(l.nn(e_ij), (:, nin, nedges))
-    return NNlib.batched_mul(W, x_j)
+    x_j = reshape(x_j, (nin, 1, nedges)) # needed by batched_mul
+    m = NNlib.batched_mul(W, x_j)
+    return reshape(m, :, nedges)
 end

-update_node(l::NNConv, m, x) = l.weight*x + m
+function update_node(l::NNConv, m, x)
+    l.σ.(l.weight*x .+ m .+ l.bias)
+end

 function (l::NNConv)(g::GNNGraph, x::AbstractMatrix, e)
-    check_num_nodes(g, X)
+    check_num_nodes(g, x)
     x, _ = propagate(l, g, l.aggr, x, e)
-    return l.σ.(x + l.bias)
+    return x
 end

+(l::NNConv)(g::GNNGraph) = GNNGraph(g, ndata=l(g, node_features(g), edge_features(g)))
+
 function Base.show(io::IO, l::NNConv)
     out, in = size(l.weight)
     print(io, "NNConv( $in => $out")
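To make the new three-argument constructor concrete, here is a minimal usage sketch in Julia. The graph construction, dimensions, and variable names below are illustrative assumptions, not part of this commit; only the `NNConv(in => out, f, σ; aggr)` signature and the `l(g, x, e)` / `l(g)` calls come from the code above.

    using GraphNeuralNetworks, Flux

    nin, nout, edim = 3, 5, 10

    # Toy graph: 4 nodes and 4 directed edges in COO form (assumed constructor),
    # with nin-dimensional node features and edim-dimensional edge features.
    s, t = [1, 2, 3, 4], [2, 3, 4, 1]
    g = GNNGraph(s, t, ndata=rand(Float32, nin, 4), edata=rand(Float32, edim, 4))

    # Edge network: maps each edge feature vector to out*in values, which
    # compute_message reshapes into an nout × nin weight matrix per edge.
    nn = Dense(edim, nout * nin)
    l = NNConv(nin => nout, nn, relu; aggr=+)

    y  = l(g, node_features(g), edge_features(g))   # nout × num_nodes matrix
    g′ = l(g)                                       # same result stored as ndata of a new GNNGraph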

test/layers/conv.jl

Lines changed: 18 additions & 32 deletions
@@ -44,7 +44,7 @@
         @test size(l.bias) == (out_channel,)
         @test l.k == k
         for g in test_graphs
-            gradtest(l, g, rtol=1e-4, broken_grad_fields=[:weight])
+            gradtest(l, g, rtol=1e-5, broken_grad_fields=[:weight])
         end

         @testset "bias=false" begin
@@ -72,37 +72,13 @@

     @testset "GATConv" begin

-        heads = 1
-        concat = true
-        l = GATConv(in_channel => out_channel; heads, concat)
-        for g in test_graphs
-            gradtest(l, g, rtol=1e-4)
-        end
-
-        heads = 2
-        concat = true
-        l = GATConv(in_channel => out_channel; heads, concat)
-        for g in test_graphs
-            gradtest(l, g, rtol=1e-4,
-                     broken_grad_fields = [:a])
-        end
-
-        heads = 1
-        concat = false
-        l = GATConv(in_channel => out_channel; heads, concat)
-        for g in test_graphs
-            gradtest(l, g, rtol=1e-4,
-                     broken_grad_fields = [:a])
+        for heads in (1, 2), concat in (true, false)
+            l = GATConv(in_channel => out_channel; heads, concat)
+            for g in test_graphs
+                gradtest(l, g, rtol=1e-4)
+            end
         end

-        heads = 2
-        concat = false
-        l = GATConv(in_channel => out_channel; heads, concat)
-        gradtest(l, test_graphs[1], atol=1e-4, rtol=1e-4,
-                 broken_grad_fields = [:a])
-        gradtest(l, test_graphs[2], atol=1e-4, rtol=1e-4)
-
-
         @testset "bias=false" begin
             @test length(Flux.params(GATConv(2=>3))) == 3
             @test length(Flux.params(GATConv(2=>3, bias=false))) == 2
@@ -115,7 +91,7 @@
         @test size(l.weight) == (out_channel, out_channel, num_layers)

         for g in test_graphs
-            gradtest(l, g, atol=1e-5, rtol=1e-5)
+            gradtest(l, g, rtol=1e-5)
         end
     end

@@ -131,9 +107,19 @@
         eps = 0.001f0
         l = GINConv(nn, eps=eps)
         for g in test_graphs
-            gradtest(l, g, atol=1e-5, rtol=1e-5)
+            gradtest(l, g, rtol=1e-5, exclude_grad_fields=[:eps])
         end

         @test !in(:eps, Flux.trainable(l))
     end
+
+    @testset "NNConv" begin
+        edim = 10
+        nn = Dense(edim, out_channel * in_channel)
+        l = NNConv(in_channel => out_channel, nn)
+        for g in test_graphs
+            g = GNNGraph(g, edata=rand(T, edim, g.num_edges))
+            gradtest(l, g, rtol=1e-5)
+        end
+    end
 end

test/test_utils.jl

Lines changed: 68 additions & 18 deletions
@@ -1,4 +1,4 @@
-using ChainRulesTestUtils, FiniteDifferences, Zygote
+using ChainRulesTestUtils, FiniteDifferences, Zygote, Adapt

 const rule_config = Zygote.ZygoteRuleConfig()

@@ -11,64 +11,114 @@ end

 function gradtest(l, g::GNNGraph; atol=1e-7, rtol=1e-5,
                   exclude_grad_fields=[],
-                  broken_grad_fields=[]
+                  broken_grad_fields=[],
+                  verbose = false
                   )
     # TODO these give errors, probably some bugs in ChainRulesTestUtils
     # test_rrule(rule_config, x -> l(g, x), x; rrule_f=rrule_via_ad, check_inferred=false)
     # test_rrule(rule_config, l -> l(g, x), l; rrule_f=rrule_via_ad, check_inferred=false)

-    !haskey(g.ndata, :x) && error("Please pass input graph with :x ndata")
+    isnothing(node_features(g)) && error("Please add node data to the input graph")
     fdm = central_fdm(5, 1)

-    x = g.ndata.x
+    x = node_features(g)
+    e = edge_features(g)
+
+    f(l, g) = l(g)
+    f(l, g, x) = isnothing(e) ? l(g, x) : l(g, x, e)
+
+    loss(l, g) = sum(node_features(f(l, g)))
+    loss(l, g, x) = sum(f(l, g, x))
+    loss(l, g, x, e) = sum(l(g, x, e))

+    x64, e64, l64, g64 = to64.([x, e, l, g])
     # TEST OUTPUT
-    y = l(g, x)
+    y = f(l, g, x)
     @test eltype(y) == eltype(x)

-    g′ = l(g)
+    g′ = f(l, g)
     @test g′.ndata.x ≈ y

-    # TEST INPUT GRADIENT
-    x̄ = gradient(x -> sum(l(g, x)), x)[1]
-    x̄_fd = FiniteDifferences.grad(fdm, x -> sum(l(g, x)), x)[1]
+    # TEST X INPUT GRADIENT
+    x̄ = gradient(x -> loss(l, g, x), x)[1]
+    x̄_fd = FiniteDifferences.grad(fdm, x64 -> loss(l64, g64, x64), x64)[1]
    @test x̄ ≈ x̄_fd    atol=atol rtol=rtol

+    if e !== nothing
+        # TEST E INPUT GRADIENT
+        ē = gradient(e -> loss(l, g, x, e), e)[1]
+        ē_fd = FiniteDifferences.grad(fdm, e64 -> loss(l64, g64, x64, e64), e64)[1]
+        @test ē ≈ ē_fd    atol=atol rtol=rtol
+    end
+
     # TEST LAYER GRADIENT - l(g, x)
-    l̄ = gradient(l -> sum(l(g, x)), l)[1]
-    l̄_fd = FiniteDifferences.grad(fdm, l -> sum(l(g, x)), l)[1]
-    test_approx_structs(l, l̄, l̄_fd; atol, rtol, broken_grad_fields, exclude_grad_fields)
+    l̄ = gradient(l -> loss(l, g, x), l)[1]
+    l̄_fd = FiniteDifferences.grad(fdm, l64 -> loss(l64, g64, x64), l64)[1]
+    test_approx_structs(l, l̄, l̄_fd; atol, rtol, broken_grad_fields, exclude_grad_fields, verbose)
     # TEST LAYER GRADIENT - l(g)
-    l̄ = gradient(l -> sum(l(g).ndata.x), l)[1]
-    l̄_fd = FiniteDifferences.grad(fdm, l -> sum(l(g).ndata.x), l)[1]
-    test_approx_structs(l, l̄, l̄_fd; atol, rtol, broken_grad_fields, exclude_grad_fields)
+    l̄ = gradient(l -> loss(l, g), l)[1]
+    l̄_fd = FiniteDifferences.grad(fdm, l64 -> loss(l64, g64), l64)[1]
+    test_approx_structs(l, l̄, l̄_fd; atol, rtol, broken_grad_fields, exclude_grad_fields, verbose)
 end

 function test_approx_structs(l, l̄, l̄_fd; atol=1e-5, rtol=1e-5,
                              broken_grad_fields=[],
-                             exclude_grad_fields=[])
+                             exclude_grad_fields=[],
+                             verbose=false)
+
     for f in fieldnames(typeof(l))
         f ∈ exclude_grad_fields && continue
         f̄, f̄_fd = getfield(l̄, f), getfield(l̄_fd, f)
+        if verbose
+            println()
+            @show f getfield(l, f) f̄ f̄_fd
+        end
         if isnothing(f̄)
-            # @show f f̄_fd
+            verbose && println("A")
             @test !(f̄_fd isa AbstractArray) || isapprox(f̄_fd, fill!(similar(f̄_fd), 0); atol=atol, rtol=rtol)
         elseif f̄ isa Union{AbstractArray, Number}
+            verbose && println("B")
             @test eltype(f̄) == eltype(getfield(l, f))
             if f ∈ broken_grad_fields
                @test_broken f̄ ≈ f̄_fd    atol=atol rtol=rtol
             else
-                # @show f getfield(l, f) f̄ f̄_fd broken_grad_fields
                @test f̄ ≈ f̄_fd    atol=atol rtol=rtol
             end
         else
+            verbose && println("C")
             test_approx_structs(getfield(l, f), f̄, f̄_fd; broken_grad_fields)
         end
     end
     return true
 end


+"""
+    to32(m)
+
+Convert the `eltype` of model's parameters to `Float32` or `Int32`.
+"""
+function to32(m)
+    f(x::AbstractArray) = eltype(x) <: Integer ? adapt(Int32, x) : adapt(Float32, x)
+    f(x::Number) = typeof(x) <: Integer ? adapt(Int32, x) : adapt(Float32, x)
+    f(x) = adapt(Float32, x)
+    return fmap(f, m)
+end
+
+"""
+    to64(m)
+
+Convert the `eltype` of model's parameters to `Float64` or `Int64`.
+"""
+function to64(m)
+    f(x::AbstractArray) = eltype(x) <: Integer ? adapt(Int64, x) : adapt(Float64, x)
+    f(x::Number) = typeof(x) <: Integer ? adapt(Int64, x) : adapt(Float64, x)
+    f(x) = adapt(Float64, x)
+    return fmap(f, m)
+end
+
+
 # function gpu_gradtest(l, x_cpu = nothing, args...; test_cpu = true)
 #     isnothing(x_cpu) && error("Missing input to test the layers against.")
 #     @testset "$name GPU grad tests" begin
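As a usage note, the finite-difference references above are taken on Float64 copies (`to64.([x, e, l, g])`) so that the comparison against Zygote's Float32 gradients is not dominated by truncation error. A small sketch of what the helper does (assuming Flux's default Float32 parameter initialization; `fmap` is exported by Flux):

    using Flux

    m   = Dense(2, 3)       # Float32 parameters by default
    m64 = to64(m)           # helper defined above
    all(p -> eltype(p) == Float64, Flux.params(m64))   # true

Integer-valued leaves (e.g. graph indices) are widened to Int64 rather than Float64, so graph structure stays valid after conversion.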
