
Commit 26a979e

Add a transformer-like convolutional layer (#249)
* TransformerConv developed
* Typo fixed
* Typo fixed
* Julia 1.7 line-breaks in strings avoided
* Update src/layers/conv.jl (Sure, thanks! Co-authored-by: Carlo Lucibello <[email protected]>)
* Update src/layers/conv.jl (Co-authored-by: Carlo Lucibello <[email protected]>)
* Polishing of documentation, beta renamed to gated
* Fix beta -> gating in tests

Co-authored-by: Carlo Lucibello <[email protected]>
1 parent 2dc0aee commit 26a979e

File tree

6 files changed: +251 -7 lines changed

- docs/src/api/conv.md
- src/GraphNeuralNetworks.jl
- src/layers/conv.jl
- test/examples/node_classification_cora.jl
- test/layers/conv.jl
- test/test_utils.jl

docs/src/api/conv.md

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ The table below lists all graph convolutional layers implemented in the *GraphNe
 | [`ResGatedGraphConv`](@ref) | | | |
 | [`SAGEConv`](@ref) || | |
 | [`SGConv`](@ref) || | |
+| [`TransformerConv`](@ref) | | ||
 
 
 ## Docs

src/GraphNeuralNetworks.jl

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ export
   ResGatedGraphConv,
   SAGEConv,
   SGConv,
+  TransformerConv,
 
   # layers/pool
   GlobalPool,

src/layers/conv.jl

Lines changed: 206 additions & 0 deletions
@@ -1452,3 +1452,209 @@ function Base.show(io::IO, l::EGNNConv)
     end
     print(io, ")")
 end
+
+
+@doc raw"""
+    TransformerConv((in, ein) => out; [heads, concat, init, add_self_loops, bias_qkv,
+        bias_root, root_weight, gating, skip_connection, batch_norm, ff_channels])
+
+The transformer-like multi-head attention convolutional operator from the
+[Masked Label Prediction: Unified Message Passing Model for Semi-Supervised
+Classification](https://arxiv.org/abs/2009.03509) paper, which also considers
+edge features.
+It can further be configured as the transformer-like convolutional operator from the
+[Attention, Learn to Solve Routing Problems!](https://arxiv.org/abs/1803.08475) paper,
+including a successive feed-forward network as well as skip layers and batch normalization.
+
+The layer's basic forward pass is given by
+```math
+x_i' = W_1 x_i + \sum_{j \in \mathcal{N}(i)} \alpha_{ij} (W_2 x_j + W_6 e_{ij})
+```
+where the attention scores are
+```math
+\alpha_{ij} = \mathrm{softmax}\left(\frac{(W_3 x_i)^T (W_4 x_j + W_6 e_{ij})}{\sqrt{d}}\right).
+```
+
+Optionally, the aggregated value can be combined with the transformed root node
+features through a gating mechanism:
+```math
+x'_i = \beta_i W_1 x_i + (1 - \beta_i) \underbrace{\left(\sum_{j \in \mathcal{N}(i)}
+\alpha_{ij} W_2 x_j \right)}_{=m_i}
+```
+with
+```math
+\beta_i = \textrm{sigmoid}(W_5^{\top} [ W_1 x_i, m_i, W_1 x_i - m_i ]).
+```
+
+# Arguments
+
+- `in`: Dimension of input features, which also corresponds to the dimension of
+    the output features.
+- `ein`: Dimension of the edge features; if 0, no edge features will be used.
+- `out`: Dimension of the output.
+- `heads`: Number of heads in output. Default `1`.
+- `concat`: Concatenate layer output or not. If not, layer output is averaged
+    over the heads. Default `true`.
+- `init`: Weight matrices' initializing function. Default `glorot_uniform`.
+- `add_self_loops`: Add self loops to the input graph. Default `false`.
+- `bias_qkv`: If set, bias is used in the key, query and value transformations for nodes.
+    Default `true`.
+- `bias_root`: If set, the layer will also learn an additive bias for the root when root
+    weight is used. Default `true`.
+- `root_weight`: If set, the layer will add the transformed root node features
+    to the output. Default `true`.
+- `gating`: If set, the aggregated output and the transformed root node features are
+    combined by a gating mechanism. Default `false`.
+- `skip_connection`: If set, a skip connection from the input is added to the output.
+    Default `false`.
+- `batch_norm`: If set, batch normalization is applied to the output. Default `false`.
+- `ff_channels`: If positive, a feed-forward NN is appended, with its first layer having
+    the given number of hidden nodes; this NN also gets a skip connection and batch
+    normalization if the respective parameters are set. Default `0`.
+"""
+struct TransformerConv{TW1, TW2, TW3, TW4, TW5, TW6, TFF, TBN1, TBN2} <: GNNLayer
+    W1::TW1
+    W2::TW2
+    W3::TW3
+    W4::TW4
+    W5::TW5
+    W6::TW6
+    FF::TFF
+    BN1::TBN1
+    BN2::TBN2
+    channels::Pair{NTuple{2,Int},Int}
+    heads::Int
+    add_self_loops::Bool
+    concat::Bool
+    skip_connection::Bool
+    sqrt_out::Float32
+end
+
+@functor TransformerConv
+
+Flux.trainable(l::TransformerConv) = (l.W1, l.W2, l.W3, l.W4, l.W5, l.W6, l.FF, l.BN1, l.BN2)
+
+TransformerConv(ch::Pair{Int,Int}, args...; kws...) = TransformerConv((ch[1], 0) => ch[2], args...; kws...)
+
+function TransformerConv(ch::Pair{NTuple{2, Int}, Int};
+                         heads::Int = 1,
+                         concat::Bool = true,
+                         init = glorot_uniform,
+                         add_self_loops::Bool = false,
+                         bias_qkv = true,
+                         bias_root::Bool = true,
+                         root_weight::Bool = true,
+                         gating::Bool = false,
+                         skip_connection::Bool = false,
+                         batch_norm::Bool = false,
+                         ff_channels::Int = 0)
+
+    (in, ein), out = ch
+
+    if add_self_loops
+        @assert iszero(ein) "Using edge features and setting add_self_loops=true at the same time is not yet supported."
+    end
+
+    W1 = root_weight ? Dense(in, out * (concat ? heads : 1); bias=bias_root, init=init) : nothing
+    W2 = Dense(in => out*heads; bias=bias_qkv, init=init)
+    W3 = Dense(in => out*heads; bias=bias_qkv, init=init)
+    W4 = Dense(in => out*heads; bias=bias_qkv, init=init)
+    out_mha = out * (concat ? heads : 1)
+    W5 = gating ? Dense(3 * out_mha => 1, sigmoid; bias=false, init=init) : nothing
+    W6 = ein > 0 ? Dense(ein => out*heads; bias=bias_qkv, init=init) : nothing
+    FF = ff_channels > 0 ? Chain(
+            Dense(out_mha => ff_channels, relu),
+            Dense(ff_channels => out_mha)
+         ) : nothing
+    BN1 = batch_norm ? BatchNorm(out_mha) : nothing
+    BN2 = (batch_norm && ff_channels > 0) ? BatchNorm(out_mha) : nothing
+
+    return TransformerConv(W1, W2, W3, W4, W5, W6, FF, BN1, BN2,
+                           ch, heads, add_self_loops, concat, skip_connection, Float32(√out))
+end
+
+function (l::TransformerConv)(g::GNNGraph, x::AbstractMatrix,
+                              e::Union{AbstractMatrix, Nothing}=nothing)
+    check_num_nodes(g, x)
+
+    if l.add_self_loops
+        g = add_self_loops(g)
+    end
+
+    out = l.channels[2]
+    heads = l.heads
+    W1x = !isnothing(l.W1) ? l.W1(x) : nothing
+    W2x = reshape(l.W2(x), out, heads, :)
+    W3x = reshape(l.W3(x), out, heads, :)
+    W4x = reshape(l.W4(x), out, heads, :)
+    W6e = !isnothing(l.W6) ? reshape(l.W6(e), out, heads, :) : nothing
+
+    m = apply_edges(message_uij, g, l; xi=(; W3x), xj=(; W4x), e=(; W6e))
+    α = softmax_edge_neighbors(g, m)
+    α_val = propagate(message_main, g, +, l; xi=(; W3x), xj=(; W2x), e=(; W6e, α))
+
+    h = α_val
+    if l.concat
+        h = reshape(h, out * heads, :)  # concatenate heads
+    else
+        h = mean(h, dims=2)             # average heads
+        h = reshape(h, out, :)
+    end
+
+    if !isnothing(W1x)  # root_weight
+        if !isnothing(l.W5)  # gating
+            β = l.W5(vcat(h, W1x, h .- W1x))
+            h = β .* W1x + (1f0 .- β) .* h
+        else
+            h += W1x
+        end
+    end
+
+    if l.skip_connection
+        @assert size(h, 1) == size(x, 1) "In-channels must correspond to out-channels * heads if skip_connection is used"
+        h += x
+    end
+    if !isnothing(l.BN1)
+        h = l.BN1(h)
+    end
+
+    if !isnothing(l.FF)
+        h1 = h
+        h = l.FF(h)
+        if l.skip_connection
+            h += h1
+        end
+        if !isnothing(l.BN2)
+            h = l.BN2(h)
+        end
+    end
+
+    return h
+end
+
+(l::TransformerConv)(g::GNNGraph) = GNNGraph(g, ndata=l(g, node_features(g), edge_features(g)))
+
+function message_uij(l::TransformerConv, xi, xj, e)
+    key = xj.W4x
+    if !isnothing(e.W6e)
+        key += e.W6e
+    end
+    uij = sum(xi.W3x .* key, dims=1) ./ l.sqrt_out
+    return uij
+end
+
+function message_main(l::TransformerConv, xi, xj, e)
+    val = xj.W2x
+    if !isnothing(e.W6e)
+        val += e.W6e
+    end
+    return e.α .* val
+end
+
+function Base.show(io::IO, l::TransformerConv)
+    (in, ein), out = l.channels
+    print(io, "TransformerConv(($in, $ein) => $out, heads=$(l.heads))")
+end
+
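For orientation, here is a minimal usage sketch of the new layer (an editor-added illustration, not part of the commit): the toy graph, feature sizes, and keyword choices are assumptions, while the constructor signature and the `l(g, x, e)` forward call follow the code added above.

```julia
using GraphNeuralNetworks

# Toy undirected graph given as an adjacency matrix (4-node ring), chosen only for illustration.
adj = [0 1 0 1;
       1 0 1 0;
       0 1 0 1;
       1 0 1 0]
g = GNNGraph(adj)

nin, ein, nout, heads = 8, 2, 16, 4
x = rand(Float32, nin, g.num_nodes)   # node features, one column per node
e = rand(Float32, ein, g.num_edges)   # edge features, one column per (directed) edge

# Edge-feature variant with gating, as documented in the docstring above.
l = TransformerConv((nin, ein) => nout; heads, gating=true, bias_qkv=true)
y = l(g, x, e)
@assert size(y) == (nout * heads, g.num_nodes)  # concat=true (the default) stacks the heads
```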

test/examples/node_classification_cora.jl

Lines changed: 8 additions & 5 deletions
@@ -16,10 +16,10 @@ end
 
 # arguments for the `train` function
 Base.@kwdef mutable struct Args
-    η = 5f-3         # learning rate
+    η = 5f-3         # learning rate
     epochs = 10      # number of epochs
-    seed = 17        # set seed > 0 for reproducibility
-    usecuda = false  # if true use cuda (if available)
+    seed = 17        # set seed > 0 for reproducibility
+    usecuda = false  # if true use cuda (if available)
     nhidden = 64     # dimension of hidden features
 end
 
@@ -58,8 +58,8 @@ function train(Layer; verbose=false, kws...)
 
     ## TRAINING
     function report(epoch)
-        train = eval_loss_accuracy(X, y, train_ids, model, g)
-        test = eval_loss_accuracy(X, y, test_ids, model, g)
+        train = eval_loss_accuracy(X, y, train_mask, model, g)
+        test = eval_loss_accuracy(X, y, test_mask, model, g)
         println("Epoch: $epoch  Train: $(train)  Test: $(test)")
     end
 
@@ -86,6 +86,8 @@ function train_many(; usecuda=false)
         ("SAGEConv", (nin, nout) -> SAGEConv(nin => nout, relu)),
         ("GATConv", (nin, nout) -> GATConv(nin => nout, relu)),
        ("GINConv", (nin, nout) -> GINConv(Dense(nin, nout, relu), 0.01, aggr=mean)),
+        ("TransformerConv", (nin, nout) -> TransformerConv(nin => nout, concat=false,
+                                add_self_loops=true, root_weight=false, heads=2))
         ## ("ChebConv", (nin, nout) -> ChebConv(nin => nout, 2)), # not working on gpu
         ## ("NNConv", (nin, nout) -> NNConv(nin => nout)), # needs edge features
         ## ("GatedGraphConv", (nin, nout) -> GatedGraphConv(nout, 2)), # needs nin = nout
@@ -94,6 +96,7 @@ function train_many(; usecuda=false)
 
        @show layer
        @time train_res, test_res = train(Layer; usecuda, verbose=false)
+       # @show train_res, test_res
        @test train_res.acc > 94
        @test test_res.acc > 70
    end
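A brief aside on the configuration registered above: with `concat=false` the heads are averaged rather than concatenated, so the layer still maps `nin => nout` features even with `heads=2`. A small dimension-check sketch (editor-added, not from the commit; the toy graph and sizes are assumptions):

```julia
using GraphNeuralNetworks

nin, nout, heads = 8, 7, 2
adj = [0 1 1;
       1 0 1;
       1 1 0]                  # toy 3-node graph, assumed for illustration
g = GNNGraph(adj)
x = rand(Float32, nin, g.num_nodes)

l = TransformerConv(nin => nout; heads, concat=false, add_self_loops=true, root_weight=false)
y = l(g, x)
@assert size(y) == (nout, g.num_nodes)   # heads are averaged, not concatenated
```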

test/layers/conv.jl

Lines changed: 34 additions & 1 deletion
@@ -266,7 +266,7 @@
     @testset "GMMConv" begin
         ein_channel = 10
         K = 5
-        l = GMMConv((in_channel, ein_channel )=> out_channel, K=K)
+        l = GMMConv((in_channel, ein_channel ) => out_channel, K=K)
         for g in test_graphs
             g = GNNGraph(g, edata=rand(Float32, ein_channel, g.num_edges))
             test_layer(l, g, rtol=RTOL_HIGH, outsize = (out_channel, g.num_nodes))
@@ -300,4 +300,37 @@
         @test size(hnew) == (hout, g.num_nodes)
         @test size(xnew) == (in_channel, g.num_nodes)
     end
+
+    @testset "TransformerConv" begin
+        ein = 2
+        heads = 3
+        # used like in Kool et al., 2019
+        l = TransformerConv(in_channel * heads => in_channel; heads, add_self_loops=true,
+            root_weight=false, ff_channels=10, skip_connection=true, batch_norm=false)
+        # batch_norm=false here for tests to pass; true in paper
+        for adj in [adj1, adj_single_vertex]
+            g = GNNGraph(adj, ndata=rand(T, in_channel * heads, size(adj, 1)), graph_type=GRAPH_T)
+            test_layer(l, g, rtol=RTOL_LOW,
+                exclude_grad_fields = [:negative_slope],
+                outsize=(in_channel * heads, g.num_nodes))
+        end
+        # used like in Shi et al., 2021
+        l = TransformerConv((in_channel, ein) => in_channel; heads, gating=true, bias_qkv=true)
+        for g in test_graphs
+            g = GNNGraph(g, edata=rand(T, ein, g.num_edges))
+            test_layer(l, g, rtol=RTOL_LOW,
+                exclude_grad_fields = [:negative_slope],
+                outsize=(in_channel * heads, g.num_nodes))
+        end
+        # test averaging heads
+        l = TransformerConv(in_channel => in_channel; heads, concat=false, bias_root=false,
+            root_weight=false)
+        for g in test_graphs
+            test_layer(l, g, rtol=RTOL_LOW,
+                exclude_grad_fields = [:negative_slope],
+                outsize=(in_channel, g.num_nodes))
+        end
+    end
 end
+
+
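The first testset mirrors the Kool et al., 2019 setup: with `skip_connection=true` the forward pass adds the input to the output, so the input dimension must equal `out * heads`, which is why the layer is built as `in_channel * heads => in_channel`. A standalone sketch of that constraint (editor-added; the concrete sizes and toy graph are assumptions):

```julia
using GraphNeuralNetworks

in_channel, heads = 3, 3
adj = [0 1 1; 1 0 1; 1 1 0]          # assumed toy graph
g = GNNGraph(adj)
x = rand(Float32, in_channel * heads, g.num_nodes)

# in == out * heads, so the skip connection h += x is well defined.
l = TransformerConv(in_channel * heads => in_channel; heads, add_self_loops=true,
                    root_weight=false, ff_channels=10, skip_connection=true)
y = l(g, x)
@assert size(y) == (in_channel * heads, g.num_nodes)
```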

test/test_utils.jl

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ end
 # Tests also gradient on cpu and gpu comparing with
 # finite difference methods.
 # Test gradients with respects to layer weights and to input.
-# If `g` has edge features, it is assumed that the layer can be
+# If `g` has edge features, it is assumed that the layer can
 # use them in the forward pass as `l(g, x, e)`.
 # Test also gradient with repspect to `e`.
 function test_layer(l, g::GNNGraph; atol = 1e-6, rtol = 1e-5,
