
Commit 15d230b

add subgraph
1 parent 11d7039 commit 15d230b

File tree

7 files changed: +169 -17 lines changed

Project.toml

Lines changed: 0 additions & 1 deletion
@@ -11,7 +11,6 @@ Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
 LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"

examples/Project.toml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694"
+LightGraphs = "093fc24a-ae57-5d10-9952-331d41423f4d"
+MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+# An example of graph classification
+
+using Flux
+using Flux: @functor, dropout, onecold, onehotbatch
+using Flux.Losses: logitbinarycrossentropy
+using GraphNeuralNetworks
+using MLDatasets: TUDataset
+using Statistics, Random
+using CUDA
+CUDA.allowscalar(false)
+
+function eval_loss_accuracy(model, g, X, y)
+    ŷ = model(g, X) |> vec
+    l = logitbinarycrossentropy(ŷ, y)
+    # accuracy: fraction where (2ŷ - 1) and (2y - 1) agree in sign
+    acc = mean((2 .* ŷ .- 1) .* (2 .* y .- 1) .> 0)
+    return (loss = round(l, digits=4), acc = round(acc*100, digits=2))
+end
+
+struct GNNData
+    g
+    X
+    y
+end
+
+# allow `g, X, y = getdataset(idxs)` destructuring below
+Base.iterate(d::GNNData, i=1) = i > 3 ? nothing : (getfield(d, i), i + 1)
+
+function getdataset(idxs)
+    data = TUDataset("MUTAG")[idxs]
+    @info "MUTAG: num_nodes: $(data.num_nodes) num_edges: $(data.num_edges) num_graphs: $(data.num_graphs)"
+    g = GNNGraph(data.source, data.target, num_nodes=data.num_nodes, graph_indicator=data.graph_indicator)
+    X = Array{Float32}(onehotbatch(data.node_labels, 0:6))
+    # E = Array{Float32}(onehotbatch(data.edge_labels, sort(unique(data.edge_labels))))
+    y = (1 .+ Array{Float32}(data.graph_labels)) ./ 2  # map graph labels from {-1,1} to {0,1}
+    @assert all(∈([0,1]), y) # binary classification
+    return GNNData(g, X, y)
+end
+
+# arguments for the `train` function
+Base.@kwdef mutable struct Args
+    η = 1f-3             # learning rate
+    epochs = 1000        # number of epochs
+    seed = 17            # set seed > 0 for reproducibility
+    use_cuda = false     # if true use cuda (if available)
+    nhidden = 128        # dimension of hidden features
+    infotime = 10        # report every `infotime` epochs
+end
+
+function train(; kws...)
+    args = Args(; kws...)
+    args.seed > 0 && Random.seed!(args.seed)
+
+    if args.use_cuda && CUDA.functional()
+        device = gpu
+        args.seed > 0 && CUDA.seed!(args.seed)
+        @info "Training on GPU"
+    else
+        device = cpu
+        @info "Training on CPU"
+    end
+
+    # LOAD DATA
+    permindx = randperm(188)  # MUTAG contains 188 graphs
+    ntrain = 150
+    gtrain, Xtrain, ytrain = getdataset(permindx[1:ntrain])
+    gtest, Xtest, ytest = getdataset(permindx[ntrain+1:end])
+
+    # DEFINE MODEL
+    nin = size(Xtrain, 1)
+    nhidden = args.nhidden
+
+    model = GNNChain(GCNConv(nin => nhidden, relu),
+                     Dropout(0.5),
+                     GCNConv(nhidden => nhidden, relu),
+                     GlobalPool(mean),
+                     Dense(nhidden, 1)) |> device
+
+    ps = Flux.params(model)
+    opt = ADAM(args.η)
+
+    # LOGGING FUNCTION
+    function report(epoch)
+        train = eval_loss_accuracy(model, gtrain, Xtrain, ytrain)
+        test = eval_loss_accuracy(model, gtest, Xtest, ytest)
+        println("Epoch: $epoch   Train: $(train)   Test: $(test)")
+    end
+
+    # TRAIN
+    report(0)
+    for epoch in 1:args.epochs
+        # for (g, X, y) in train_loader
+        gs = Flux.gradient(ps) do
+            ŷ = model(gtrain, Xtrain) |> vec
+            logitbinarycrossentropy(ŷ, ytrain)
+        end
+        Flux.Optimise.update!(opt, ps, gs)
+        # end
+
+        epoch % args.infotime == 0 && report(epoch)
+    end
+end
+
+# train()
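Presumably the script is run along these lines (a sketch, not part of the commit; the file name is omitted on this page, so the path below is a made-up stand-in):

```julia
# with the new examples environment active, e.g. `julia --project=examples`
include("examples/graph_classification.jl")   # hypothetical path

train()                               # Args defaults: 1000 epochs on CPU
train(use_cuda = true, epochs = 300)  # keywords are forwarded to Args
```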
File renamed without changes.

src/GraphNeuralNetworks.jl

Lines changed: 2 additions & 1 deletion
@@ -24,7 +24,8 @@ export
   edge_index,
   node_feature, edge_feature, global_feature,
   adjacency_list, normalized_laplacian, scaled_laplacian,
-  add_self_loops,
+  add_self_loops, remove_self_loops,
+  subgraph,

   # from LightGraphs
   adjacency_matrix,

src/gnngraph.jl

Lines changed: 54 additions & 14 deletions
@@ -11,7 +11,7 @@ const ADJMAT_T = AbstractMatrix
 const SPARSE_T = AbstractSparseMatrix # subset of ADJMAT_T

 """
-    GNNGraph(data; [graph_type, nf, ef, gf, num_nodes, num_graphs, graph_indicator, dir])
+    GNNGraph(data; [graph_type, nf, ef, gf, num_nodes, graph_indicator, dir])
     GNNGraph(g::GNNGraph; [nf, ef, gf])

 A type representing a graph structure and storing also arrays
@@ -50,7 +50,6 @@ from the LightGraphs' graph library can be used on it.
 - `dir`. The assumed edge direction when given adjacency matrix or adjacency list input data `g`.
   Possible values are `:out` and `:in`. Default `:out`.
 - `num_nodes`. The number of nodes. If not specified, inferred from `g`. Default `nothing`.
-- `num_graphs`. The number of graphs. Larger than 1 in case of batched graphs. Default `1`.
 - `graph_indicator`. For batched graphs, a vector containing the graph assignment of each node. Default `nothing`.
 - `nf`: Node features. Either nothing, or an array whose last dimension has size num_nodes. Default `nothing`.
 - `ef`: Edge features. Either nothing, or an array whose last dimension has size num_edges. Default `nothing`.
@@ -123,17 +122,17 @@ function GNNGraph(data;

     @assert graph_type ∈ [:coo, :dense, :sparse] "Invalid graph_type $graph_type requested"
     @assert dir ∈ [:in, :out]
+
     if graph_type == :coo
         g, num_nodes, num_edges = to_coo(data; num_nodes, dir)
     elseif graph_type == :dense
         g, num_nodes, num_edges = to_dense(data; dir)
     elseif graph_type == :sparse
         g, num_nodes, num_edges = to_sparse(data; dir)
     end
-    if num_graphs > 1
-        @assert len(graph_indicator) = num_nodes "When batching multiple graphs `graph_indicator` should be filled with the nodes' memberships."
-    end
-
+
+    num_graphs = !isnothing(graph_indicator) ? maximum(graph_indicator) : 1
+
     ## Possible future implementation of feature maps.
     ## Currently this doesn't play well with zygote due to
     ## https://github.com/FluxML/Zygote.jl/issues/717
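After this hunk, `num_graphs` is no longer a constructor keyword: it is inferred as `maximum(graph_indicator)` whenever an indicator vector is passed. A quick sketch of the new behavior (toy edge lists, not from the commit):

```julia
# nodes 1-3 form graph 1, nodes 4-5 form graph 2
g = GNNGraph(([1, 2, 3, 4], [2, 3, 1, 5]),
             graph_indicator = [1, 1, 1, 2, 2])

g.num_graphs  # == 2, inferred from the indicator vector
```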
@@ -154,8 +153,8 @@ GNNGraph((s, t)::NTuple{2}; kws...) = GNNGraph((s, t, nothing); kws...)

 function GNNGraph(g::AbstractGraph; kws...)
     s = LightGraphs.src.(LightGraphs.edges(g))
-    t = LightGraphs.dst.(LightGraphs.edges(g))
-    GNNGraph((s, t); kws...)
+    t = LightGraphs.dst.(LightGraphs.edges(g))
+    GNNGraph((s, t); num_nodes = nv(g), kws...)
 end

 function GNNGraph(g::GNNGraph;
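The added `num_nodes = nv(g)` keyword matters for LightGraphs inputs with trailing isolated vertices, which cannot be recovered from the edge list alone. A sketch (not from the commit):

```julia
using LightGraphs, GraphNeuralNetworks

lg = SimpleGraph(5)    # 5 vertices, no edges yet
add_edge!(lg, 1, 2)    # vertices 3-5 remain isolated

g = GNNGraph(lg)
g.num_nodes  # == 5 via nv(lg); the edge list alone would suggest 2
```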
@@ -436,36 +435,77 @@ function _catgraphs(g1::GNNGraph{<:COO_T}, g2::GNNGraph{<:COO_T})
     )
 end

-# Cat public interfaces
+### Cat public interfaces #############

-```
+"""
     blockdiag(xs::GNNGraph...)

 Batch together multiple `GNNGraph`s into a single one
 containing the total number of nodes and edges of the original graphs.

 Equivalent to [`Flux.batch`](@ref).
-```
+"""
 function SparseArrays.blockdiag(g1::GNNGraph, gothers::GNNGraph...)
-    @assert length(gothers) >= 1
     g = g1
     for go in gothers
         g = _catgraphs(g, go)
     end
     return g
 end

-```
+"""
     batch(xs::Vector{<:GNNGraph})

 Batch together multiple `GNNGraph`s into a single one
 containing the total number of nodes and edges of the original graphs.

 Equivalent to [`SparseArrays.blockdiag`](@ref).
-```
+"""
 Flux.batch(xs::Vector{<:GNNGraph}) = blockdiag(xs...)
 #########################

+"""
+    subgraph(g::GNNGraph, i)
+
+Return the subgraph of `g` induced by those nodes `v`
+for which `g.graph_indicator[v] ∈ i`. In other words, it
+extracts the component graphs from a batched graph.
+
+It also returns a vector `nodes` mapping the new nodes to the old ones.
+The node `i` in the subgraph corresponds to the node `nodes[i]` in `g`.
+"""
+subgraph(g::GNNGraph, i::Int) = subgraph(g::GNNGraph{<:COO_T}, [i])
+
+function subgraph(g::GNNGraph{<:COO_T}, i::AbstractVector)
+    node_mask = g.graph_indicator .∈ Ref(i)
+
+    nodes = (1:g.num_nodes)[node_mask]
+    nodemap = Dict(v => vnew for (vnew, v) in enumerate(nodes))
+
+    graphmap = Dict(i => inew for (inew, i) in enumerate(i))
+    graph_indicator = [graphmap[i] for i in g.graph_indicator[node_mask]]
+
+    s, t, w = g.graph
+    edge_mask = s .∈ Ref(nodes)
+    s = [nodemap[i] for i in s[edge_mask]]
+    t = [nodemap[i] for i in t[edge_mask]]
+    w = isnothing(w) ? nothing : w[edge_mask]
+    nf = isnothing(g.nf) ? nothing : g.nf[:, node_mask]
+    ef = isnothing(g.ef) ? nothing : g.ef[:, edge_mask]
+    gf = isnothing(g.gf) ? nothing : g.gf[:, i]
+
+    num_nodes = length(graph_indicator)
+    num_edges = length(s)
+    num_graphs = length(i)
+
+    gnew = GNNGraph((s, t, w),
+                    num_nodes, num_edges, num_graphs,
+                    graph_indicator,
+                    nf, ef, gf)
+    return gnew, nodes
+end
+
 @non_differentiable normalized_laplacian(x...)
 @non_differentiable normalized_adjacency(x...)
 @non_differentiable scaled_laplacian(x...)
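To see how the new `subgraph` composes with the batching interface above, here is a minimal round-trip sketch (graph sizes and the 3-dimensional node features are made up for illustration):

```julia
using GraphNeuralNetworks, Flux, LightGraphs

g1 = GNNGraph(erdos_renyi(10, 30), nf = rand(Float32, 3, 10))
g2 = GNNGraph(erdos_renyi(15, 40), nf = rand(Float32, 3, 15))

g = Flux.batch([g1, g2])   # batched graph with g.num_graphs == 2

# extract the second component; `nodes` maps its node ids back to ids in `g`
gsub, nodes = subgraph(g, 2)
gsub.num_nodes  # == 15
```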

src/layers/pool.jl

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ X = rand(32, 10)
 pool(g, X) # => 32x1 matrix
 ```
 """
-struct GlobalPool{F}
+struct GlobalPool{F} <: GNNLayer
     aggr::F
 end
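Subtyping `GNNLayer` is what lets `GlobalPool` slot into a `GNNChain` between graph convolutions, as the new example script does. A small sketch (dimensions are illustrative, not from the commit):

```julia
using GraphNeuralNetworks, Flux, LightGraphs, Statistics

g = GNNGraph(erdos_renyi(10, 30))
X = rand(Float32, 32, 10)   # 32 features for each of the 10 nodes

model = GNNChain(GCNConv(32 => 64, relu),
                 GlobalPool(mean),   # graph-level readout: one 64-vector per graph
                 Dense(64, 1))

model(g, X)  # 1×1 output for this single graph
```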
