dataloader support

CarloLucibello · CarloLucibello · commit 401147e30df8 · 2021-09-09T11:17:46.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -15,6 +15,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
+PrettyPrint = "8162dcfd-2161-5ef2-ae6c-7681170c5f98"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
diff --git a/docs/make.jl b/docs/make.jl
@@ -4,7 +4,7 @@ using Documenter
 DocMeta.setdocmeta!(GraphNeuralNetworks, :DocTestSetup, :(using GraphNeuralNetworks); recursive=true)
 
 makedocs(;
-    modules=[GraphNeuralNetworks],
+    modules=[GraphNeuralNetworks, NNlib],
     sitename = "GraphNeuralNetworks.jl",
     pages = ["Home" => "index.md",
              "GNNGraph" => "gnngraph.md",
diff --git a/docs/src/gnngraph.md b/docs/src/gnngraph.md
@@ -2,7 +2,78 @@
 
 TODO
 
-```@docs
-GNNGraph
+## Graph Creation
+
+
+```julia
+using GraphNeuralNetworks, LightGraphs, SparseArrays
+
+
+# From a LightGraphs graph
+lg_graph = erdos_renyi(10, 0.3)
+g = GNNGraph(lg_graph)
+
+
+# From adjacency matrix
+A = sprand(10, 10, 0.3)
+
+g = GNNGraph(A)
+
+@assert adjacency_matrix(g) == A
+
+# From adjacency list
+adjlist = [[] [] [] ]
+
+g = GNNGraph(adjlist)
+
+@assert sort.(adjacency_list(g)) == sort.(adjlist)
+
+# From COO representation
+source = []
+target = []
+g = GNNGraph(source, target)
+@assert edge_index(g) == (source, target)
+```
+
+We have also seen some useful methods such as [`adjacency_matrix`](@ref) and [`edge_index`](@ref).
+
+
+
+## Data Features
+
+```julia
+GNNGraph(sprand(10, 10, 0.3), ndata = (; X=rand(32, 10)))
+# or equivalently
+GNNGraph(sprand(10, 10, 0.3), ndata=rand(32, 10))
+
+
+g = GNNGraph(sprand(10, 10, 0.3), ndata = (X=rand(32, 10), y=rand(10)))
+
+g = GNNGraph(g, edata=rand(6, g.num_edges))
+```
+
+
+## Graph Manipulation
+
+```julia
+g = add_self_loops(g)
+
+g = remove_self_loops(g)
+```
+
+## Batches and Subgraphs
+
+```julia
+g = Flux.batch([g1, g2, g3])
+
+subgraph(g, 2:3)
+```
+
+
+## LightGraphs integration
+
+```julia
+@assert LightGraphs.is_directed(g)
 ```
 
+## Other methods
diff --git a/examples/graph_classification_tudataset.jl b/examples/graph_classification_tudataset.jl
@@ -7,49 +7,39 @@ using Flux.Data: DataLoader
 using GraphNeuralNetworks
 using MLDatasets: TUDataset
 using Statistics, Random
+using LearnBase: getobs
 using CUDA
 CUDA.allowscalar(false)
 
 function eval_loss_accuracy(model, data_loader, device)
     loss = 0.
     acc = 0.
     ntot = 0
-    for (g, X, y) in data_loader
-        g, X, y = g |> device, X |> device, y |> device
-        n = length(y) 
-        ŷ = model(g, X) |> vec
+    for g in data_loader
+        g = g |> device
+        n = g.num_graphs
+        y = g.gdata.y
+        ŷ = model(g, g.ndata.X) |> vec
         loss += logitbinarycrossentropy(ŷ, y) * n 
         acc += mean((2 .* ŷ .- 1) .* (2 .* y .- 1) .> 0) * n
         ntot += n
-    end        
+    end 
     return (loss = round(loss/ntot, digits=4), acc = round(acc*100/ntot, digits=2))
 end
 
-struct GNNData
-    g
-    X
-    y
-end
-
-Base.getindex(data::GNNData, i::Int) = getindex(data, [i])
-
-function Base.getindex(data::GNNData, i::AbstractVector)
-    sg, nodemap = subgraph(data.g, i)
-    return (sg, data.X[:,nodemap], data.y[i])
-end
-
-# Flux's Dataloader compatibility. Related PR https://github.com/FluxML/Flux.jl/pull/1683
-Flux.Data._nobs(data::GNNData) = data.g.num_graphs
-Flux.Data._getobs(data::GNNData, i) = data[i] 
-
-function process_dataset(data)
-    g = GNNGraph(data.source, data.target, num_nodes=data.num_nodes, graph_indicator=data.graph_indicator)
+function getdataset()
+    data = TUDataset("MUTAG")
+    
     X = Array{Float32}(onehotbatch(data.node_labels, 0:6))
-    # The dataset also has edge features but we won't be using them
-    # E = Array{Float32}(onehotbatch(data.edge_labels, sort(unique(data.edge_labels))))
     y = (1 .+ Array{Float32}(data.graph_labels)) ./ 2
     @assert all(∈([0,1]), y) # binary classification 
-    return GNNData(g, X, y)
+    # The dataset also has edge features: they are stored in `edata` but not used for training
+    E = Array{Float32}(onehotbatch(data.edge_labels, sort(unique(data.edge_labels))))
+    
+    return GNNGraph(data.source, data.target, 
+                num_nodes=data.num_nodes, 
+                graph_indicator=data.graph_indicator,
+                ndata=(; X), edata=(; E), gdata=(; y))
 end
 
 # arguments for the `train` function 
@@ -78,23 +68,25 @@ function train(; kws...)
 
     # LOAD DATA
 
+    
     NUM_TRAIN = 150
-    full_data = TUDataset("MUTAG")
     
+    gfull = getdataset()
+
     @info "MUTAG DATASET
-            num_nodes: $(full_data.num_nodes)  
-            num_edges: $(full_data.num_edges)  
-            num_graphs: $(full_data.num_graphs)"
-    
-    perm = randperm(full_data.num_graphs)
-    dtrain = process_dataset(full_data[perm[1:NUM_TRAIN]]) 
-    dtest = process_dataset(full_data[perm[NUM_TRAIN+1:end]]) 
-    train_loader = DataLoader(dtrain, batchsize=args.batchsize, shuffle=true)
-    test_loader = DataLoader(dtest, batchsize=args.batchsize, shuffle=false)
+            num_nodes: $(gfull.num_nodes)  
+            num_edges: $(gfull.num_edges)  
+            num_graphs: $(gfull.num_graphs)"
+
+    perm = randperm(gfull.num_graphs)
+    gtrain = getobs(gfull, perm[1:NUM_TRAIN])
+    gtest = getobs(gfull, perm[NUM_TRAIN+1:end]) 
+    train_loader = DataLoader(gtrain, batchsize=args.batchsize, shuffle=true)
+    test_loader = DataLoader(gtest, batchsize=args.batchsize, shuffle=false)
     
     # DEFINE MODEL
 
-    nin = size(dtrain.X, 1)
+    nin = size(gtrain.ndata.X, 1)
     nhidden = args.nhidden
     
     model = GNNChain(GraphConv(nin => nhidden, relu),
@@ -119,11 +111,11 @@ function train(; kws...)
     
     report(0)
     for epoch in 1:args.epochs
-        for (g, X, y) in train_loader
-            g, X, y = g |> device, X |> device, y |> device
+        for g in train_loader
+            g = g |> device
             gs = Flux.gradient(ps) do
-                ŷ = model(g, X) |> vec
-                logitbinarycrossentropy(ŷ, y)
+                ŷ = model(g, g.ndata.X) |> vec
+                logitbinarycrossentropy(ŷ, g.gdata.y)
             end
             Flux.Optimise.update!(opt, ps, gs)
         end
@@ -132,4 +124,4 @@ function train(; kws...)
     end
 end
 
-train()
+# train()
diff --git a/src/GraphNeuralNetworks.jl b/src/GraphNeuralNetworks.jl
@@ -7,8 +7,9 @@ import KrylovKit
 using Base: tail
 using CUDA
 using Flux
-using Flux: glorot_uniform, leakyrelu, GRUCell, @functor
+using Flux: glorot_uniform, leakyrelu, GRUCell, @functor, batch
 using MacroTools: @forward
+import LearnBase
 using LearnBase: getobs
 using NNlib, NNlibCUDA
 using ChainRulesCore
@@ -26,6 +27,8 @@ export
 
     # from LightGraphs
     adjacency_matrix, 
+    # from SparseArrays
+    sprand, sparse, 
 
     # msgpass
     # update, update_edge, update_global, message, propagate,
diff --git a/src/gnngraph.jl b/src/gnngraph.jl
@@ -418,7 +418,17 @@ containing the total number of nodes and edges of the original graphs.
 Equivalent to [`SparseArrays.blockdiag`](@ref).
 """
 Flux.batch(xs::Vector{<:GNNGraph}) = blockdiag(xs...)
+
+### LearnBase compatibility
+LearnBase.nobs(g::GNNGraph) = g.num_graphs 
+LearnBase.getobs(g::GNNGraph, i) = subgraph(g, i)[1]
+
+# Flux's DataLoader compatibility. Related PR https://github.com/FluxML/Flux.jl/pull/1683
+Flux.Data._nobs(g::GNNGraph) = g.num_graphs
+Flux.Data._getobs(g::GNNGraph, i) = subgraph(g, i)[1]
+
 #########################
+Base.:(==)(g1::GNNGraph, g2::GNNGraph) = all(k -> getfield(g1,k)==getfield(g2,k), fieldnames(typeof(g1)))
 
 """
     subgraph(g::GNNGraph, i)
@@ -432,7 +442,12 @@ The node `i` in the subgraph corresponds to the node `nodes[i]` in `g`.
 """
 subgraph(g::GNNGraph, i::Int) = subgraph(g::GNNGraph{<:COO_T}, [i])
 
-function subgraph(g::GNNGraph{<:COO_T}, i::AbstractVector)
+function subgraph(g::GNNGraph{<:COO_T}, i::AbstractVector{Int})
+    if g.graph_indicator === nothing
+        @assert i == [1]
+        return g
+    end
+
     node_mask = g.graph_indicator .∈ Ref(i)
     
     nodes = (1:g.num_nodes)[node_mask]
@@ -446,8 +461,9 @@ function subgraph(g::GNNGraph{<:COO_T}, i::AbstractVector)
     s = [nodemap[i] for i in s[edge_mask]]
     t = [nodemap[i] for i in t[edge_mask]]
     w = isnothing(w) ? nothing : w[edge_mask]
+    
     ndata = getobs(g.ndata, node_mask)
-    edata = getobs(g.ndata, edge_mask)
+    edata = getobs(g.edata, edge_mask)
     gdata = getobs(g.gdata, i)
 
     num_nodes = length(graph_indicator)
@@ -461,7 +477,6 @@ function subgraph(g::GNNGraph{<:COO_T}, i::AbstractVector)
     return gnew, nodes
 end
 
-### TO DEPRECATE ?? ###
 function node_features(g::GNNGraph)
     if isempty(g.ndata)
         return nothing
@@ -491,7 +506,6 @@ function global_features(g::GNNGraph)
         return g.gdata[1]
     end
 end
-#########
 
 
 @non_differentiable normalized_laplacian(x...)
diff --git a/src/layers/pool.jl b/src/layers/pool.jl
@@ -8,20 +8,25 @@ Takes a graph and feature nodes as inputs
 and performs the operation
 
 ```math
-\mathbf{u}_V = \box_{i \in V} \mathbf{x}_i
-````
+\mathbf{u}_V = \square_{i \in V} \mathbf{x}_i
+```
 where ``V`` is the set of nodes of the input graph and 
-the type of aggregation represented by `\box` is selected by the `aggr` argument. 
+the type of aggregation represented by ``\square`` is selected by the `aggr` argument. 
 Commonly used aggregations are are `mean`, `max`, and `+`.
 
 ```julia
-using GraphNeuralNetworks, LightGraphs
+using Flux, GraphNeuralNetworks, LightGraphs
 
 pool = GlobalPool(mean)
 
-g = GNNGraph(random_regular_graph(10, 4))
+g = GNNGraph(erdos_renyi(10, 4))
 X = rand(32, 10)
 pool(g, X) # => 32x1 matrix
+
+
+g = Flux.batch([GNNGraph(erdos_renyi(10, 4)) for _ in 1:5])
+X = rand(32, 50)
+pool(g, X) # => 32x5 matrix
 ```
 """
 struct GlobalPool{F} <: GNNLayer
diff --git a/src/msgpass.jl b/src/msgpass.jl
diff --git a/test/gnngraph.jl b/test/gnngraph.jl