diff --git a/.gitignore b/.gitignore index 3b60b63b8..daa7226d7 100644 --- a/.gitignore +++ b/.gitignore @@ -18,5 +18,6 @@ GraphNeuralNetworks/docs/build GraphNeuralNetworks/docs/src/GNNGraphs GraphNeuralNetworks/docs/src/GNNlib tutorials/docs/build +docs/src/tutorials # generated by DemoCards from docs/tutorials/ prova.jl pyg.ipynb diff --git a/GraphNeuralNetworks/docs/Project.toml b/GraphNeuralNetworks/docs/Project.toml index b4814d9e0..62c4ff8d3 100644 --- a/GraphNeuralNetworks/docs/Project.toml +++ b/GraphNeuralNetworks/docs/Project.toml @@ -1,8 +1,10 @@ [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +DemoCards = "311a05b2-6137-4a5a-b473-18580a3d38b5" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterInterLinks = "d12716ef-a0f6-4df4-a9f1-a5a34e75c656" +FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" GNNGraphs = "aed8fd31-079b-4b5a-b342-a13352159b8c" GNNlib = "a6a84749-d869-43f8-aacc-be26a1996e48" @@ -14,8 +16,6 @@ Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458" MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" -PlutoStaticHTML = "359b1769-a58e-495b-9770-312e911026ad" -PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/GraphNeuralNetworks/docs/make.jl b/GraphNeuralNetworks/docs/make.jl index d07c41d7a..5579ad80d 100644 --- a/GraphNeuralNetworks/docs/make.jl +++ b/GraphNeuralNetworks/docs/make.jl @@ -8,6 +8,7 @@ Pkg.develop([ Pkg.instantiate() using Documenter +using DemoCards: DemoCards using GraphNeuralNetworks using Flux, GNNGraphs, GNNlib, Graphs using DocumenterInterLinks @@ -39,14 +40,25 @@ cp(joinpath(@__DIR__, "../../GNNGraphs/docs/src"), cp(joinpath(@__DIR__, "../../GNNlib/docs/src"), joinpath(@__DIR__, "src/GNNlib"), force=true) + +## DEMO CARDS AUTOMATICALLY DETECTS TUTORIALS FROM FOLDER STRUCTURE +tutorials, tutorials_postprocess_cb, tutorials_assets = DemoCards.makedemos(joinpath(@__DIR__, "tutorials")) +## UNCOMMENT TO DISABLE TUTORIALS AND SPEED UP DOCS BUILDING +# tutorials, tutorials_postprocess_cb, tutorials_assets = + # "Tutorials" => "index.md", () -> nothing, nothing + +assets = [] +isnothing(tutorials_assets) || push!(assets, tutorials_assets) + makedocs(; modules = [GraphNeuralNetworks, GNNGraphs, GNNlib], plugins = [interlinks], format = Documenter.HTML(; mathengine, prettyurls = get(ENV, "CI", nothing) == "true", - assets = [], + assets, size_threshold=nothing, - size_threshold_warn=2000000), + size_threshold_warn=2000000, + example_size_threshold=2000000), sitename = "GraphNeuralNetworks.jl", pages = [ @@ -60,19 +72,7 @@ makedocs(; "Heterogeneous Graphs" => "GNNGraphs/guides/heterograph.md", "Temporal Graphs" => "GNNGraphs/guides/temporalgraph.md", ], - - "Tutorials" => [ - "Introductory tutorials" => [ - "Hands on" => "tutorials/gnn_intro.md", - "Node classification" => "tutorials/node_classification.md", - "Graph classification" => "tutorials/graph_classification.md" - ], - "Temporal graph neural networks" =>[ - "Node autoregression" => "tutorials/traffic_prediction.md", - "Temporal graph classification" => "tutorials/temporal_graph_classification.md" - ], - ], - + tutorials, "API Reference" => [ "Graphs (GNNGraphs.jl)" => [ "GNNGraph" => "GNNGraphs/api/gnngraph.md", @@ -99,6 +99,7 @@ 
makedocs(; ], ) +tutorials_postprocess_cb() rm(joinpath(@__DIR__, "src/GNNGraphs"), force=true, recursive=true) rm(joinpath(@__DIR__, "src/GNNlib"), force=true, recursive=true) diff --git a/GraphNeuralNetworks/docs/make_tutorials_literate.jl b/GraphNeuralNetworks/docs/old_tutorials/make_tutorials_literate.jl similarity index 100% rename from GraphNeuralNetworks/docs/make_tutorials_literate.jl rename to GraphNeuralNetworks/docs/old_tutorials/make_tutorials_literate.jl diff --git a/GraphNeuralNetworks/docs/make_tutorials_pluto.jl b/GraphNeuralNetworks/docs/old_tutorials/make_tutorials_pluto.jl similarity index 100% rename from GraphNeuralNetworks/docs/make_tutorials_pluto.jl rename to GraphNeuralNetworks/docs/old_tutorials/make_tutorials_pluto.jl diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/temporal_graph_classification.jl b/GraphNeuralNetworks/docs/old_tutorials/temporal_graph_classification.jl similarity index 100% rename from GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/temporal_graph_classification.jl rename to GraphNeuralNetworks/docs/old_tutorials/temporal_graph_classification.jl diff --git a/GraphNeuralNetworks/docs/src/democards/gridtheme.css b/GraphNeuralNetworks/docs/src/democards/gridtheme.css new file mode 100644 index 000000000..4d02fc844 --- /dev/null +++ b/GraphNeuralNetworks/docs/src/democards/gridtheme.css @@ -0,0 +1,59 @@ +.grid-card-section { + display: flex; + flex-direction: row; + flex-wrap: wrap; + align-content: space-between; +} + +.grid-card:hover{ + box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.4), 0 6px 20px 0 rgba(0, 0, 0, 0.1); +} + +.grid-card { + width: 210px; + max-height: 400px; + margin: 10px 15px; + box-shadow: 0 4px 8px 0 rgba(0,0,0,0.2); + transition: 0.3s; + border-radius: 5px; +} + +.grid-card-text { + padding: 0 15px; +} + +.grid-card-cover img { + width: 100%; +} + +.grid-card-cover { + width: 200px; + height: 220px; + padding: 5px; + box-shadow: 0 2px 4px 0 rgba(0, 0, 0, 0.2); + transition: 0.3s; + border-radius: 5px; + display:block; + margin:auto; +} + +.grid-card-cover .grid-card-description { + opacity: 0; + z-index: -1; + position: absolute; + top: 25%; + left: 140%; + width: 100%; + transform: translate(-50%, -50%); + padding: 10px; + border-radius: 5px; + background: rgba(0, 0, 0, 0.8); + color: #fff; + text-align: center; + font-size: 14px; +} + +.grid-card-cover:hover .grid-card-description{ + z-index: 3; + opacity: 1; +} diff --git a/GraphNeuralNetworks/docs/src/tutorials/gnn_intro.md b/GraphNeuralNetworks/docs/src/tutorials/gnn_intro.md deleted file mode 100644 index 5eb908d73..000000000 --- a/GraphNeuralNetworks/docs/src/tutorials/gnn_intro.md +++ /dev/null @@ -1,390 +0,0 @@ -# Hands-on introduction to Graph Neural Networks - -*This tutorial is a Julia adaptation of the Pytorch Geometric tutorials that can be found [here](https://pytorch-geometric.readthedocs.io/en/latest/notes/colabs.html).* - -Recently, deep learning on graphs has emerged to one of the hottest research fields in the deep learning community. -Here, **Graph Neural Networks (GNNs)** aim to generalize classical deep learning concepts to irregular structured data (in contrast to images or texts) and to enable neural networks to reason about objects and their relations. 
- -This is done by following a simple **neural message passing scheme**, where node features $\mathbf{x}_i^{(\ell)}$ of all nodes $i \in \mathcal{V}$ in a graph $\mathcal{G} = (\mathcal{V}, \mathcal{E})$ are iteratively updated by aggregating localized information from their neighbors $\mathcal{N}(i)$: - -```math -\mathbf{x}_i^{(\ell + 1)} = f^{(\ell + 1)}_{\theta} \left( \mathbf{x}_i^{(\ell)}, \left\{ \mathbf{x}_j^{(\ell)} : j \in \mathcal{N}(i) \right\} \right) -``` - -This tutorial will introduce you to some fundamental concepts regarding deep learning on graphs via Graph Neural Networks based on the **[GraphNeuralNetworks.jl library](https://github.com/JuliaGraphs/GraphNeuralNetworks.jl)**. -GraphNeuralNetworks.jl is an extension library to the popular deep learning framework [Flux.jl](https://fluxml.ai/Flux.jl/stable/), and consists of various methods and utilities to ease the implementation of Graph Neural Networks. - -Let's first import the packages we need: - -````julia -using Flux, GraphNeuralNetworks -using Flux: onecold, onehotbatch, logitcrossentropy -using MLDatasets -using LinearAlgebra, Random, Statistics -import GraphMakie -import CairoMakie as Makie - -ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation -rng = Random.seed!(17); # for reproducibility -```` - -Following [Kipf et al. (2017)](https://arxiv.org/abs/1609.02907), let's dive into the world of GNNs by looking at a simple graph-structured example, the well-known [**Zachary's karate club network**](https://en.wikipedia.org/wiki/Zachary%27s_karate_club). This graph describes a social network of 34 members of a karate club and documents links between members who interacted outside the club. Here, we are interested in detecting communities that arise from the members' interactions. -GraphNeuralNetworks.jl provides utilities to convert [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl)'s datasets to its own type: - -````julia -dataset = MLDatasets.KarateClub() -```` - -```` -dataset KarateClub: - metadata => Dict{String, Any} with 0 entries - graphs => 1-element Vector{MLDatasets.Graph} -```` - -After initializing the `KarateClub` dataset, we can first inspect some of its properties. -For example, we can see that this dataset holds exactly **one graph**. -Furthermore, the graph holds exactly **4 classes**, which represent the community each node belongs to. - -````julia -karate = dataset[1] - -karate.node_data.labels_comm -```` - -```` -34-element Vector{Int64}: - 1 - 1 - 1 - 1 - 3 - 3 - 3 - 1 - 0 - 1 - 3 - 1 - 1 - 1 - 0 - 0 - 3 - 1 - 0 - 1 - 0 - 1 - 0 - 0 - 2 - 2 - 0 - 0 - 2 - 0 - 0 - 2 - 0 - 0 -```` - -Now we convert the single-graph dataset to a `GNNGraph`. Moreover, we add an array of node features, a **34-dimensional feature vector** for each node which uniquely describes the members of the karate club. We also add a training mask selecting the nodes to be used for training in our semi-supervised node classification task.
- -````julia -g = mldataset2gnngraph(dataset) # convert a MLDatasets.jl's dataset to a GNNGraphs (or a collection of graphs) - -x = zeros(Float32, g.num_nodes, g.num_nodes) -x[diagind(x)] .= 1 - -train_mask = [true, false, false, false, true, false, false, false, true, - false, false, false, false, false, false, false, false, false, false, false, - false, false, false, false, true, false, false, false, false, false, - false, false, false, false] - -labels = g.ndata.labels_comm -y = onehotbatch(labels, 0:3) - -g = GNNGraph(g, ndata = (; x, y, train_mask)) -```` - -```` -GNNGraph: - num_nodes: 34 - num_edges: 156 - ndata: - y = 4×34 OneHotMatrix(::Vector{UInt32}) with eltype Bool - train_mask = 34-element Vector{Bool} - x = 34×34 Matrix{Float32} -```` - -Let's now look at the underlying graph in more detail: - -````julia -println("Number of nodes: $(g.num_nodes)") -println("Number of edges: $(g.num_edges)") -println("Average node degree: $(g.num_edges / g.num_nodes)") -println("Number of training nodes: $(sum(g.ndata.train_mask))") -println("Training node label rate: $(mean(g.ndata.train_mask))") -println("Has isolated nodes: $(has_isolated_nodes(g))") -println("Has self-loops: $(has_self_loops(g))") -println("Is undirected: $(is_bidirected(g))") -```` - -```` -Number of nodes: 34 -Number of edges: 156 -Average node degree: 4.588235294117647 -Number of training nodes: 4 -Training node label rate: 0.11764705882352941 -Has isolated nodes: false -Has self-loops: false -Is undirected: true - -```` - -Each graph in GraphNeuralNetworks.jl is represented by a `GNNGraph` object, which holds all the information to describe its graph representation. -We can print the data object anytime via `print(g)` to receive a short summary about its attributes and their shapes. - -The `g` object holds 3 attributes: -- `g.ndata`: contains node-related information. -- `g.edata`: holds edge-related information. -- `g.gdata`: this stores the global data, therefore neither node nor edge-specific features. - -These attributes are `NamedTuples` that can store multiple feature arrays: we can access a specific set of features e.g. `x`, with `g.ndata.x`. - -In our task, `g.ndata.train_mask` describes for which nodes we already know their community assignments. In total, we are only aware of the ground-truth labels of 4 nodes (one for each community), and the task is to infer the community assignment for the remaining nodes. - -The `g` object also provides some **utility functions** to infer some basic properties of the underlying graph. -For example, we can easily infer whether there exist isolated nodes in the graph (*i.e.* there exists no edge to any node), whether the graph contains self-loops (*i.e.*, $(v, v) \in \mathcal{E}$), or whether the graph is bidirected (*i.e.*, for each edge $(v, w) \in \mathcal{E}$ there also exists the edge $(w, v) \in \mathcal{E}$). 
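As a quick illustration of how the feature arrays stored in `ndata` can be accessed, here is a small sketch (the expected sizes are the ones reported in the graph summary printed above):

````julia
size(g.ndata.x)          # (34, 34): one 34-dimensional feature vector per node
size(g.ndata.y)          # (4, 34): one-hot community labels
sum(g.ndata.train_mask)  # 4 labeled nodes available for training
````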
- -Let us now inspect the `edge_index` method: - -````julia -edge_index(g) -```` - -```` -([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 11, 11, 11, 12, 13, 13, 14, 14, 14, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34], [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 18, 20, 22, 32, 1, 3, 4, 8, 14, 18, 20, 22, 31, 1, 2, 4, 8, 9, 10, 14, 28, 29, 33, 1, 2, 3, 8, 13, 14, 1, 7, 11, 1, 7, 11, 17, 1, 5, 6, 17, 1, 2, 3, 4, 1, 3, 31, 33, 34, 3, 34, 1, 5, 6, 1, 1, 4, 1, 2, 3, 4, 34, 33, 34, 33, 34, 6, 7, 1, 2, 33, 34, 1, 2, 34, 33, 34, 1, 2, 33, 34, 26, 28, 30, 33, 34, 26, 28, 32, 24, 25, 32, 30, 34, 3, 24, 25, 34, 3, 32, 34, 24, 27, 33, 34, 2, 9, 33, 34, 1, 25, 26, 29, 33, 34, 3, 9, 15, 16, 19, 21, 23, 24, 30, 31, 32, 34, 9, 10, 14, 15, 16, 19, 20, 21, 23, 24, 27, 28, 29, 30, 31, 32, 33]) -```` - -By printing `edge_index(g)`, we can understand how GraphNeuralNetworks.jl represents graph connectivity internally. -We can see that for each edge, `edge_index` holds a tuple of two node indices, where the first value describes the node index of the source node and the second value describes the node index of the destination node of an edge. - -This representation is known as the **COO format (coordinate format)** commonly used for representing sparse matrices. -Instead of holding the adjacency information in a dense representation $\mathbf{A} \in \{ 0, 1 \}^{|\mathcal{V}| \times |\mathcal{V}|}$, GraphNeuralNetworks.jl represents graphs sparsely, which refers to only holding the coordinates/values for which entries in $\mathbf{A}$ are non-zero. - -Importantly, GraphNeuralNetworks.jl does not distinguish between directed and undirected graphs, and treats undirected graphs as a special case of directed graphs in which reverse edges exist for every entry in the `edge_index`. - -Since a `GNNGraph` is an `AbstractGraph` from the `Graphs.jl` library, it supports graph algorithms and visualization tools from the wider julia graph ecosystem: - -````julia -GraphMakie.graphplot(g |> to_unidirected, node_size = 20, node_color = labels, arrow_show = false) -```` - -```@raw html - -``` - -## Implementing Graph Neural Networks - -After learning about GraphNeuralNetworks.jl's data handling, it's time to implement our first Graph Neural Network! - -For this, we will use on of the most simple GNN operators, the **GCN layer** ([Kipf et al. (2017)](https://arxiv.org/abs/1609.02907)), which is defined as - -```math -\mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \sum_{w \in \mathcal{N}(v) \, \cup \, \{ v \}} \frac{1}{c_{w,v}} \cdot \mathbf{x}_w^{(\ell)} -``` - -where $\mathbf{W}^{(\ell + 1)}$ denotes a trainable weight matrix of shape `[num_output_features, num_input_features]` and $c_{w,v}$ refers to a fixed normalization coefficient for each edge. - -GraphNeuralNetworks.jl implements this layer via `GCNConv`, which can be executed by passing in the node feature representation `x` and the COO graph connectivity representation `edge_index`. 
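Before assembling the full model below, here is a minimal sketch of a single `GCNConv` layer applied directly to the karate club graph built above (the `34 => 4` sizes are an illustrative choice, not part of the original tutorial):

````julia
layer = GCNConv(34 => 4, tanh)   # one graph convolution: 34 input features, 4 output features
h1 = layer(g, g.ndata.x)         # 4×34 matrix: a 4-dimensional embedding for every node
````

The same `(graph, features)` calling convention is used by the composite model defined next.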
- -With this, we are ready to create our first Graph Neural Network by defining our network architecture: - -````julia -struct GCN - layers::NamedTuple -end - -Flux.@layer GCN # Provides parameter collection, gpu movement and more - -function GCN(num_features, num_classes) - layers = (conv1 = GCNConv(num_features => 4), - conv2 = GCNConv(4 => 4), - conv3 = GCNConv(4 => 2), - classifier = Dense(2, num_classes)) - return GCN(layers) -end; - -function (gcn::GCN)(g::GNNGraph, x::AbstractMatrix) - l = gcn.layers - x = l.conv1(g, x) - x = tanh.(x) - x = l.conv2(g, x) - x = tanh.(x) - x = l.conv3(g, x) - x = tanh.(x) # Final GNN embedding space. - out = l.classifier(x) # Apply a final (linear) classifier. - return out, x -end; -```` - -Here, we first initialize all of our building blocks in the constructor and define the computation flow of our network in the call method. -We first define and stack **three graph convolution layers**, which corresponds to aggregating 3-hop neighborhood information around each node (all nodes up to 3 "hops" away). -In addition, the `GCNConv` layers reduce the node feature dimensionality to ``2``, *i.e.*, $34 \rightarrow 4 \rightarrow 4 \rightarrow 2$. Each `GCNConv` layer is enhanced by a `tanh` non-linearity. - -After that, we apply a single linear transformation (`Flux.Dense` that acts as a classifier to map our nodes to 1 out of the 4 classes/communities. - -We return both the output of the final classifier as well as the final node embeddings produced by our GNN. -We proceed to initialize our final model via `GCN()`, and printing our model produces a summary of all its used sub-modules. - -### Embedding the Karate Club Network - -Let's take a look at the node embeddings produced by our GNN. -Here, we pass in the initial node features `x` and the graph information `g` to the model, and visualize its 2-dimensional embedding. - -````julia -num_features = 34 -num_classes = 4 -gcn = GCN(num_features, num_classes) -```` - -```` -GCN( - GCNConv(34 => 4), # 140 parameters - GCNConv(4 => 4), # 20 parameters - GCNConv(4 => 2), # 10 parameters - Dense(2 => 4), # 12 parameters -) # Total: 8 arrays, 182 parameters, 1.203 KiB. -```` - -````julia -_, h = gcn(g, g.ndata.x); -```` - -````julia -function visualize_embeddings(h; colors = nothing) - xs = h[1, :] |> vec - ys = h[2, :] |> vec - Makie.scatter(xs, ys, color = labels, markersize = 20) -end - -visualize_embeddings(h, colors = labels) -```` - -```@raw html - -``` - -Remarkably, even before training the weights of our model, the model produces an embedding of nodes that closely resembles the community-structure of the graph. -Nodes of the same color (community) are already closely clustered together in the embedding space, although the weights of our model are initialized **completely at random** and we have not yet performed any training so far! -This leads to the conclusion that GNNs introduce a strong inductive bias, leading to similar embeddings for nodes that are close to each other in the input graph. - -### Training on the Karate Club Network - -But can we do better? Let's look at an example on how to train our network parameters based on the knowledge of the community assignments of 4 nodes in the graph (one for each community). - -Since everything in our model is differentiable and parameterized, we can add some labels, train the model and observe how the embeddings react. 
-Here, we make use of a semi-supervised or transductive learning procedure: we simply train against one node per class, but are allowed to make use of the complete input graph data. - -Training our model is very similar to any other Flux model. -In addition to defining our network architecture, we define a loss criterion (here, `logitcrossentropy`), and initialize a stochastic gradient optimizer (here, `Adam`). -After that, we perform multiple rounds of optimization, where each round consists of a forward and backward pass to compute the gradients of our model parameters w.r.t. to the loss derived from the forward pass. -If you are not new to Flux, this scheme should appear familiar to you. - -Note that our semi-supervised learning scenario is achieved by the following line: -```julia -loss = logitcrossentropy(ŷ[:,train_mask], y[:,train_mask]) -``` - -While we compute node embeddings for all of our nodes, we **only make use of the training nodes for computing the loss**. -Here, this is implemented by filtering the output of the classifier `out` and ground-truth labels `data.y` to only contain the nodes in the `train_mask`. - -Let us now start training and see how our node embeddings evolve over time (best experienced by explicitly running the code): - -````julia -model = GCN(num_features, num_classes) -opt = Flux.setup(Adam(1e-2), model) -epochs = 2000 - -emb = h -function report(epoch, loss, h) - @info (; epoch, loss) -end - -report(0, 10.0, emb) -for epoch in 1:epochs - loss, grad = Flux.withgradient(model) do model - ŷ, emb = model(g, g.ndata.x) - logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) - end - - Flux.update!(opt, model, grad[1]) - if epoch % 200 == 0 - report(epoch, loss, emb) - end -end; -```` - -```` -[ Info: (epoch = 0, loss = 10.0) -[ Info: (epoch = 200, loss = 0.23355734f0) -[ Info: (epoch = 400, loss = 0.121178314f0) -[ Info: (epoch = 600, loss = 0.015975919f0) -[ Info: (epoch = 800, loss = 0.006572252f0) -[ Info: (epoch = 1000, loss = 0.003620736f0) -[ Info: (epoch = 1200, loss = 0.002298967f0) -[ Info: (epoch = 1400, loss = 0.0015922752f0) -[ Info: (epoch = 1600, loss = 0.0011724582f0) -[ Info: (epoch = 1800, loss = 0.00090479344f0) -[ Info: (epoch = 2000, loss = 0.000724492f0) - -```` - -````julia -ŷ, emb_final = model(g, g.ndata.x) -```` - -```` -(Float32[0.55356395 1.551039 5.5705533 2.7778258 0.63898975 0.6825545 0.6721916 6.125821 8.097307 7.9838457 0.6283343 1.7970009 3.1581564 7.628894 7.9750724 7.923052 0.57084787 4.591306 7.9582367 7.692672 7.950883 4.9723816 7.959677 2.842043 -6.593864 -6.5728755 7.701645 -1.8145508 5.421144 7.7040224 8.095816 -2.058383 8.124023 8.098329; 9.003047 7.975671 3.5220292 6.6122437 -8.852302 -8.942742 -8.931593 2.886099 0.6999368 0.7491364 -8.841045 7.1023135 6.0150228 1.2226517 0.48847342 0.414697 -8.823587 4.4446645 0.4644488 1.0500264 0.4528217 4.0468783 0.46605742 -2.5005548 -0.60694873 -0.6042747 0.1013584 2.9975584 2.9418213 0.11139287 0.6978008 2.7612474 0.6939737 0.7232862; 1.7117454 0.37239516 -5.089068 -1.2953348 -2.0455256 -2.1133642 -2.0993521 -5.8477736 -8.526862 -8.388386 -2.0311606 -0.08498147 -1.8481481 -7.8896513 -8.431897 -8.388157 -1.9625307 -3.7918186 -8.417771 -7.997212 -8.411835 -4.3045874 -8.419072 -3.236257 7.829445 7.806243 -8.201856 3.160245 -5.0389504 -8.202489 -8.525613 3.3877134 -8.558316 -8.523232; -8.8194895 -7.8159304 -3.4724863 -6.4863243 8.228086 8.315421 8.304541 -2.8527176 -0.72068405 -0.7702427 8.217097 -6.9765635 -5.907503 -1.2303886 -0.5216009 -0.4523579 8.199119 -4.3756385 -0.4990499 
-1.0641356 -0.48811853 -3.9871912 -0.50055283 2.2146838 0.1915166 0.1894428 -0.15828416 -3.1404116 -2.9220016 -0.16780908 -0.7186786 -2.9203842 -0.71438146 -0.7429512], Float32[0.9507177 0.7007442 -0.33878684 0.38304588 -0.89570314 -0.9113016 -0.9086557 -0.4845161 -0.9945633 -0.9730794 -0.89300394 0.5754177 0.26662543 -0.8730406 -0.9985834 -0.9986284 -0.8828861 -0.10221174 -0.9986132 -0.9000079 -0.9987426 -0.19825462 -0.9986566 -0.562294 0.99862534 0.9958607 -0.9987517 0.6767777 -0.37674817 -0.9980653 -0.99456674 0.6878139 -0.99904454 -0.9923129; 0.99127704 0.99994814 0.99982524 0.99944764 -0.99439627 -0.9991124 -0.99914926 0.99749506 0.99723953 0.9886884 -0.9944575 0.932767 0.9797799 0.99766874 0.9584594 0.9437685 -0.9996251 0.98170143 0.9536883 0.9862676 0.9514777 0.984423 0.9540466 -0.011422882 -0.96831375 -0.9654154 0.8813165 0.026573658 0.9164535 0.88273275 0.99681604 -0.030043826 0.99988157 0.99997663]) -```` - -Train accuracy: - -````julia -mean(onecold(ŷ[:, train_mask]) .== onecold(y[:, train_mask])) -```` - -```` -1.0 -```` - -Test accuracy: - -````julia -mean(onecold(ŷ[:, .!train_mask]) .== onecold(y[:, .!train_mask])) -```` - -```` -0.7 -```` - -Final embedding: - -````julia -visualize_embeddings(emb_final, colors = labels) -```` - -```@raw html - -``` - -As one can see, our 3-layer GCN model manages to linearly separating the communities and classifying most of the nodes correctly. - -Furthermore, we did this all with a few lines of code, thanks to the GraphNeuralNetworks.jl which helped us out with data handling and GNN implementations. - ---- - -*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* - diff --git a/GraphNeuralNetworks/docs/src/tutorials/node_classification.md b/GraphNeuralNetworks/docs/src/tutorials/node_classification.md deleted file mode 100644 index 7026ea281..000000000 --- a/GraphNeuralNetworks/docs/src/tutorials/node_classification.md +++ /dev/null @@ -1,5897 +0,0 @@ -# Node Classification with Graph Neural Networks - -In this tutorial, we will be learning how to use Graph Neural Networks (GNNs) for node classification. Given the ground-truth labels of only a small subset of nodes, and want to infer the labels for all the remaining nodes (transductive learning). - -## Import -Let us start off by importing some libraries. We will be using `Flux.jl` and `GraphNeuralNetworks.jl` for our tutorial. - -````julia -using Flux, GraphNeuralNetworks -using Flux: onecold, onehotbatch, logitcrossentropy -using MLDatasets -using Plots, TSne -using Statistics, Random - -ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation -Random.seed!(17); # for reproducibility -```` - -## Visualize -We want to visualize our results using t-distributed stochastic neighbor embedding (tsne) to project our output onto a 2D plane. - -````julia -function visualize_tsne(out, targets) - z = tsne(out, 2) - scatter(z[:, 1], z[:, 2], color = Int.(targets[1:size(z, 1)]), leg = false) -end; -```` - -## Dataset: Cora - -For our tutorial, we will be using the `Cora` dataset. `Cora` is a citation network of 2708 documents categorized into seven classes with 5,429 citation links. Each node represents an article or document, and edges between nodes indicate a citation relationship, where one cites the other. - -Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words. 
- -This dataset was first introduced by [Yang et al. (2016)](https://arxiv.org/abs/1603.08861) as one of the datasets of the `Planetoid` benchmark suite. We will be using [MLDatasets.jl](https://juliaml.github.io/MLDatasets.jl/stable/) for an easy access to this dataset. - -````julia -dataset = Cora() -```` - -```` -dataset Cora: - metadata => Dict{String, Any} with 3 entries - graphs => 1-element Vector{MLDatasets.Graph} -```` - -Datasets in MLDatasets.jl have `metadata` containing information about the dataset itself. - -````julia -dataset.metadata -```` - -```` -Dict{String, Any} with 3 entries: - "name" => "cora" - "classes" => [1, 2, 3, 4, 5, 6, 7] - "num_classes" => 7 -```` - -The `graphs` variable contains the graph. The `Cora` dataset contains only 1 graph. - -````julia -dataset.graphs -```` - -```` -1-element Vector{MLDatasets.Graph}: - Graph(2708, 10556) -```` - -There is only one graph of the dataset. The `node_data` contains `features` indicating if certain words are present or not and `targets` indicating the class for each document. We convert the single-graph dataset to a `GNNGraph`. - -````julia -g = mldataset2gnngraph(dataset) - -println("Number of nodes: $(g.num_nodes)") -println("Number of edges: $(g.num_edges)") -println("Average node degree: $(g.num_edges / g.num_nodes)") -println("Number of training nodes: $(sum(g.ndata.train_mask))") -println("Training node label rate: $(mean(g.ndata.train_mask))") -println("Has isolated nodes: $(has_isolated_nodes(g))") -println("Has self-loops: $(has_self_loops(g))") -println("Is undirected: $(is_bidirected(g))") -```` - -```` -Number of nodes: 2708 -Number of edges: 10556 -Average node degree: 3.8980797636632203 -Number of training nodes: 140 -Training node label rate: 0.051698670605613 -Has isolated nodes: false -Has self-loops: false -Is undirected: true - -```` - -Overall, this dataset is quite similar to the previously used [`KarateClub`](https://juliaml.github.io/MLDatasets.jl/stable/datasets/graphs/#MLDatasets.KarateClub) network. -We can see that the `Cora` network holds 2,708 nodes and 10,556 edges, resulting in an average node degree of 3.9. -For training this dataset, we are given the ground-truth categories of 140 nodes (20 for each class). -This results in a training node label rate of only 5%. - -We can further see that this network is undirected, and that there exists no isolated nodes (each document has at least one citation). - -````julia -x = g.ndata.features # we onehot encode both the node labels (what we want to predict): -y = onehotbatch(g.ndata.targets, 1:7) -train_mask = g.ndata.train_mask -num_features = size(x)[1] -hidden_channels = 16 -num_classes = dataset.metadata["num_classes"]; -```` - -## Multi-layer Perception Network (MLP) - -In theory, we should be able to infer the category of a document solely based on its content, *i.e.* its bag-of-words feature representation, without taking any relational information into account. 
- -Let's verify that by constructing a simple MLP that solely operates on input node features (using shared weights across all nodes): - -````julia -struct MLP - layers::NamedTuple -end - -Flux.@layer :expand MLP - -function MLP(num_features, num_classes, hidden_channels; drop_rate = 0.5) - layers = (hidden = Dense(num_features => hidden_channels), - drop = Dropout(drop_rate), - classifier = Dense(hidden_channels => num_classes)) - return MLP(layers) -end; - -function (model::MLP)(x::AbstractMatrix) - l = model.layers - x = l.hidden(x) - x = relu(x) - x = l.drop(x) - x = l.classifier(x) - return x -end; -```` - -### Training a Multilayer Perceptron - -Our MLP is defined by two linear layers and enhanced by [ReLU](https://fluxml.ai/Flux.jl/stable/models/nnlib/#NNlib.relu) non-linearity and [Dropout](https://fluxml.ai/Flux.jl/stable/models/layers/#Flux.Dropout). -Here, we first reduce the 1433-dimensional feature vector to a low-dimensional embedding (`hidden_channels=16`), while the second linear layer acts as a classifier that should map each low-dimensional node embedding to one of the 7 classes. - -Let's train our simple MLP by following a similar procedure as described in [the first part of this tutorial](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/tutorials/gnn_intro/). -We again make use of the **cross entropy loss** and **Adam optimizer**. -This time, we also define an **`accuracy` function** to evaluate how well our final model performs on the test node set (whose labels have not been observed during training). - -````julia -function train(model::MLP, data::AbstractMatrix, epochs::Int, opt) - Flux.trainmode!(model) - - for epoch in 1:epochs - loss, grad = Flux.withgradient(model) do model - ŷ = model(data) - logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) - end - - Flux.update!(opt, model, grad[1]) - if epoch % 200 == 0 - @show epoch, loss - end - end -end; - -function accuracy(model::MLP, x::AbstractMatrix, y::Flux.OneHotArray, mask::BitVector) - Flux.testmode!(model) - mean(onecold(model(x))[mask] .== onecold(y)[mask]) -end; - -mlp = MLP(num_features, num_classes, hidden_channels) -opt_mlp = Flux.setup(Adam(1e-3), mlp) -epochs = 2000 -train(mlp, g.ndata.features, epochs, opt_mlp) -```` - -```` -(epoch, loss) = (200, 0.34664646f0) -(epoch, loss) = (400, 0.17937787f0) -(epoch, loss) = (600, 0.18458092f0) -(epoch, loss) = (800, 0.23548584f0) -(epoch, loss) = (1000, 0.17101304f0) -(epoch, loss) = (1200, 0.17516974f0) -(epoch, loss) = (1400, 0.15338397f0) -(epoch, loss) = (1600, 0.17862f0) -(epoch, loss) = (1800, 0.18102548f0) -(epoch, loss) = (2000, 0.15243146f0) - -```` - -After training the model, we can call the `accuracy` function to see how well our model performs on unseen labels. -Here, we are interested in the accuracy of the model, *i.e.*, the ratio of correctly classified nodes: - -````julia -accuracy(mlp, g.ndata.features, y, .!train_mask) -```` - -```` -0.5408878504672897 -```` - -As one can see, our MLP performs rather badly, with only about 54% test accuracy. -But why does the MLP not perform better? -The main reason is that this model suffers from heavy overfitting due to only having access to a **small number of training nodes**, and therefore generalizes poorly to unseen node representations. - -It also fails to incorporate an important bias into the model: **Cited papers are very likely related to the category of a document**.
-That is exactly where Graph Neural Networks come into play and can help to boost the performance of our model. - -## Training a Graph Convolutional Neural Network (GNN) - -Following-up on the first part of this tutorial, we replace the `Dense` linear layers by the [`GCNConv`](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/api/conv/#GraphNeuralNetworks.GCNConv) module. -To recap, the **GCN layer** ([Kipf et al. (2017)](https://arxiv.org/abs/1609.02907)) is defined as - -```math -\mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \sum_{w \in \mathcal{N}(v) \, \cup \, \{ v \}} \frac{1}{c_{w,v}} \cdot \mathbf{x}_w^{(\ell)} -``` - -where $\mathbf{W}^{(\ell + 1)}$ denotes a trainable weight matrix of shape `[num_output_features, num_input_features]` and $c_{w,v}$ refers to a fixed normalization coefficient for each edge. -In contrast, a single `Linear` layer is defined as - -```math -\mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \mathbf{x}_v^{(\ell)} -``` - -which does not make use of neighboring node information. - -````julia -struct GCN - layers::NamedTuple -end - -Flux.@layer GCN # provides parameter collection, gpu movement and more - -function GCN(num_features, num_classes, hidden_channels; drop_rate = 0.5) - layers = (conv1 = GCNConv(num_features => hidden_channels), - drop = Dropout(drop_rate), - conv2 = GCNConv(hidden_channels => num_classes)) - return GCN(layers) -end; - -function (gcn::GCN)(g::GNNGraph, x::AbstractMatrix) - l = gcn.layers - x = l.conv1(g, x) - x = relu.(x) - x = l.drop(x) - x = l.conv2(g, x) - return x -end; -```` - -Now let's visualize the node embeddings of our **untrained** GCN network. - -````julia -gcn = GCN(num_features, num_classes, hidden_channels) -h_untrained = gcn(g, x) |> transpose -visualize_tsne(h_untrained, g.ndata.targets) -```` - -```@raw html - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
-``` - -We certainly can do better by training our model. -The training and testing procedure is once again the same, but this time we make use of the node features `x` **and** the graph `g` as input to our GCN model. - -````julia -function train(model::GCN, g::GNNGraph, x::AbstractMatrix, epochs::Int, opt) - Flux.trainmode!(model) - - for epoch in 1:epochs - loss, grad = Flux.withgradient(model) do model - ŷ = model(g, x) - logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) - end - - Flux.update!(opt, model, grad[1]) - if epoch % 200 == 0 - @show epoch, loss - end - end -end; -```` - -````julia -mlp = MLP(num_features, num_classes, hidden_channels) -opt_mlp = Flux.setup(Adam(1e-3), mlp) -epochs = 2000 -train(mlp, g.ndata.features, epochs, opt_mlp) -```` - -```` -(epoch, loss) = (200, 0.3903322f0) -(epoch, loss) = (400, 0.24701364f0) -(epoch, loss) = (600, 0.18073516f0) -(epoch, loss) = (800, 0.16055994f0) -(epoch, loss) = (1000, 0.1603872f0) -(epoch, loss) = (1200, 0.16059926f0) -(epoch, loss) = (1400, 0.12264546f0) -(epoch, loss) = (1600, 0.16914158f0) -(epoch, loss) = (1800, 0.13800614f0) -(epoch, loss) = (2000, 0.14582604f0) - -```` - -````julia -function accuracy(model::GCN, g::GNNGraph, x::AbstractMatrix, y::Flux.OneHotArray, - mask::BitVector) - Flux.testmode!(model) - mean(onecold(model(g, x))[mask] .== onecold(y)[mask]) -end -```` - -```` -accuracy (generic function with 2 methods) -```` - -````julia -accuracy(mlp, g.ndata.features, y, .!train_mask) -```` - -```` -0.508177570093458 -```` - -````julia -opt_gcn = Flux.setup(Adam(1e-2), gcn) -train(gcn, g, x, epochs, opt_gcn) -```` - -```` -(epoch, loss) = (200, 0.011098934f0) -(epoch, loss) = (400, 0.00403386f0) -(epoch, loss) = (600, 0.0066408515f0) -(epoch, loss) = (800, 0.0054795616f0) -(epoch, loss) = (1000, 0.00048175652f0) -(epoch, loss) = (1200, 0.00011097621f0) -(epoch, loss) = (1400, 6.766344f-5) -(epoch, loss) = (1600, 0.0009650132f0) -(epoch, loss) = (1800, 0.0097566405f0) -(epoch, loss) = (2000, 5.067821f-5) - -```` - -Now let's evaluate the accuracy of our trained GCN. - -````julia -train_accuracy = accuracy(gcn, g, g.ndata.features, y, train_mask) -test_accuracy = accuracy(gcn, g, g.ndata.features, y, .!train_mask) - -println("Train accuracy: $(train_accuracy)") -println("Test accuracy: $(test_accuracy)") -```` - -```` -Train accuracy: 1.0 -Test accuracy: 0.7620716510903427 - -```` - -**There it is!** -By simply swapping the linear layers with GNN layers, we can reach **76% test accuracy**! -This is in stark contrast to the roughly 51% test accuracy obtained by our MLP, indicating that relational information plays a crucial role in obtaining better performance. - -We can also verify that once again by looking at the output embeddings of our trained model, which now produces a far better clustering of nodes of the same category.
- -````julia -Flux.testmode!(gcn) # inference mode - -out_trained = gcn(g, x) |> transpose -visualize_tsne(out_trained, g.ndata.targets) -```` - -```@raw html - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
-``` - -## (Optional) Exercises - -1. To achieve better model performance and to avoid overfitting, it is usually a good idea to select the best model based on an additional validation set. The `Cora` dataset provides a validation node set as `g.ndata.val_mask`, but we haven't used it yet. Can you modify the code to select and test the model with the highest validation performance? This should bring test performance to **82% accuracy**. - -2. How does `GCN` behave when increasing the hidden feature dimensionality or the number of layers? Does increasing the number of layers help at all? - -3. You can try to use different GNN layers to see how model performance changes. What happens if you swap out all `GCNConv` instances with [`GATConv`](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/api/conv/#GraphNeuralNetworks.GATConv) layers that make use of attention? Try to write a 2-layer `GAT` model that makes use of 8 attention heads in the first layer and 1 attention head in the second layer, uses a `dropout` ratio of `0.6` inside and outside each `GATConv` call, and uses a `hidden_channels` dimension of `8` per head (a possible sketch is given after the conclusion below). - -## Conclusion -In this tutorial, we have seen how to apply GNNs to real-world problems, and, in particular, how they can effectively be used for boosting a model's performance. In the next tutorial, we will look into how GNNs can be used for the task of graph classification.
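For exercise 3, one possible sketch (not the tutorial's reference solution) is the 2-layer GAT below. The `GAT` struct and its names are illustrative, and it assumes `GATConv` accepts a `dropout` keyword for the attention coefficients, as the exercise text suggests; with `heads = 8` and the default concatenation, the first layer outputs `8 * 8 = 64` channels:

````julia
struct GAT
    layers::NamedTuple
end

Flux.@layer GAT

function GAT(num_features, num_classes; hidden_channels = 8, heads = 8, drop_rate = 0.6)
    layers = (drop1 = Dropout(drop_rate),   # dropout "outside" the first GATConv
              conv1 = GATConv(num_features => hidden_channels; heads, dropout = drop_rate),
              drop2 = Dropout(drop_rate),   # dropout between the two layers
              conv2 = GATConv(hidden_channels * heads => num_classes; heads = 1, dropout = drop_rate))
    return GAT(layers)
end;

function (gat::GAT)(g::GNNGraph, x::AbstractMatrix)
    l = gat.layers
    x = l.drop1(x)
    x = elu.(l.conv1(g, x))
    x = l.drop2(x)
    return l.conv2(g, x)   # logits, to be used with logitcrossentropy as before
end;
````

Training would follow the same loop already used for the `GCN` model, with `Adam` and `logitcrossentropy` on the masked nodes.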
- ---- - -*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* - diff --git a/GraphNeuralNetworks/docs/src/tutorials/temporal_graph_classification.md b/GraphNeuralNetworks/docs/src/tutorials/temporal_graph_classification.md deleted file mode 100644 index 2385ccc4c..000000000 --- a/GraphNeuralNetworks/docs/src/tutorials/temporal_graph_classification.md +++ /dev/null @@ -1,183 +0,0 @@ -# Temporal Graph classification with GraphNeuralNetworks.jl - -In this tutorial, we will learn how to extend the graph classification task to the case of temporal graphs, i.e., graphs whose topology and features are time-varying. - -We will design and train a simple temporal graph neural network architecture to classify subjects' gender (female or male) using the temporal graphs extracted from their brain fMRI scan signals. Given the large amount of data, we will implement the training so that it can also run on the GPU. - -## Import - -We start by importing the necessary libraries. We use `GraphNeuralNetworks.jl`, `Flux.jl` and `MLDatasets.jl`, among others. - -````julia -using Flux -using GraphNeuralNetworks -using Statistics, Random -using LinearAlgebra -using MLDatasets: TemporalBrains -using CUDA # comment out if you don't have a CUDA GPU - -ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation -Random.seed!(17); # for reproducibility -```` - -## Dataset: TemporalBrains -The TemporalBrains dataset contains a collection of functional brain connectivity networks from 1000 subjects obtained from resting-state functional MRI data from the [Human Connectome Project (HCP)](https://www.humanconnectome.org/study/hcp-young-adult/document/extensively-processed-fmri-data-documentation). -Functional connectivity is defined as the temporal dependence of neuronal activation patterns of anatomically separated brain regions. - -The graph nodes represent brain regions and their number is fixed at 102 for each of the 27 snapshots, while the edges, representing functional connectivity, change over time. -For each snapshot, the feature of a node represents the average activation of the node during that snapshot. -Each temporal graph has a label representing gender ('M' for male and 'F' for female) and age group (22-25, 26-30, 31-35, and 36+). -The network's edge weights are binarized, and the threshold is set to 0.6 by default. - -````julia -brain_dataset = TemporalBrains() -```` - -```` -dataset TemporalBrains: - graphs => 1000-element Vector{MLDatasets.TemporalSnapshotsGraph} -```` - -After loading the dataset from the MLDatasets.jl package, we see that there are 1000 graphs and we need to convert them to the `TemporalSnapshotsGNNGraph` format. -So we create a function called `data_loader` that implements the latter and splits the dataset into the training set that will be used to train the model and the test set that will be used to test the performance of the model. Due to computational costs, we use only 250 out of the original 1000 graphs, 200 for training and 50 for testing. 
- -````julia -function data_loader(brain_dataset) - graphs = brain_dataset.graphs - dataset = Vector{TemporalSnapshotsGNNGraph}(undef, length(graphs)) - for i in 1:length(graphs) - graph = graphs[i] - dataset[i] = TemporalSnapshotsGNNGraph(GNNGraphs.mlgraph2gnngraph.(graph.snapshots)) - # Add graph and node features - for t in 1:27 - s = dataset[i].snapshots[t] - s.ndata.x = [I(102); s.ndata.x'] - end - dataset[i].tgdata.g = Float32.(Flux.onehot(graph.graph_data.g, ["F", "M"])) - end - # Split the dataset into a 80% training set and a 20% test set - train_loader = dataset[1:200] - test_loader = dataset[201:250] - return train_loader, test_loader -end -```` - -```` -data_loader (generic function with 1 method) -```` - -The first part of the `data_loader` function calls the `mlgraph2gnngraph` function for each snapshot, which takes the graph and converts it to a `GNNGraph`. The vector of `GNNGraph`s is then rewritten to a `TemporalSnapshotsGNNGraph`. - -The second part adds the graph and node features to the temporal graphs, in particular it adds the one-hot encoding of the label of the graph (in this case we directly use the identity matrix) and appends the mean activation of the node of the snapshot (which is contained in the vector `dataset[i].snapshots[t].ndata.x`, where `i` is the index indicating the subject and `t` is the snapshot). For the graph feature, it adds the one-hot encoding of gender. - -The last part splits the dataset. - -## Model - -We now implement a simple model that takes a `TemporalSnapshotsGNNGraph` as input. -It consists of a `GINConv` applied independently to each snapshot, a `GlobalPool` to get an embedding for each snapshot, a pooling on the time dimension to get an embedding for the whole temporal graph, and finally a `Dense` layer. - -First, we start by adapting the `GlobalPool` to the `TemporalSnapshotsGNNGraphs`. - -````julia -function (l::GlobalPool)(g::TemporalSnapshotsGNNGraph, x::AbstractVector) - h = [reduce_nodes(l.aggr, g[i], x[i]) for i in 1:(g.num_snapshots)] - return mean(h) -end -```` - -Then we implement the constructor of the model, which we call `GenderPredictionModel`, and the foward pass. - -````julia -struct GenderPredictionModel - gin::GINConv - mlp::Chain - globalpool::GlobalPool - dense::Dense -end - -Flux.@layer GenderPredictionModel - -function GenderPredictionModel(; nfeatures = 103, nhidden = 128, σ = relu) - mlp = Chain(Dense(nfeatures => nhidden, σ), Dense(nhidden => nhidden, σ)) - gin = GINConv(mlp, 0.5) - globalpool = GlobalPool(mean) - dense = Dense(nhidden => 2) - return GenderPredictionModel(gin, mlp, globalpool, dense) -end - -function (m::GenderPredictionModel)(g::TemporalSnapshotsGNNGraph) - h = m.gin(g, g.ndata.x) - h = m.globalpool(g, h) - return m.dense(h) -end -```` - -## Training - -We train the model for 100 epochs, using the Adam optimizer with a learning rate of 0.001. We use the `logitbinarycrossentropy` as the loss function, which is typically used as the loss in two-class classification, where the labels are given in a one-hot format. -The accuracy expresses the number of correct classifications. 
- -````julia -lossfunction(ŷ, y) = Flux.logitbinarycrossentropy(ŷ, y); - -function eval_loss_accuracy(model, data_loader) - error = mean([lossfunction(model(g), g.tgdata.g) for g in data_loader]) - acc = mean([round(100 * mean(Flux.onecold(model(g)) .== Flux.onecold(g.tgdata.g)); digits = 2) for g in data_loader]) - return (loss = error, acc = acc) -end - -function train(dataset) - device = gpu_device() - - function report(epoch) - train_loss, train_acc = eval_loss_accuracy(model, train_loader) - test_loss, test_acc = eval_loss_accuracy(model, test_loader) - println("Epoch: $epoch $((; train_loss, train_acc)) $((; test_loss, test_acc))") - return (train_loss, train_acc, test_loss, test_acc) - end - - model = GenderPredictionModel() |> device - - opt = Flux.setup(Adam(1.0f-3), model) - - train_loader, test_loader = data_loader(dataset) - train_loader = train_loader |> device - test_loader = test_loader |> device - - report(0) - for epoch in 1:100 - for g in train_loader - grads = Flux.gradient(model) do model - ŷ = model(g) - lossfunction(vec(ŷ), g.tgdata.g) - end - Flux.update!(opt, model, grads[1]) - end - if epoch % 20 == 0 - report(epoch) - end - end - return model -end - -train(brain_dataset); -```` - -```` -Epoch: 0 (train_loss = 0.80321693f0, train_acc = 50.5) (test_loss = 0.79863846f0, test_acc = 60.0) -Epoch: 20 (train_loss = 0.5073769f0, train_acc = 74.5) (test_loss = 0.64655066f0, test_acc = 60.0) -Epoch: 40 (train_loss = 0.13417317f0, train_acc = 96.5) (test_loss = 0.5689327f0, test_acc = 74.0) -Epoch: 60 (train_loss = 0.01875147f0, train_acc = 100.0) (test_loss = 0.45651233f0, test_acc = 82.0) -Epoch: 80 (train_loss = 0.12695672f0, train_acc = 95.0) (test_loss = 0.65159386f0, test_acc = 82.0) -Epoch: 100 (train_loss = 0.036399372f0, train_acc = 99.0) (test_loss = 0.6491585f0, test_acc = 86.0) - -```` - -# Conclusions -In this tutorial, we implemented a very simple architecture to classify temporal graphs in the context of gender classification using brain data. We then trained the model on the GPU for 100 epochs on the TemporalBrains dataset. The accuracy of the model is approximately 85%, but can be improved by fine-tuning the parameters and training on more data. - ---- - -*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* - diff --git a/GraphNeuralNetworks/docs/src/tutorials/temporal_graph_classification_pluto.md b/GraphNeuralNetworks/docs/src/tutorials/temporal_graph_classification_pluto.md deleted file mode 100644 index db5753f93..000000000 --- a/GraphNeuralNetworks/docs/src/tutorials/temporal_graph_classification_pluto.md +++ /dev/null @@ -1,211 +0,0 @@ -```@raw html - - - - - -

Temporal Graph Classification with GraphNeuralNetworks.jl

In this tutorial, we will learn how to extend the graph classification task to the case of temporal graphs, i.e., graphs whose topology and features are time-varying.

We will design and train a simple temporal graph neural network architecture to classify subjects' gender (female or male) using the temporal graphs extracted from their brain fMRI scan signals. Given the large amount of data, we will implement the training so that it can also run on the GPU.

- - -``` -## Import -```@raw html -
-

We start by importing the necessary libraries. We use GraphNeuralNetworks.jl, Flux.jl and MLDatasets.jl, among others.

- -
begin
-    using Flux
-    using GraphNeuralNetworks
-    using Statistics, Random
-    using LinearAlgebra
-    using MLDatasets: TemporalBrains
-    using CUDA
-    using cuDNN
-end
- - - -``` -## Dataset: TemporalBrains -```@raw html -
-

The TemporalBrains dataset contains functional brain connectivity networks of 1000 subjects, obtained from resting-state functional MRI data of the Human Connectome Project (HCP). Functional connectivity is defined as the temporal dependence of neuronal activation patterns of anatomically separated brain regions.

The graph nodes represent brain regions and their number is fixed at 102 for each of the 27 snapshots, while the edges, representing functional connectivity, change over time. For each snapshot, the feature of a node represents the average activation of the node during that snapshot. Each temporal graph has a label representing gender ('M' for male and 'F' for female) and age group (22-25, 26-30, 31-35, and 36+). The network's edge weights are binarized, and the threshold is set to 0.6 by default.

- -
brain_dataset = TemporalBrains()
-
dataset TemporalBrains:
-  graphs  =>    1000-element Vector{MLDatasets.TemporalSnapshotsGraph}
- - -

After loading the dataset from the MLDatasets.jl package, we see that it contains 1000 graphs, which we need to convert to the TemporalSnapshotsGNNGraph format. To do this, we create a function called data_loader that performs the conversion and splits the dataset into a training set, used to train the model, and a test set, used to evaluate its performance.

- -
function data_loader(brain_dataset)
-    graphs = brain_dataset.graphs
-    dataset = Vector{TemporalSnapshotsGNNGraph}(undef, length(graphs))
-    for i in 1:length(graphs)
-        graph = graphs[i]
-        dataset[i] = TemporalSnapshotsGNNGraph(GraphNeuralNetworks.mlgraph2gnngraph.(graph.snapshots))
-        # Add graph and node features
-        for t in 1:27
-            s = dataset[i].snapshots[t]
-            s.ndata.x = [I(102); s.ndata.x']
-        end
-        dataset[i].tgdata.g = Float32.(Flux.onehot(graph.graph_data.g, ["F", "M"]))
-    end
-    # Split the dataset into a 80% training set and a 20% test set
-    train_loader = dataset[1:200]
-    test_loader = dataset[201:250]
-    return train_loader, test_loader
-end;
- - - -

The first part of the data_loader function calls the mlgraph2gnngraph function on each snapshot, converting it to a GNNGraph. The resulting vector of GNNGraphs is then wrapped into a TemporalSnapshotsGNNGraph.

The second part adds the node and graph features to the temporal graphs. As node features, it stacks a one-hot encoding of the node index (the 102×102 identity matrix) on top of the mean activation of each node in the snapshot (contained in the vector dataset[i].snapshots[t].ndata.x, where i indexes the subject and t the snapshot). As the graph feature, it adds the one-hot encoding of the gender label.

The last part splits the dataset.
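
To make the converted data concrete, here is a small inspection sketch (assuming brain_dataset and data_loader as defined above; the shapes follow directly from the code above):

train_graphs, test_graphs = data_loader(brain_dataset)
tg = train_graphs[1]              # one TemporalSnapshotsGNNGraph
tg.num_snapshots                  # 27 snapshots per subject
size(tg.snapshots[1].ndata.x)     # (103, 102): 102 one-hot rows plus one mean-activation row, per node
tg.tgdata.g                       # 2-element one-hot gender label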

- - -``` -## Model -```@raw html -
-

We now implement a simple model that takes a TemporalSnapshotsGNNGraph as input. It consists of a GINConv applied independently to each snapshot, a GlobalPool to get an embedding for each snapshot, a pooling on the time dimension to get an embedding for the whole temporal graph, and finally a Dense layer.

We start by adapting the GlobalPool layer to TemporalSnapshotsGNNGraphs.

- -
function (l::GlobalPool)(g::TemporalSnapshotsGNNGraph, x::AbstractVector)
-    h = [reduce_nodes(l.aggr, g[i], x[i]) for i in 1:(g.num_snapshots)]
-    sze = size(h[1])
-    reshape(reduce(hcat, h), sze[1], length(h))
-end
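
A quick shape check of this extension, using randomly generated per-snapshot embeddings (purely illustrative; assumes brain_dataset and data_loader from above):

tg = data_loader(brain_dataset)[1][1]                       # one temporal graph from the training set
h = [rand(Float32, 128, 102) for _ in 1:tg.num_snapshots]   # fake hidden states: 128 features × 102 nodes per snapshot
size(GlobalPool(mean)(tg, h))                               # (128, 27): one pooled column per snapshot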
- - - -

Then we implement the model, which we call GenderPredictionModel, defining its constructor and forward pass.

- -
begin
-    struct GenderPredictionModel
-        gin::GINConv
-        mlp::Chain
-        globalpool::GlobalPool
-        f::Function
-        dense::Dense
-    end
-    
-    Flux.@layer GenderPredictionModel
-    
-    function GenderPredictionModel(; nfeatures = 103, nhidden = 128, activation = relu)
-        mlp = Chain(Dense(nfeatures, nhidden, activation), Dense(nhidden, nhidden, activation))
-        gin = GINConv(mlp, 0.5)
-        globalpool = GlobalPool(mean)
-        f = x -> mean(x, dims = 2)
-        dense = Dense(nhidden, 2)
-        GenderPredictionModel(gin, mlp, globalpool, f, dense)
-    end
-    
-    function (m::GenderPredictionModel)(g::TemporalSnapshotsGNNGraph)
-        h = m.gin(g, g.ndata.x)
-        h = m.globalpool(g, h)
-        h = m.f(h)
-        m.dense(h)
-    end
-    
-end
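
A minimal forward-pass sketch (assuming the data returned by data_loader above), mainly to check the output shape:

model_demo = GenderPredictionModel()
tg = data_loader(brain_dataset)[1][1]   # one temporal graph from the training set
ŷ = model_demo(tg)                      # raw logits for the two classes
size(ŷ)                                 # (2, 1)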
- - - -``` -## Training -```@raw html -
-

We train the model for 100 epochs, using the Adam optimizer with a learning rate of 0.001. As the loss we use logitbinarycrossentropy, which is commonly used for two-class classification with one-hot encoded labels. The accuracy is reported as the percentage of correctly classified graphs.

- -
lossfunction(ŷ, y) = Flux.logitbinarycrossentropy(ŷ, y);
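
As a quick illustration on toy numbers (not data from the dataset), the loss compares raw logits against a one-hot target:

ŷ_toy = [2.0f0, -1.0f0]                        # logits for the classes ("F", "M")
y_toy = Float32.(Flux.onehot("F", ["F", "M"])) # one-hot target
lossfunction(ŷ_toy, y_toy)                     # ≈ 0.22, small because the logits favour "F"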
- - -
function eval_loss_accuracy(model, data_loader)
-    error = mean([lossfunction(model(g), g.tgdata.g) for g in data_loader])
-    acc = mean([round(100 * mean(Flux.onecold(model(g)) .== Flux.onecold(g.tgdata.g)); digits = 2) for g in data_loader])
-    return (loss = error, acc = acc)
-end;
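
The accuracy relies on Flux.onecold, which maps logits and one-hot targets back to class indices; a toy check (hypothetical numbers):

Flux.onecold([0.2f0, 1.5f0])                          # 2: index of the predicted class
Flux.onecold(Float32.(Flux.onehot("M", ["F", "M"])))  # 2: index of the true class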
- - -
function train(dataset; usecuda::Bool, kws...)
-
-    if usecuda && CUDA.functional() #check if GPU is available 
-        my_device = gpu
-        @info "Training on GPU"
-    else
-        my_device = cpu
-        @info "Training on CPU"
-    end
-    
-    function report(epoch)
-        train_loss, train_acc = eval_loss_accuracy(model, train_loader)
-        test_loss, test_acc = eval_loss_accuracy(model, test_loader)
-        println("Epoch: $epoch  $((; train_loss, train_acc))  $((; test_loss, test_acc))")
-        return (train_loss, train_acc, test_loss, test_acc)
-    end
-
-    model = GenderPredictionModel() |> my_device
-
-    opt = Flux.setup(Adam(1.0f-3), model)
-
-    train_loader, test_loader = data_loader(dataset)
-    train_loader = train_loader |> my_device
-    test_loader = test_loader |> my_device
-
-    report(0)
-    for epoch in 1:100
-        for g in train_loader
-            grads = Flux.gradient(model) do model
-                ŷ = model(g)
-                lossfunction(vec(ŷ), g.tgdata.g)
-            end
-            Flux.update!(opt, model, grads[1])
-        end
-        if  epoch % 10 == 0
-            report(epoch)
-        end
-    end
-    return model
-end;
-
- - -
train(brain_dataset; usecuda = true)
-
GenderPredictionModel(GINConv(Chain(Dense(103 => 128, relu), Dense(128 => 128, relu)), 0.5), Chain(Dense(103 => 128, relu), Dense(128 => 128, relu)), GlobalPool{typeof(mean)}(Statistics.mean), var"#4#5"(), Dense(128 => 2))  # 30_082 parameters, plus 29_824 non-trainable
- - -

We run the training on the GPU when one is available, since training this model on the CPU takes considerably longer.
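
If no GPU is available, the same function can simply be called with usecuda = false (it also falls back to the CPU automatically when CUDA is not functional):

train(brain_dataset; usecuda = false)   # force CPU training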

- - -``` -## Conclusions -```@raw html -
-

In this tutorial, we implemented a very simple architecture to classify temporal graphs in the context of gender classification using brain data. We then trained the model on the GPU for 100 epochs on the TemporalBrains dataset. The accuracy of the model is approximately 75-80%, but can be improved by fine-tuning the parameters and training on more data.

- - -``` - diff --git a/GraphNeuralNetworks/docs/src/tutorials/traffic_prediction.md b/GraphNeuralNetworks/docs/src/tutorials/traffic_prediction.md deleted file mode 100644 index 8f2376f74..000000000 --- a/GraphNeuralNetworks/docs/src/tutorials/traffic_prediction.md +++ /dev/null @@ -1,966 +0,0 @@ -```@meta -EditURL = "../../src_tutorials/introductory_tutorials/traffic_prediction.jl" -``` - -# Traffic Prediction using recurrent Temporal Graph Convolutional Network - -In this tutorial, we will learn how to use a recurrent Temporal Graph Convolutional Network (TGCN) to predict traffic in a spatio-temporal setting. Traffic forecasting is the problem of predicting future traffic trends on a road network given historical traffic data, such as, in our case, traffic speed and time of day. - -## Import - -We start by importing the necessary libraries. We use `GraphNeuralNetworks.jl`, `Flux.jl` and `MLDatasets.jl`, among others. - -````julia -using Flux, GraphNeuralNetworks -using Flux.Losses: mae -using MLDatasets: METRLA -using Statistics, Plots, Random - -ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation -Random.seed!(42); # for reproducibility -```` - -## Dataset: METR-LA - -We use the `METR-LA` dataset from the paper [Diffusion Convolutional Recurrent Neural Network: Data-driven Traffic Forecasting](https://arxiv.org/pdf/1707.01926.pdf), which contains traffic data from loop detectors in the highway of Los Angeles County. The dataset contains traffic speed data from March 1, 2012 to June 30, 2012. The data is collected every 5 minutes, resulting in 12 observations per hour, from 207 sensors. Each sensor is a node in the graph, and the edge weights are the distances between the sensor locations. - -````julia -dataset_metrla = METRLA(; num_timesteps = 3) -```` - -```` -dataset METRLA: - graphs => 1-element Vector{MLDatasets.Graph} -```` - -````julia -g = dataset_metrla[1] -```` - -```` -Graph: - num_nodes => 207 - num_edges => 1722 - edge_index => ("1722-element Vector{Int64}", "1722-element Vector{Int64}") - node_data => (features = "34269-element Vector{Any}", targets = "34269-element Vector{Any}") - edge_data => 1722-element Vector{Float32} -```` - -`edge_data` contains the weights of the edges of the graph and -`node_data` contains a node feature vector and a target vector. The latter vectors contain batches of dimension `num_timesteps`, which means that they contain vectors with the node features and targets of `num_timesteps` time steps. Two consecutive batches are shifted by one-time step. -The node features are the traffic speed of the sensors and the time of the day, and the targets are the traffic speed of the sensors in the next time step. -Let's see some examples: - -````julia -features = map(x -> permutedims(x,(1,3,2)), g.node_data.features) - -size(features[1]) -```` - -```` -(2, 3, 207) -```` - -The first dimension corresponds to the two features (the first line the speed value and the second line the time of day), the second to the number of time steps `num_timesteps` and the third to the nodes. - -````julia -targets = map(x -> permutedims(x,(1,3,2)), g.node_data.targets) - -size(targets[1]) -```` - -```` -(1, 3, 207) -```` - -In the case of the targets the first dimension is 1 because they store just the speed value. 
- -````julia -features[1][:,:,1] -```` - -```` -2×3 Matrix{Float32}: - 1.17081 1.11647 1.15888 - -0.876741 -0.87663 -0.87652 -```` - -````julia -features[2][:,:,1] -```` - -```` -2×3 Matrix{Float32}: - 1.11647 1.15888 -0.876741 - -0.87663 -0.87652 -0.87641 -```` - -````julia -targets[1][:,:,1] -```` - -```` -1×3 Matrix{Float32}: - 1.11647 1.15888 -0.876741 -```` - -````julia -function plot_data(data,sensor) - p = plot(legend=false, xlabel="Time (h)", ylabel="Normalized speed") - plotdata = [] - for i in 1:3:length(data) - push!(plotdata,data[i][1,:,sensor]) - end - plotdata = reduce(vcat,plotdata) - plot!(p, collect(1:length(data)), plotdata, color = :green, xticks =([i for i in 0:50:250], ["$(i)" for i in 0:4:24])) - return p -end - -plot_data(features[1:288],1) # Plot the speed of the first sensor for the first day -```` - -```@raw html - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -``` - -Now let's construct the static graph, the `train_loader` and `data_loader`. - -````julia -graph = GNNGraph(g.edge_index; edata = g.edge_data, g.num_nodes); - -train_loader = zip(features[1:288], targets[1:288]); # train on 24 hours -test_loader = zip(features[289:577], targets[289:577]); # test on next 24 hours -```` - -## Model: T-GCN - -We use the T-GCN model from the paper [T-GCN: A Temporal Graph Convolutional Network for Traffic Prediction] (https://arxiv.org/pdf/1811.05320.pdf), which consists of a graph convolutional network (GCN) and a gated recurrent unit (GRU). The GCN is used to capture spatial features from the graph, and the GRU is used to capture temporal features from the feature time series. - -````julia -model = GNNChain(TGCN(2 => 100; add_self_loops = false), Dense(100, 1)) -```` - -```` -GNNChain( - GNNRecurrence( - TGCNCell(2 => 100), # 91_500 parameters - ), - Dense(100 => 1), # 101 parameters -) # Total: 20 arrays, 91_601 parameters, 359.926 KiB. -```` - -Let's look at the output of the model for the first batch of the training data. 
- -````julia -model(graph, features[1]) -```` - -```` -1×3×207 Array{Float32, 3}: -[:, :, 1] = - -0.190437 -0.275888 -0.309614 - -[:, :, 2] = - -0.205105 -0.296784 -0.334173 - -[:, :, 3] = - -0.197624 -0.284286 -0.319129 - -[:, :, 4] = - -0.157204 -0.228289 -0.253792 - -[:, :, 5] = - -0.155905 -0.226222 -0.251258 - -[:, :, 6] = - -0.159185 -0.230661 -0.257051 - -[:, :, 7] = - -0.173409 -0.252395 -0.282089 - -[:, :, 8] = - -0.210598 -0.302728 -0.33926 - -[:, :, 9] = - -0.149367 -0.213933 -0.243558 - -[:, :, 10] = - -0.180688 -0.261343 -0.295115 - -[:, :, 11] = - -0.199304 -0.290447 -0.325584 - -[:, :, 12] = - -0.278894 -0.406116 -0.460844 - -[:, :, 13] = - -0.180216 -0.262925 -0.293612 - -[:, :, 14] = - -0.228686 -0.334105 -0.37733 - -[:, :, 15] = - -0.210634 -0.305526 -0.348456 - -[:, :, 16] = - -0.172959 -0.252245 -0.281268 - -[:, :, 17] = - -0.217106 -0.319649 -0.360754 - -[:, :, 18] = - -0.150425 -0.218005 -0.241832 - -[:, :, 19] = - -0.190064 -0.27711 -0.307167 - -[:, :, 20] = - -0.171708 -0.248663 -0.279023 - -[:, :, 21] = - -0.180806 -0.262256 -0.29506 - -[:, :, 22] = - -0.172006 -0.249267 -0.276387 - -[:, :, 23] = - -0.204998 -0.299257 -0.337803 - -[:, :, 24] = - -0.143662 -0.205132 -0.22986 - -[:, :, 25] = - -0.17058 -0.246961 -0.277176 - -[:, :, 26] = - -0.125724 -0.179274 -0.201392 - -[:, :, 27] = - -0.161693 -0.240145 -0.264886 - -[:, :, 28] = - -0.270972 -0.390949 -0.441452 - -[:, :, 29] = - -0.196606 -0.284139 -0.319259 - -[:, :, 30] = - -0.106939 -0.152224 -0.168653 - -[:, :, 31] = - -0.212599 -0.310341 -0.350769 - -[:, :, 32] = - -0.173633 -0.250774 -0.280106 - -[:, :, 33] = - -0.139142 -0.199454 -0.22344 - -[:, :, 34] = - -0.174493 -0.254723 -0.284142 - -[:, :, 35] = - -0.164989 -0.237711 -0.268315 - -[:, :, 36] = - -0.228477 -0.333556 -0.370548 - -[:, :, 37] = - -0.146427 -0.211616 -0.234375 - -[:, :, 38] = - -0.180574 -0.26056 -0.292912 - -[:, :, 39] = - -0.163789 -0.23647 -0.264311 - -[:, :, 40] = - -0.151709 -0.218625 -0.244964 - -[:, :, 41] = - -0.152286 -0.218895 -0.243639 - -[:, :, 42] = - -0.180067 -0.261519 -0.292724 - -[:, :, 43] = - -0.177356 -0.255949 -0.287688 - -[:, :, 44] = - -0.233629 -0.34346 -0.389236 - -[:, :, 45] = - -0.162016 -0.238135 -0.268996 - -[:, :, 46] = - -0.210987 -0.30546 -0.344888 - -[:, :, 47] = - -0.129844 -0.184776 -0.206926 - -[:, :, 48] = - -0.144141 -0.208359 -0.232098 - -[:, :, 49] = - -0.186031 -0.26919 -0.30268 - -[:, :, 50] = - -0.18573 -0.268062 -0.301012 - -[:, :, 51] = - -0.154996 -0.222242 -0.246921 - -[:, :, 52] = - -0.125969 -0.183241 -0.20299 - -[:, :, 53] = - -0.128753 -0.184801 -0.206073 - -[:, :, 54] = - -0.182141 -0.263658 -0.296718 - -[:, :, 55] = - -0.180574 -0.26056 -0.292912 - -[:, :, 56] = - -0.247853 -0.35698 -0.403441 - -[:, :, 57] = - -0.12183 -0.171019 -0.193141 - -[:, :, 58] = - -0.143798 -0.206911 -0.231874 - -[:, :, 59] = - -0.186424 -0.270015 -0.303871 - -[:, :, 60] = - -0.124582 -0.178255 -0.201623 - -[:, :, 61] = - -0.166843 -0.241884 -0.270934 - -[:, :, 62] = - -0.12182 -0.175862 -0.192675 - -[:, :, 63] = - -0.155786 -0.223574 -0.251115 - -[:, :, 64] = - -0.166843 -0.241884 -0.270934 - -[:, :, 65] = - -0.160543 -0.230969 -0.25823 - -[:, :, 66] = - -0.15121 -0.216889 -0.242367 - -[:, :, 67] = - -0.13159 -0.188731 -0.207805 - -[:, :, 68] = - -0.189616 -0.27612 -0.308881 - -[:, :, 69] = - -0.200743 -0.292853 -0.329716 - -[:, :, 70] = - -0.169824 -0.246521 -0.275719 - -[:, :, 71] = - -0.167384 -0.242634 -0.270986 - -[:, :, 72] = - -0.196242 -0.285936 -0.321644 - -[:, :, 73] = - -0.236292 -0.348151 -0.392883 - 
-[:, :, 74] = - -0.147614 -0.213301 -0.23788 - -[:, :, 75] = - -0.130178 -0.1868 -0.209104 - -[:, :, 76] = - -0.143267 -0.207005 -0.230624 - -[:, :, 77] = - -0.178527 -0.258857 -0.290719 - -[:, :, 78] = - -0.229404 -0.335587 -0.381214 - -[:, :, 79] = - -0.17892 -0.256519 -0.287337 - -[:, :, 80] = - -0.256783 -0.371621 -0.418989 - -[:, :, 81] = - -0.172706 -0.251723 -0.280522 - -[:, :, 82] = - -0.18082 -0.262931 -0.295119 - -[:, :, 83] = - -0.227386 -0.334063 -0.378103 - -[:, :, 84] = - -0.165214 -0.237812 -0.265142 - -[:, :, 85] = - -0.22231 -0.323155 -0.368888 - -[:, :, 86] = - -0.296831 -0.431825 -0.489221 - -[:, :, 87] = - -0.129124 -0.185373 -0.205527 - -[:, :, 88] = - -0.147305 -0.210806 -0.235909 - -[:, :, 89] = - -0.232948 -0.339972 -0.388093 - -[:, :, 90] = - -0.189576 -0.274985 -0.310499 - -[:, :, 91] = - -0.151025 -0.216674 -0.241499 - -[:, :, 92] = - -0.173409 -0.252395 -0.282089 - -[:, :, 93] = - -0.250912 -0.362302 -0.409829 - -[:, :, 94] = - -0.18209 -0.265956 -0.297498 - -[:, :, 95] = - -0.140428 -0.202285 -0.225455 - -[:, :, 96] = - -0.206212 -0.300465 -0.338838 - -[:, :, 97] = - -0.166441 -0.240681 -0.269976 - -[:, :, 98] = - -0.164072 -0.237686 -0.265411 - -[:, :, 99] = - -0.195899 -0.285294 -0.321289 - -[:, :, 100] = - -0.165435 -0.238288 -0.265888 - -[:, :, 101] = - -0.166142 -0.239296 -0.268309 - -[:, :, 102] = - -0.16833 -0.242256 -0.271739 - -[:, :, 103] = - -0.179791 -0.261186 -0.292492 - -[:, :, 104] = - -0.209821 -0.306689 -0.345681 - -[:, :, 105] = - -0.135322 -0.194691 -0.216658 - -[:, :, 106] = - -0.1255 -0.171665 -0.191923 - -[:, :, 107] = - -0.153809 -0.212498 -0.238045 - -[:, :, 108] = - -0.187928 -0.270401 -0.302994 - -[:, :, 109] = - -0.264405 -0.381957 -0.431546 - -[:, :, 110] = - -0.117467 -0.168596 -0.187241 - -[:, :, 111] = - -0.224301 -0.322166 -0.363671 - -[:, :, 112] = - -0.168287 -0.242538 -0.272265 - -[:, :, 113] = - -0.160955 -0.233634 -0.260878 - -[:, :, 114] = - -0.207936 -0.304274 -0.341215 - -[:, :, 115] = - -0.186424 -0.270015 -0.303871 - -[:, :, 116] = - -0.136708 -0.196708 -0.218464 - -[:, :, 117] = - -0.162854 -0.23376 -0.262001 - -[:, :, 118] = - -0.145378 -0.209858 -0.232379 - -[:, :, 119] = - -0.192249 -0.280093 -0.313773 - -[:, :, 120] = - -0.130178 -0.1868 -0.209104 - -[:, :, 121] = - -0.125724 -0.179274 -0.201392 - -[:, :, 122] = - -0.131778 -0.188032 -0.204329 - -[:, :, 123] = - -0.190305 -0.275725 -0.308528 - -[:, :, 124] = - -0.186358 -0.269023 -0.302943 - -[:, :, 125] = - -0.134097 -0.19219 -0.21461 - -[:, :, 126] = - -0.229687 -0.335651 -0.379391 - -[:, :, 127] = - -0.150366 -0.217124 -0.23494 - -[:, :, 128] = - -0.154125 -0.222539 -0.248575 - -[:, :, 129] = - -0.17385 -0.252715 -0.282263 - -[:, :, 130] = - -0.180074 -0.259453 -0.291821 - -[:, :, 131] = - -0.241886 -0.356054 -0.404226 - -[:, :, 132] = - -0.147983 -0.213392 -0.237915 - -[:, :, 133] = - -0.297865 -0.433592 -0.491451 - -[:, :, 134] = - -0.1685 -0.242305 -0.272017 - -[:, :, 135] = - -0.152483 -0.218733 -0.244645 - -[:, :, 136] = - -0.168702 -0.242609 -0.272312 - -[:, :, 137] = - -0.171193 -0.249144 -0.27821 - -[:, :, 138] = - -0.157444 -0.227595 -0.254614 - -[:, :, 139] = - -0.170859 -0.24856 -0.277567 - -[:, :, 140] = - -0.154185 -0.220971 -0.248326 - -[:, :, 141] = - -0.167564 -0.241895 -0.269898 - -[:, :, 142] = - -0.198382 -0.28988 -0.325145 - -[:, :, 143] = - -0.190193 -0.274691 -0.308954 - -[:, :, 144] = - -0.164367 -0.235356 -0.264506 - -[:, :, 145] = - -0.168585 -0.245612 -0.273639 - -[:, :, 146] = - -0.146785 -0.210873 -0.235221 - -[:, :, 147] = - -0.23117 
-0.339433 -0.38506 - -[:, :, 148] = - -0.151623 -0.217167 -0.243276 - -[:, :, 149] = - -0.171926 -0.247709 -0.278381 - -[:, :, 150] = - -0.130441 -0.186429 -0.207541 - -[:, :, 151] = - -0.124612 -0.17517 -0.197384 - -[:, :, 152] = - -0.227914 -0.332301 -0.379083 - -[:, :, 153] = - -0.209602 -0.30737 -0.348098 - -[:, :, 154] = - -0.121033 -0.172756 -0.192205 - -[:, :, 155] = - -0.189307 -0.276265 -0.309519 - -[:, :, 156] = - -0.176345 -0.256208 -0.286709 - -[:, :, 157] = - -0.179865 -0.260353 -0.290873 - -[:, :, 158] = - -0.162518 -0.23573 -0.262435 - -[:, :, 159] = - -0.14841 -0.214032 -0.238902 - -[:, :, 160] = - -0.175029 -0.254328 -0.284384 - -[:, :, 161] = - -0.187785 -0.274063 -0.306795 - -[:, :, 162] = - -0.164439 -0.238981 -0.265863 - -[:, :, 163] = - -0.179146 -0.261088 -0.291655 - -[:, :, 164] = - -0.174582 -0.253612 -0.283677 - -[:, :, 165] = - -0.146835 -0.212141 -0.236504 - -[:, :, 166] = - -0.169574 -0.244662 -0.274579 - -[:, :, 167] = - -0.205169 -0.300378 -0.338329 - -[:, :, 168] = - -0.193459 -0.280816 -0.316494 - -[:, :, 169] = - -0.200406 -0.291517 -0.328319 - -[:, :, 170] = - -0.200175 -0.290885 -0.327673 - -[:, :, 171] = - -0.139527 -0.196866 -0.221303 - -[:, :, 172] = - -0.237291 -0.345947 -0.391304 - -[:, :, 173] = - -0.159724 -0.230503 -0.257995 - -[:, :, 174] = - -0.170648 -0.246842 -0.277382 - -[:, :, 175] = - -0.128632 -0.184283 -0.20528 - -[:, :, 176] = - -0.185905 -0.270254 -0.304107 - -[:, :, 177] = - -0.161168 -0.232131 -0.258788 - -[:, :, 178] = - -0.197273 -0.285652 -0.321442 - -[:, :, 179] = - -0.209952 -0.307914 -0.348789 - -[:, :, 180] = - -0.265724 -0.385954 -0.437209 - -[:, :, 181] = - -0.165994 -0.238608 -0.267949 - -[:, :, 182] = - -0.158609 -0.228565 -0.255802 - -[:, :, 183] = - -0.160324 -0.230812 -0.261482 - -[:, :, 184] = - -0.196514 -0.283855 -0.319215 - -[:, :, 185] = - -0.107441 -0.153569 -0.171727 - -[:, :, 186] = - -0.182517 -0.262905 -0.299795 - -[:, :, 187] = - -0.196173 -0.284077 -0.322929 - -[:, :, 188] = - -0.167405 -0.243004 -0.270949 - -[:, :, 189] = - -0.181083 -0.264399 -0.295156 - -[:, :, 190] = - -0.166454 -0.240585 -0.263743 - -[:, :, 191] = - -0.233863 -0.34417 -0.388829 - -[:, :, 192] = - -0.16662 -0.242443 -0.269792 - -[:, :, 193] = - -0.160664 -0.233362 -0.259544 - -[:, :, 194] = - -0.164726 -0.239385 -0.266441 - -[:, :, 195] = - -0.132454 -0.188973 -0.211022 - -[:, :, 196] = - -0.161302 -0.234395 -0.260481 - -[:, :, 197] = - -0.205169 -0.300378 -0.338329 - -[:, :, 198] = - -0.208202 -0.303859 -0.342555 - -[:, :, 199] = - -0.168634 -0.244804 -0.273155 - -[:, :, 200] = - -0.167564 -0.241895 -0.269898 - -[:, :, 201] = - -0.168502 -0.245323 -0.272658 - -[:, :, 202] = - -0.219244 -0.320418 -0.362164 - -[:, :, 203] = - -0.146067 -0.210437 -0.23644 - -[:, :, 204] = - -0.137563 -0.196391 -0.220473 - -[:, :, 205] = - -0.164275 -0.237357 -0.265263 - -[:, :, 206] = - -0.171608 -0.248259 -0.27865 - -[:, :, 207] = - -0.162244 -0.234602 -0.26246 -```` - -The output of the model is a tensor of size `(1, 3, 207)`, which corresponds to the dimension of the feature (in this case speed), the number of time steps, and the number of nodes in the graph, respectively. The model outputs the predicted traffic speed for each sensor at each time step. - -![](https://www.researchgate.net/profile/Haifeng-Li-3/publication/335353434/figure/fig4/AS:851870352437249@1580113127759/The-architecture-of-the-Gated-Recurrent-Unit-model.jpg) - -## Training - -We train the model for 100 epochs, using the Adam optimizer with a learning rate of 0.001. 
We use the mean absolute error (MAE) as the loss function. - -````julia -function train(graph, train_loader, model) - - opt = Flux.setup(Adam(0.001), model) - - for epoch in 1:100 - for (x, y) in train_loader - x, y = (x, y) - grads = Flux.gradient(model) do model - ŷ = model(graph, x) - Flux.mae(ŷ, y) - end - Flux.update!(opt, model, grads[1]) - end - - if epoch % 10 == 0 - loss = mean([Flux.mae(model(graph,x), y) for (x, y) in train_loader]) - @show epoch, loss - end - end - return model -end - -train(graph, train_loader, model) -```` - -```` -GNNChain( - GNNRecurrence( - TGCNCell(2 => 100), # 91_500 parameters - ), - Dense(100 => 1), # 101 parameters -) # Total: 20 arrays, 91_601 parameters, 359.926 KiB. -```` - -````julia -function plot_predicted_data(graph, features, targets, sensor) - p = plot(xlabel="Time (h)", ylabel="Normalized speed") - prediction = [] - ground_truth = [] - for i in 1:3:length(features) - push!(ground_truth,targets[i][1,:,sensor]) - push!(prediction, model(graph, features[i])[1,:,sensor]) - end - prediction = reduce(vcat,prediction) - ground_truth = reduce(vcat, ground_truth) - plot!(p, collect(1:length(prediction)), prediction, color = :red, label= "Prediction") - plot!(p, collect(1:length(ground_truth)), ground_truth, color = :blue, label = "Ground Truth", xticks = ([i for i in 0:50:250], ["$(i)" for i in 0:4:20])) - return p -end - -plot_predicted_data(graph,features[289:577],targets[289:577], 1) -```` - -```@raw html - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -``` - -````julia -accuracy(ŷ, y) = 1 - Statistics.norm(y-ŷ)/Statistics.norm(y) -```` - -```` -accuracy (generic function with 1 method) -```` - -Test accuracy: - -````julia -mean([accuracy(model(graph,x), y) for (x, y) in test_loader]) -```` - -```` -0.61555225f0 -```` - -The accuracy is not very good but can be improved by training using more data. We used a small subset of the dataset for this tutorial because of the computational cost of training the model. From the plot of the predictions, we can see that the model is able to capture the general trend of the traffic speed, but it is not able to capture the peaks of the traffic. - -## Conclusion - -In this tutorial, we learned how to use a recurrent temporal graph convolutional network to predict traffic in a spatio-temporal setting. We used the TGCN model, which consists of a graph convolutional network (GCN) and a gated recurrent unit (GRU). We then trained the model for 100 epochs on a small subset of the METR-LA dataset. The accuracy of the model is not very good, but it can be improved by training on more data. - ---- - -*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* - diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/gnn_intro.jl b/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/gnn_intro.jl deleted file mode 100644 index c2353d59e..000000000 --- a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/gnn_intro.jl +++ /dev/null @@ -1,244 +0,0 @@ -# # Hands-on introduction to Graph Neural Networks -# -# *This tutorial is a Julia adaptation of the Pytorch Geometric tutorials that can be found [here](https://pytorch-geometric.readthedocs.io/en/latest/notes/colabs.html).* -# -# Recently, deep learning on graphs has emerged to one of the hottest research fields in the deep learning community. 
-# Here, **Graph Neural Networks (GNNs)** aim to generalize classical deep learning concepts to irregular structured data (in contrast to images or texts) and to enable neural networks to reason about objects and their relations. -# -# This is done by following a simple **neural message passing scheme**, where node features $\mathbf{x}_i^{(\ell)}$ of all nodes $i \in \mathcal{V}$ in a graph $\mathcal{G} = (\mathcal{V}, \mathcal{E})$ are iteratively updated by aggregating localized information from their neighbors $\mathcal{N}(i)$: -# -# ```math -# \mathbf{x}_i^{(\ell + 1)} = f^{(\ell + 1)}_{\theta} \left( \mathbf{x}_i^{(\ell)}, \left\{ \mathbf{x}_j^{(\ell)} : j \in \mathcal{N}(i) \right\} \right) -# ``` -# -# This tutorial will introduce you to some fundamental concepts regarding deep learning on graphs via Graph Neural Networks based on the **[GraphNeuralNetworks.jl library](https://github.com/JuliaGraphs/GraphNeuralNetworks.jl)**. -# GraphNeuralNetworks.jl is an extension library to the popular deep learning framework [Flux.jl](https://fluxml.ai/Flux.jl/stable/), and consists of various methods and utilities to ease the implementation of Graph Neural Networks. - -# Let's first import the packages we need: - -using Flux, GraphNeuralNetworks -using Flux: onecold, onehotbatch, logitcrossentropy -using MLDatasets -using LinearAlgebra, Random, Statistics -import GraphMakie -import CairoMakie as Makie - -ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation -rng = Random.seed!(17); # for reproducibility - - -# Following [Kipf et al. (2017)](https://arxiv.org/abs/1609.02907), let's dive into the world of GNNs by looking at a simple graph-structured example, the well-known [**Zachary's karate club network**](https://en.wikipedia.org/wiki/Zachary%27s_karate_club). This graph describes a social network of 34 members of a karate club and documents links between members who interacted outside the club. Here, we are interested in detecting communities that arise from the member's interaction. -# GraphNeuralNetworks.jl provides utilities to convert [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl)'s datasets to its own type: - -dataset = MLDatasets.KarateClub() - -# After initializing the `KarateClub` dataset, we first can inspect some of its properties. -# For example, we can see that this dataset holds exactly **one graph**. -# Furthermore, the graph holds exactly **4 classes**, which represent the community each node belongs to. - -karate = dataset[1] - -karate.node_data.labels_comm - -# Now we convert the single-graph dataset to a `GNNGraph`. Moreover, we add a an array of node features, a **34-dimensional feature vector** for each node which uniquely describes the members of the karate club. We also add a training mask selecting the nodes to be used for training in our semi-supervised node classification task. 
- -g = mldataset2gnngraph(dataset) # convert a MLDatasets.jl's dataset to a GNNGraphs (or a collection of graphs) - -x = zeros(Float32, g.num_nodes, g.num_nodes) -x[diagind(x)] .= 1 - -train_mask = [true, false, false, false, true, false, false, false, true, - false, false, false, false, false, false, false, false, false, false, false, - false, false, false, false, true, false, false, false, false, false, - false, false, false, false] - -labels = g.ndata.labels_comm -y = onehotbatch(labels, 0:3) - -g = GNNGraph(g, ndata = (; x, y, train_mask)) - -# Let's now look at the underlying graph in more detail: - -println("Number of nodes: $(g.num_nodes)") -println("Number of edges: $(g.num_edges)") -println("Average node degree: $(g.num_edges / g.num_nodes)") -println("Number of training nodes: $(sum(g.ndata.train_mask))") -println("Training node label rate: $(mean(g.ndata.train_mask))") -println("Has isolated nodes: $(has_isolated_nodes(g))") -println("Has self-loops: $(has_self_loops(g))") -println("Is undirected: $(is_bidirected(g))") - -# Each graph in GraphNeuralNetworks.jl is represented by a `GNNGraph` object, which holds all the information to describe its graph representation. -# We can print the data object anytime via `print(g)` to receive a short summary about its attributes and their shapes. - -# The `g` object holds 3 attributes: -# - `g.ndata`: contains node-related information. -# - `g.edata`: holds edge-related information. -# - `g.gdata`: this stores the global data, therefore neither node nor edge-specific features. - -# These attributes are `NamedTuples` that can store multiple feature arrays: we can access a specific set of features e.g. `x`, with `g.ndata.x`. - - -# In our task, `g.ndata.train_mask` describes for which nodes we already know their community assignments. In total, we are only aware of the ground-truth labels of 4 nodes (one for each community), and the task is to infer the community assignment for the remaining nodes. - -# The `g` object also provides some **utility functions** to infer some basic properties of the underlying graph. -# For example, we can easily infer whether there exist isolated nodes in the graph (*i.e.* there exists no edge to any node), whether the graph contains self-loops (*i.e.*, $(v, v) \in \mathcal{E}$), or whether the graph is bidirected (*i.e.*, for each edge $(v, w) \in \mathcal{E}$ there also exists the edge $(w, v) \in \mathcal{E}$). - -# Let us now inspect the `edge_index` method: - -edge_index(g) - -# By printing `edge_index(g)`, we can understand how GraphNeuralNetworks.jl represents graph connectivity internally. -# We can see that for each edge, `edge_index` holds a tuple of two node indices, where the first value describes the node index of the source node and the second value describes the node index of the destination node of an edge. - -# This representation is known as the **COO format (coordinate format)** commonly used for representing sparse matrices. -# Instead of holding the adjacency information in a dense representation $\mathbf{A} \in \{ 0, 1 \}^{|\mathcal{V}| \times |\mathcal{V}|}$, GraphNeuralNetworks.jl represents graphs sparsely, which refers to only holding the coordinates/values for which entries in $\mathbf{A}$ are non-zero. - -# Importantly, GraphNeuralNetworks.jl does not distinguish between directed and undirected graphs, and treats undirected graphs as a special case of directed graphs in which reverse edges exist for every entry in the `edge_index`. 
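# As a small, hypothetical illustration of the COO format (a 3-node toy graph, not the karate club):

s = [1, 2, 3]               # source node of each edge
t = [2, 3, 1]               # target node of each edge
g_small = GNNGraph((s, t))  # a directed 3-cycle given in COO form
edge_index(g_small)         # returns the (source, target) vectors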
- -# Since a `GNNGraph` is an `AbstractGraph` from the `Graphs.jl` library, it supports graph algorithms and visualization tools from the wider julia graph ecosystem: - - -GraphMakie.graphplot(g |> to_unidirected, node_size = 20, node_color = labels, arrow_show = false) - -# ## Implementing Graph Neural Networks - -# After learning about GraphNeuralNetworks.jl's data handling, it's time to implement our first Graph Neural Network! - -# For this, we will use on of the most simple GNN operators, the **GCN layer** ([Kipf et al. (2017)](https://arxiv.org/abs/1609.02907)), which is defined as - -# ```math -# \mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \sum_{w \in \mathcal{N}(v) \, \cup \, \{ v \}} \frac{1}{c_{w,v}} \cdot \mathbf{x}_w^{(\ell)} -# ``` - -# where $\mathbf{W}^{(\ell + 1)}$ denotes a trainable weight matrix of shape `[num_output_features, num_input_features]` and $c_{w,v}$ refers to a fixed normalization coefficient for each edge. - -# GraphNeuralNetworks.jl implements this layer via `GCNConv`, which can be executed by passing in the node feature representation `x` and the COO graph connectivity representation `edge_index`. - -# With this, we are ready to create our first Graph Neural Network by defining our network architecture: - - -struct GCN - layers::NamedTuple -end - -Flux.@layer GCN # Provides parameter collection, gpu movement and more - -function GCN(num_features, num_classes) - layers = (conv1 = GCNConv(num_features => 4), - conv2 = GCNConv(4 => 4), - conv3 = GCNConv(4 => 2), - classifier = Dense(2, num_classes)) - return GCN(layers) -end; - -function (gcn::GCN)(g::GNNGraph, x::AbstractMatrix) - l = gcn.layers - x = l.conv1(g, x) - x = tanh.(x) - x = l.conv2(g, x) - x = tanh.(x) - x = l.conv3(g, x) - x = tanh.(x) # Final GNN embedding space. - out = l.classifier(x) # Apply a final (linear) classifier. - return out, x -end; - -# Here, we first initialize all of our building blocks in the constructor and define the computation flow of our network in the call method. -# We first define and stack **three graph convolution layers**, which corresponds to aggregating 3-hop neighborhood information around each node (all nodes up to 3 "hops" away). -# In addition, the `GCNConv` layers reduce the node feature dimensionality to ``2``, *i.e.*, $34 \rightarrow 4 \rightarrow 4 \rightarrow 2$. Each `GCNConv` layer is enhanced by a `tanh` non-linearity. - -# After that, we apply a single linear transformation (`Flux.Dense` that acts as a classifier to map our nodes to 1 out of the 4 classes/communities. - -# We return both the output of the final classifier as well as the final node embeddings produced by our GNN. -# We proceed to initialize our final model via `GCN()`, and printing our model produces a summary of all its used sub-modules. - -# ### Embedding the Karate Club Network - -# Let's take a look at the node embeddings produced by our GNN. -# Here, we pass in the initial node features `x` and the graph information `g` to the model, and visualize its 2-dimensional embedding. - - -num_features = 34 -num_classes = 4 -gcn = GCN(num_features, num_classes) - -# -_, h = gcn(g, g.ndata.x); - -# - -function visualize_embeddings(h; colors = nothing) - xs = h[1, :] |> vec - ys = h[2, :] |> vec - Makie.scatter(xs, ys, color = labels, markersize = 20) -end - -visualize_embeddings(h, colors = labels) - -# Remarkably, even before training the weights of our model, the model produces an embedding of nodes that closely resembles the community-structure of the graph. 
-# Nodes of the same color (community) are already closely clustered together in the embedding space, although the weights of our model are initialized **completely at random** and we have not yet performed any training so far! -# This leads to the conclusion that GNNs introduce a strong inductive bias, leading to similar embeddings for nodes that are close to each other in the input graph. - -# ### Training on the Karate Club Network - -# But can we do better? Let's look at an example on how to train our network parameters based on the knowledge of the community assignments of 4 nodes in the graph (one for each community). - -# Since everything in our model is differentiable and parameterized, we can add some labels, train the model and observe how the embeddings react. -# Here, we make use of a semi-supervised or transductive learning procedure: we simply train against one node per class, but are allowed to make use of the complete input graph data. - -# Training our model is very similar to any other Flux model. -# In addition to defining our network architecture, we define a loss criterion (here, `logitcrossentropy`), and initialize a stochastic gradient optimizer (here, `Adam`). -# After that, we perform multiple rounds of optimization, where each round consists of a forward and backward pass to compute the gradients of our model parameters w.r.t. to the loss derived from the forward pass. -# If you are not new to Flux, this scheme should appear familiar to you. - -# Note that our semi-supervised learning scenario is achieved by the following line: -# ```julia -# loss = logitcrossentropy(ŷ[:,train_mask], y[:,train_mask]) -# ``` - -# While we compute node embeddings for all of our nodes, we **only make use of the training nodes for computing the loss**. -# Here, this is implemented by filtering the output of the classifier `out` and ground-truth labels `data.y` to only contain the nodes in the `train_mask`. - -# Let us now start training and see how our node embeddings evolve over time (best experienced by explicitly running the code): - -model = GCN(num_features, num_classes) -opt = Flux.setup(Adam(1e-2), model) -epochs = 2000 - -emb = h -function report(epoch, loss, h) - @info (; epoch, loss) -end - -report(0, 10.0, emb) -for epoch in 1:epochs - loss, grad = Flux.withgradient(model) do model - ŷ, emb = model(g, g.ndata.x) - logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) - end - - Flux.update!(opt, model, grad[1]) - if epoch % 200 == 0 - report(epoch, loss, emb) - end -end; - -# -ŷ, emb_final = model(g, g.ndata.x) - -# Train accuracy: - -mean(onecold(ŷ[:, train_mask]) .== onecold(y[:, train_mask])) - -# Test accuracy: - -mean(onecold(ŷ[:, .!train_mask]) .== onecold(y[:, .!train_mask])) - -# Final embedding: - -visualize_embeddings(emb_final, colors = labels) - -# As one can see, our 3-layer GCN model manages to linearly separating the communities and classifying most of the nodes correctly. - -# Furthermore, we did this all with a few lines of code, thanks to the GraphNeuralNetworks.jl which helped us out with data handling and GNN implementations. 
diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/graph_classification.jl b/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/graph_classification.jl deleted file mode 100644 index c14eacc51..000000000 --- a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/graph_classification.jl +++ /dev/null @@ -1,205 +0,0 @@ -# # Graph Classification with Graph Neural Networks - -# *This tutorial is a julia adaptation of the Pytorch Geometric tutorials that can be found [here](https://pytorch-geometric.readthedocs.io/en/latest/notes/colabs.html).* - -# In this tutorial session we will have a closer look at how to apply **Graph Neural Networks (GNNs) to the task of graph classification**. -# Graph classification refers to the problem of classifying entire graphs (in contrast to nodes), given a **dataset of graphs**, based on some structural graph properties and possibly on some input node features. -# Here, we want to embed entire graphs, and we want to embed those graphs in such a way so that they are linearly separable given a task at hand. -# We will use a graph convolutional network to create a vector embedding of the input graph, and the apply a simple linear classification head to perform the final classification. - -# A common graph classification task is **molecular property prediction**, in which molecules are represented as graphs, and the task may be to infer whether a molecule inhibits HIV virus replication or not. - -# The TU Dortmund University has collected a wide range of different graph classification datasets, known as the [**TUDatasets**](https://chrsmrrs.github.io/datasets/), which are also accessible via MLDatasets.jl. -# Let's import the necessary packages. Then we'll load and inspect one of the smaller ones, the **MUTAG dataset**: - - -using Flux, GraphNeuralNetworks -using Flux: onecold, onehotbatch, logitcrossentropy, DataLoader -using MLDatasets, MLUtils -using LinearAlgebra, Random, Statistics - -ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation -Random.seed!(42); # for reproducibility -# - -dataset = TUDataset("MUTAG") - -# -dataset.graph_data.targets |> union - -# -g1, y1 = dataset[1] # get the first graph and target - -# -reduce(vcat, g.node_data.targets for (g, _) in dataset) |> union - -# -reduce(vcat, g.edge_data.targets for (g, _) in dataset) |> union - -# This dataset provides **188 different graphs**, and the task is to classify each graph into **one out of two classes**. - -# By inspecting the first graph object of the dataset, we can see that it comes with **17 nodes** and **38 edges**. -# It also comes with exactly **one graph label**, and provides additional node labels (7 classes) and edge labels (4 classes). -# However, for the sake of simplicity, we will not make use of edge labels. 
- -# We now convert the `MLDatasets.jl` graph types to our `GNNGraph`s and we also onehot encode both the node labels (which will be used as input features) and the graph labels (what we want to predict): - -graphs = mldataset2gnngraph(dataset) -graphs = [GNNGraph(g, - ndata = Float32.(onehotbatch(g.ndata.targets, 0:6)), - edata = nothing) - for g in graphs] -y = onehotbatch(dataset.graph_data.targets, [-1, 1]) - - -# We have some useful utilities for working with graph datasets, *e.g.*, we can shuffle the dataset and use the first 150 graphs as training graphs, while using the remaining ones for testing: - -train_data, test_data = splitobs((graphs, y), at = 150, shuffle = true) |> getobs - - -train_loader = DataLoader(train_data, batchsize = 32, shuffle = true) -test_loader = DataLoader(test_data, batchsize = 32, shuffle = false) - -# Here, we opt for a `batch_size` of 32, leading to 5 (randomly shuffled) mini-batches, containing all $4 \cdot 32+22 = 150$ graphs. - - -# ## Mini-batching of graphs - -# Since graphs in graph classification datasets are usually small, a good idea is to **batch the graphs** before inputting them into a Graph Neural Network to guarantee full GPU utilization. -# In the image or language domain, this procedure is typically achieved by **rescaling** or **padding** each example into a set of equally-sized shapes, and examples are then grouped in an additional dimension. -# The length of this dimension is then equal to the number of examples grouped in a mini-batch and is typically referred to as the `batchsize`. - - -# However, for GNNs the two approaches described above are either not feasible or may result in a lot of unnecessary memory consumption. -# Therefore, GraphNeuralNetworks.jl opts for another approach to achieve parallelization across a number of examples. Here, adjacency matrices are stacked in a diagonal fashion (creating a giant graph that holds multiple isolated subgraphs), and node and target features are simply concatenated in the node dimension (the last dimension). - -# This procedure has some crucial advantages over other batching procedures: - -# 1. GNN operators that rely on a message passing scheme do not need to be modified since messages are not exchanged between two nodes that belong to different graphs. - -# 2. There is no computational or memory overhead since adjacency matrices are saved in a sparse fashion holding only non-zero entries, *i.e.*, the edges. - -# GraphNeuralNetworks.jl can **batch multiple graphs into a single giant graph**: - - -vec_gs, _ = first(train_loader) - -# -MLUtils.batch(vec_gs) - - -# Each batched graph object is equipped with a **`graph_indicator` vector**, which maps each node to its respective graph in the batch: - -# ```math -# \textrm{graph\_indicator} = [1, \ldots, 1, 2, \ldots, 2, 3, \ldots ] -# ``` - - -# ## Training a Graph Neural Network (GNN) - -# Training a GNN for graph classification usually follows a simple recipe: - -# 1. Embed each node by performing multiple rounds of message passing -# 2. Aggregate node embeddings into a unified graph embedding (**readout layer**) -# 3. 
Train a final classifier on the graph embedding - -# There exists multiple **readout layers** in literature, but the most common one is to simply take the average of node embeddings: - -# ```math -# \mathbf{x}_{\mathcal{G}} = \frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \mathcal{x}^{(L)}_v -# ``` - -# GraphNeuralNetworks.jl provides this functionality via `GlobalPool(mean)`, which takes in the node embeddings of all nodes in the mini-batch and the assignment vector `graph_indicator` to compute a graph embedding of size `[hidden_channels, batchsize]`. - -# The final architecture for applying GNNs to the task of graph classification then looks as follows and allows for complete end-to-end training: - -function create_model(nin, nh, nout) - GNNChain(GCNConv(nin => nh, relu), - GCNConv(nh => nh, relu), - GCNConv(nh => nh), - GlobalPool(mean), - Dropout(0.5), - Dense(nh, nout)) -end; - - -# Here, we again make use of the `GCNConv` with $\mathrm{ReLU}(x) = \max(x, 0)$ activation for obtaining localized node embeddings, before we apply our final classifier on top of a graph readout layer. - -# Let's train our network for a few epochs to see how well it performs on the training as well as test set: - - - -function eval_loss_accuracy(model, data_loader, device) - loss = 0.0 - acc = 0.0 - ntot = 0 - for (g, y) in data_loader - g, y = MLUtils.batch(g) |> device, y |> device - n = length(y) - ŷ = model(g, g.ndata.x) - loss += logitcrossentropy(ŷ, y) * n - acc += mean((ŷ .> 0) .== y) * n - ntot += n - end - return (loss = round(loss / ntot, digits = 4), - acc = round(acc * 100 / ntot, digits = 2)) -end - - -function train!(model; epochs = 200, η = 1e-3, infotime = 10) - ## device = Flux.gpu # uncomment this for GPU training - device = Flux.cpu - model = model |> device - opt = Flux.setup(Adam(η), model) - - function report(epoch) - train = eval_loss_accuracy(model, train_loader, device) - test = eval_loss_accuracy(model, test_loader, device) - @info (; epoch, train, test) - end - - report(0) - for epoch in 1:epochs - for (g, y) in train_loader - g, y = MLUtils.batch(g) |> device, y |> device - grad = Flux.gradient(model) do model - ŷ = model(g, g.ndata.x) - logitcrossentropy(ŷ, y) - end - Flux.update!(opt, model, grad[1]) - end - epoch % infotime == 0 && report(epoch) - end -end - - -nin = 7 -nh = 64 -nout = 2 -model = create_model(nin, nh, nout) -train!(model) - - - -# As one can see, our model reaches around **75% test accuracy**. -# Reasons for the fluctuations in accuracy can be explained by the rather small dataset (only 38 test graphs), and usually disappear once one applies GNNs to larger datasets. - -# ## (Optional) Exercise - -# Can we do better than this? -# As multiple papers pointed out ([Xu et al. (2018)](https://arxiv.org/abs/1810.00826), [Morris et al. (2018)](https://arxiv.org/abs/1810.02244)), applying **neighborhood normalization decreases the expressivity of GNNs in distinguishing certain graph structures**. -# An alternative formulation ([Morris et al. (2018)](https://arxiv.org/abs/1810.02244)) omits neighborhood normalization completely and adds a simple skip-connection to the GNN layer in order to preserve central node information: - -# ```math -# \mathbf{x}_i^{(\ell+1)} = \mathbf{W}^{(\ell + 1)}_1 \mathbf{x}_i^{(\ell)} + \mathbf{W}^{(\ell + 1)}_2 \sum_{j \in \mathcal{N}(i)} \mathbf{x}_j^{(\ell)} -# ``` - -# This layer is implemented under the name `GraphConv` in GraphNeuralNetworks.jl. 
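# `GraphConv` is used like `GCNConv`: it takes the graph and the node features.
# A quick, illustrative check on one mini-batch (the 7 input features come from the
# one-hot node labels above; the 64 hidden channels are just an example):

gbatch = MLUtils.batch(vec_gs)
l = GraphConv(7 => 64, relu)        # W1 * x_i + W2 * Σ_{j ∈ N(i)} x_j
size(l(gbatch, gbatch.ndata.x))     # (64, total number of nodes in the batch)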
- -# As an exercise, you are invited to complete the following code to the extent that it makes use of `GraphConv` rather than `GCNConv`. -# This should bring you close to **82% test accuracy**. - -# ## Conclusion - -# In this chapter, you have learned how to apply GNNs to the task of graph classification. -# You have learned how graphs can be batched together for better GPU utilization, and how to apply readout layers for obtaining graph embeddings rather than node embeddings. diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/node_classification.jl b/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/node_classification.jl deleted file mode 100644 index 16e82eee8..000000000 --- a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/node_classification.jl +++ /dev/null @@ -1,275 +0,0 @@ -# # Node Classification with Graph Neural Networks - -# In this tutorial, we will be learning how to use Graph Neural Networks (GNNs) for node classification. Given the ground-truth labels of only a small subset of nodes, and want to infer the labels for all the remaining nodes (transductive learning). - -# ## Import -# Let us start off by importing some libraries. We will be using `Flux.jl` and `GraphNeuralNetworks.jl` for our tutorial. - -using Flux, GraphNeuralNetworks -using Flux: onecold, onehotbatch, logitcrossentropy -using MLDatasets -using Plots, TSne -using Statistics, Random - -ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation -Random.seed!(17); # for reproducibility - -# ## Visualize -# We want to visualize our results using t-distributed stochastic neighbor embedding (tsne) to project our output onto a 2D plane. - -function visualize_tsne(out, targets) - z = tsne(out, 2) - scatter(z[:, 1], z[:, 2], color = Int.(targets[1:size(z, 1)]), leg = false) -end; - -# ## Dataset: Cora - -# For our tutorial, we will be using the `Cora` dataset. `Cora` is a citation network of 2708 documents categorized into seven classes with 5,429 citation links. Each node represents an article or document, and edges between nodes indicate a citation relationship, where one cites the other. - -# Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words. - -# This dataset was first introduced by [Yang et al. (2016)](https://arxiv.org/abs/1603.08861) as one of the datasets of the `Planetoid` benchmark suite. We will be using [MLDatasets.jl](https://juliaml.github.io/MLDatasets.jl/stable/) for an easy access to this dataset. - -dataset = Cora() - -# Datasets in MLDatasets.jl have `metadata` containing information about the dataset itself. - -dataset.metadata - -# The `graphs` variable contains the graph. The `Cora` dataset contains only 1 graph. - - -dataset.graphs - -# There is only one graph of the dataset. The `node_data` contains `features` indicating if certain words are present or not and `targets` indicating the class for each document. We convert the single-graph dataset to a `GNNGraph`. 
- -g = mldataset2gnngraph(dataset) - -println("Number of nodes: $(g.num_nodes)") -println("Number of edges: $(g.num_edges)") -println("Average node degree: $(g.num_edges / g.num_nodes)") -println("Number of training nodes: $(sum(g.ndata.train_mask))") -println("Training node label rate: $(mean(g.ndata.train_mask))") -println("Has isolated nodes: $(has_isolated_nodes(g))") -println("Has self-loops: $(has_self_loops(g))") -println("Is undirected: $(is_bidirected(g))") - - -# Overall, this dataset is quite similar to the previously used [`KarateClub`](https://juliaml.github.io/MLDatasets.jl/stable/datasets/graphs/#MLDatasets.KarateClub) network. -# We can see that the `Cora` network holds 2,708 nodes and 10,556 edges, resulting in an average node degree of 3.9. -# For training this dataset, we are given the ground-truth categories of 140 nodes (20 for each class). -# This results in a training node label rate of only 5%. - -# We can further see that this network is undirected, and that there exists no isolated nodes (each document has at least one citation). - -x = g.ndata.features # we onehot encode both the node labels (what we want to predict): -y = onehotbatch(g.ndata.targets, 1:7) -train_mask = g.ndata.train_mask -num_features = size(x)[1] -hidden_channels = 16 -num_classes = dataset.metadata["num_classes"]; - -# ## Multi-layer Perception Network (MLP) - -# In theory, we should be able to infer the category of a document solely based on its content, *i.e.* its bag-of-words feature representation, without taking any relational information into account. - -# Let's verify that by constructing a simple MLP that solely operates on input node features (using shared weights across all nodes): - -struct MLP - layers::NamedTuple -end - -Flux.@layer :expand MLP - -function MLP(num_features, num_classes, hidden_channels; drop_rate = 0.5) - layers = (hidden = Dense(num_features => hidden_channels), - drop = Dropout(drop_rate), - classifier = Dense(hidden_channels => num_classes)) - return MLP(layers) -end; - -function (model::MLP)(x::AbstractMatrix) - l = model.layers - x = l.hidden(x) - x = relu(x) - x = l.drop(x) - x = l.classifier(x) - return x -end; - -# ### Training a Multilayer Perceptron - -# Our MLP is defined by two linear layers and enhanced by [ReLU](https://fluxml.ai/Flux.jl/stable/models/nnlib/#NNlib.relu) non-linearity and [Dropout](https://fluxml.ai/Flux.jl/stable/models/layers/#Flux.Dropout). -# Here, we first reduce the 1433-dimensional feature vector to a low-dimensional embedding (`hidden_channels=16`), while the second linear layer acts as a classifier that should map each low-dimensional node embedding to one of the 7 classes. - -# Let's train our simple MLP by following a similar procedure as described in [the first part of this tutorial](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/tutorials/gnn_intro/). -# We again make use of the **cross entropy loss** and **Adam optimizer**. -# This time, we also define a **`accuracy` function** to evaluate how well our final model performs on the test node set (which labels have not been observed during training). 
- -function train(model::MLP, data::AbstractMatrix, epochs::Int, opt) - Flux.trainmode!(model) - - for epoch in 1:epochs - loss, grad = Flux.withgradient(model) do model - ŷ = model(data) - logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) - end - - Flux.update!(opt, model, grad[1]) - if epoch % 200 == 0 - @show epoch, loss - end - end -end; - -function accuracy(model::MLP, x::AbstractMatrix, y::Flux.OneHotArray, mask::BitVector) - Flux.testmode!(model) - mean(onecold(model(x))[mask] .== onecold(y)[mask]) -end; - -mlp = MLP(num_features, num_classes, hidden_channels) -opt_mlp = Flux.setup(Adam(1e-3), mlp) -epochs = 2000 -train(mlp, g.ndata.features, epochs, opt_mlp) - -# After training the model, we can call the `accuracy` function to see how well our model performs on unseen labels. -# Here, we are interested in the accuracy of the model, *i.e.*, the ratio of correctly classified nodes: - -accuracy(mlp, g.ndata.features, y, .!train_mask) - - -# As one can see, our MLP performs rather bad with only about ~50% test accuracy. -# But why does the MLP do not perform better? -# The main reason for that is that this model suffers from heavy overfitting due to only having access to a **small amount of training nodes**, and therefore generalizes poorly to unseen node representations. - -# It also fails to incorporate an important bias into the model: **Cited papers are very likely related to the category of a document**. -# That is exactly where Graph Neural Networks come into play and can help to boost the performance of our model. - - - -# ## Training a Graph Convolutional Neural Network (GNN) - -# Following-up on the first part of this tutorial, we replace the `Dense` linear layers by the [`GCNConv`](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/api/conv/#GraphNeuralNetworks.GCNConv) module. -# To recap, the **GCN layer** ([Kipf et al. (2017)](https://arxiv.org/abs/1609.02907)) is defined as - -# ```math -# \mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \sum_{w \in \mathcal{N}(v) \, \cup \, \{ v \}} \frac{1}{c_{w,v}} \cdot \mathbf{x}_w^{(\ell)} -# ``` - -# where $\mathbf{W}^{(\ell + 1)}$ denotes a trainable weight matrix of shape `[num_output_features, num_input_features]` and $c_{w,v}$ refers to a fixed normalization coefficient for each edge. -# In contrast, a single `Linear` layer is defined as - -# ```math -# \mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \mathbf{x}_v^{(\ell)} -# ``` - -# which does not make use of neighboring node information. - -struct GCN - layers::NamedTuple -end - -Flux.@layer GCN # provides parameter collection, gpu movement and more - -function GCN(num_features, num_classes, hidden_channels; drop_rate = 0.5) - layers = (conv1 = GCNConv(num_features => hidden_channels), - drop = Dropout(drop_rate), - conv2 = GCNConv(hidden_channels => num_classes)) - return GCN(layers) -end; - -function (gcn::GCN)(g::GNNGraph, x::AbstractMatrix) - l = gcn.layers - x = l.conv1(g, x) - x = relu.(x) - x = l.drop(x) - x = l.conv2(g, x) - return x -end; - - -# Now let's visualize the node embeddings of our **untrained** GCN network. - -gcn = GCN(num_features, num_classes, hidden_channels) -h_untrained = gcn(g, x) |> transpose -visualize_tsne(h_untrained, g.ndata.targets) - - -# We certainly can do better by training our model. -# The training and testing procedure is once again the same, but this time we make use of the node features `x` **and** the graph `g` as input to our GCN model. 
- -function train(model::GCN, g::GNNGraph, x::AbstractMatrix, epochs::Int, opt) - Flux.trainmode!(model) - - for epoch in 1:epochs - loss, grad = Flux.withgradient(model) do model - ŷ = model(g, x) - logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) - end - - Flux.update!(opt, model, grad[1]) - if epoch % 200 == 0 - @show epoch, loss - end - end -end; - -# - -mlp = MLP(num_features, num_classes, hidden_channels) -opt_mlp = Flux.setup(Adam(1e-3), mlp) -epochs = 2000 -train(mlp, g.ndata.features, epochs, opt_mlp) - -# -function accuracy(model::GCN, g::GNNGraph, x::AbstractMatrix, y::Flux.OneHotArray, - mask::BitVector) - Flux.testmode!(model) - mean(onecold(model(g, x))[mask] .== onecold(y)[mask]) -end - -# - -accuracy(mlp, g.ndata.features, y, .!train_mask) - -# - -opt_gcn = Flux.setup(Adam(1e-2), gcn) -train(gcn, g, x, epochs, opt_gcn) - - -# Now let's evaluate the loss of our trained GCN. - -train_accuracy = accuracy(gcn, g, g.ndata.features, y, train_mask) -test_accuracy = accuracy(gcn, g, g.ndata.features, y, .!train_mask) - -println("Train accuracy: $(train_accuracy)") -println("Test accuracy: $(test_accuracy)") - - -# **There it is!** -# By simply swapping the linear layers with GNN layers, we can reach **76% of test accuracy**! -# This is in stark contrast to the 59% of test accuracy obtained by our MLP, indicating that relational information plays a crucial role in obtaining better performance. - -# We can also verify that once again by looking at the output embeddings of our trained model, which now produces a far better clustering of nodes of the same category. - - -Flux.testmode!(gcn) # inference mode - -out_trained = gcn(g, x) |> transpose -visualize_tsne(out_trained, g.ndata.targets) - - - -# ## (Optional) Exercises - -# 1. To achieve better model performance and to avoid overfitting, it is usually a good idea to select the best model based on an additional validation set. The `Cora` dataset provides a validation node set as `g.ndata.val_mask`, but we haven't used it yet. Can you modify the code to select and test the model with the highest validation performance? This should bring test performance to **82% accuracy**. - -# 2. How does `GCN` behave when increasing the hidden feature dimensionality or the number of layers? Does increasing the number of layers help at all? - -# 3. You can try to use different GNN layers to see how model performance changes. What happens if you swap out all `GCNConv` instances with [`GATConv`](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/api/conv/#GraphNeuralNetworks.GATConv) layers that make use of attention? Try to write a 2-layer `GAT` model that makes use of 8 attention heads in the first layer and 1 attention head in the second layer, uses a `dropout` ratio of `0.6` inside and outside each `GATConv` call, and uses a `hidden_channels` dimensions of `8` per head. - - - -# ## Conclusion -# In this tutorial, we have seen how to apply GNNs to real-world problems, and, in particular, how they can effectively be used for boosting a model's performance. In the next tutorial, we will look into how GNNs can be used for the task of graph classification. 
diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/traffic_prediction.jl b/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/traffic_prediction.jl deleted file mode 100644 index c869fb664..000000000 --- a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/traffic_prediction.jl +++ /dev/null @@ -1,143 +0,0 @@ -# # Traffic Prediction using recurrent Temporal Graph Convolutional Network - -# In this tutorial, we will learn how to use a recurrent Temporal Graph Convolutional Network (TGCN) to predict traffic in a spatio-temporal setting. Traffic forecasting is the problem of predicting future traffic trends on a road network given historical traffic data, such as, in our case, traffic speed and time of day. - -# ## Import - -# We start by importing the necessary libraries. We use `GraphNeuralNetworks.jl`, `Flux.jl` and `MLDatasets.jl`, among others. - -using Flux, GraphNeuralNetworks -using Flux.Losses: mae -using MLDatasets: METRLA -using Statistics, Plots, Random - -ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation -Random.seed!(42); # for reproducibility - -# ## Dataset: METR-LA - -# We use the `METR-LA` dataset from the paper [Diffusion Convolutional Recurrent Neural Network: Data-driven Traffic Forecasting](https://arxiv.org/pdf/1707.01926.pdf), which contains traffic data from loop detectors in the highway of Los Angeles County. The dataset contains traffic speed data from March 1, 2012 to June 30, 2012. The data is collected every 5 minutes, resulting in 12 observations per hour, from 207 sensors. Each sensor is a node in the graph, and the edge weights are the distances between the sensor locations. - -dataset_metrla = METRLA(; num_timesteps = 3) -# -g = dataset_metrla[1] - - -# `edge_data` contains the weights of the edges of the graph and -# `node_data` contains a node feature vector and a target vector. The latter vectors contain batches of dimension `num_timesteps`, which means that they contain vectors with the node features and targets of `num_timesteps` time steps. Two consecutive batches are shifted by one-time step. -# The node features are the traffic speed of the sensors and the time of the day, and the targets are the traffic speed of the sensors in the next time step. -# Let's see some examples: - -features = map(x -> permutedims(x,(1,3,2)), g.node_data.features) - -size(features[1]) - -# The first dimension corresponds to the two features (the first line the speed value and the second line the time of day), the second to the number of time steps `num_timesteps` and the third to the nodes. - -targets = map(x -> permutedims(x,(1,3,2)), g.node_data.targets) - -size(targets[1]) - -# In the case of the targets the first dimension is 1 because they store just the speed value. - -features[1][:,:,1] - -# -features[2][:,:,1] - -# -targets[1][:,:,1] - -# -function plot_data(data,sensor) - p = plot(legend=false, xlabel="Time (h)", ylabel="Normalized speed") - plotdata = [] - for i in 1:3:length(data) - push!(plotdata,data[i][1,:,sensor]) - end - plotdata = reduce(vcat,plotdata) - plot!(p, collect(1:length(data)), plotdata, color = :green, xticks =([i for i in 0:50:250], ["$(i)" for i in 0:4:24])) - return p -end - -plot_data(features[1:288],1) # Plot the speed of the first sensor for the first day - -# Now let's construct the static graph, the `train_loader` and `data_loader`. 
- -graph = GNNGraph(g.edge_index; edata = g.edge_data, g.num_nodes); - -train_loader = zip(features[1:288], targets[1:288]); # train on 24 hours -test_loader = zip(features[289:577], targets[289:577]); # test on next 24 hours - -# ## Model: T-GCN - -# We use the T-GCN model from the paper [T-GCN: A Temporal Graph Convolutional Network for Traffic Prediction] (https://arxiv.org/pdf/1811.05320.pdf), which consists of a graph convolutional network (GCN) and a gated recurrent unit (GRU). The GCN is used to capture spatial features from the graph, and the GRU is used to capture temporal features from the feature time series. - -model = GNNChain(TGCN(2 => 100; add_self_loops = false), Dense(100, 1)) - -# Let's look at the output of the model for the first batch of the training data. - -model(graph, features[1]) - -# The output of the model is a tensor of size `(1, 3, 207)`, which corresponds to the dimension of the feature (in this case speed), the number of time steps, and the number of nodes in the graph, respectively. The model outputs the predicted traffic speed for each sensor at each time step. - -# ![](https://www.researchgate.net/profile/Haifeng-Li-3/publication/335353434/figure/fig4/AS:851870352437249@1580113127759/The-architecture-of-the-Gated-Recurrent-Unit-model.jpg) - -# ## Training - -# We train the model for 100 epochs, using the Adam optimizer with a learning rate of 0.001. We use the mean absolute error (MAE) as the loss function. - -function train(graph, train_loader, model) - - opt = Flux.setup(Adam(0.001), model) - - for epoch in 1:100 - for (x, y) in train_loader - x, y = (x, y) - grads = Flux.gradient(model) do model - ŷ = model(graph, x) - Flux.mae(ŷ, y) - end - Flux.update!(opt, model, grads[1]) - end - - if epoch % 10 == 0 - loss = mean([Flux.mae(model(graph,x), y) for (x, y) in train_loader]) - @show epoch, loss - end - end - return model -end - -train(graph, train_loader, model) - -# -function plot_predicted_data(graph, features, targets, sensor) - p = plot(xlabel="Time (h)", ylabel="Normalized speed") - prediction = [] - ground_truth = [] - for i in 1:3:length(features) - push!(ground_truth,targets[i][1,:,sensor]) - push!(prediction, model(graph, features[i])[1,:,sensor]) - end - prediction = reduce(vcat,prediction) - ground_truth = reduce(vcat, ground_truth) - plot!(p, collect(1:length(prediction)), prediction, color = :red, label= "Prediction") - plot!(p, collect(1:length(ground_truth)), ground_truth, color = :blue, label = "Ground Truth", xticks = ([i for i in 0:50:250], ["$(i)" for i in 0:4:20])) - return p -end - -plot_predicted_data(graph,features[289:577],targets[289:577], 1) - -# -accuracy(ŷ, y) = 1 - Statistics.norm(y-ŷ)/Statistics.norm(y) -# Test accuracy: -mean([accuracy(model(graph,x), y) for (x, y) in test_loader]) - - -# The accuracy is not very good but can be improved by training using more data. We used a small subset of the dataset for this tutorial because of the computational cost of training the model. From the plot of the predictions, we can see that the model is able to capture the general trend of the traffic speed, but it is not able to capture the peaks of the traffic. - -# ## Conclusion - -# In this tutorial, we learned how to use a recurrent temporal graph convolutional network to predict traffic in a spatio-temporal setting. We used the TGCN model, which consists of a graph convolutional network (GCN) and a gated recurrent unit (GRU). We then trained the model for 100 epochs on a small subset of the METR-LA dataset. 
The accuracy of the model is not very good, but it can be improved by training on more data. - diff --git a/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_gnn_intro.png b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_gnn_intro.png new file mode 100644 index 000000000..c91c4e756 Binary files /dev/null and b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_gnn_intro.png differ diff --git a/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_graph_classification.png b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_graph_classification.png new file mode 100644 index 000000000..8d0ec1f16 Binary files /dev/null and b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_graph_classification.png differ diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/graph_classification.gif b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_graph_classification2.gif similarity index 100% rename from GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/graph_classification.gif rename to GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_graph_classification2.gif diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/intro_1.png b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_intro2.png similarity index 100% rename from GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/intro_1.png rename to GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_intro2.png diff --git a/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_node_classification.svg b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_node_classification.svg new file mode 100644 index 000000000..c416be4bc --- /dev/null +++ b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_node_classification.svg @@ -0,0 +1,2748 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/node_classsification.gif b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_node_classsification2.gif similarity index 100% rename from GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/node_classsification.gif rename to GraphNeuralNetworks/docs/tutorials/beginner_tutorials/assets/cover_node_classsification2.gif diff --git a/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/gnn_intro.md b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/gnn_intro.md new file mode 100644 index 000000000..1eaa60815 --- /dev/null +++ b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/gnn_intro.md @@ -0,0 +1,264 @@ +# Hands-on introduction to Graph Neural Networks + +*This tutorial is a Julia adaptation of the Pytorch Geometric tutorials that can be found [here](https://pytorch-geometric.readthedocs.io/en/latest/notes/colabs.html).* + +![cover](assets/cover_gnn_intro.png) + +Recently, deep learning on graphs has emerged to one of the hottest research fields in the deep learning community. +Here, **Graph Neural Networks (GNNs)** aim to generalize classical deep learning concepts to irregular structured data (in contrast to images or texts) and to enable neural networks to reason about objects and their relations. + +This is done by following a simple **neural message passing scheme**, where node features $\mathbf{x}_i^{(\ell)}$ of all nodes $i \in \mathcal{V}$ in a graph $\mathcal{G} = (\mathcal{V}, \mathcal{E})$ are iteratively updated by aggregating localized information from their neighbors $\mathcal{N}(i)$: + +```math +\mathbf{x}_i^{(\ell + 1)} = f^{(\ell + 1)}_{\theta} \left( \mathbf{x}_i^{(\ell)}, \left\{ \mathbf{x}_j^{(\ell)} : j \in \mathcal{N}(i) \right\} \right) +``` + +This tutorial will introduce you to some fundamental concepts regarding deep learning on graphs via Graph Neural Networks based on the **[GraphNeuralNetworks.jl library](https://github.com/JuliaGraphs/GraphNeuralNetworks.jl)**. +GraphNeuralNetworks.jl is an extension library to the popular deep learning framework [Flux.jl](https://fluxml.ai/Flux.jl/stable/), and consists of various methods and utilities to ease the implementation of Graph Neural Networks. + +Let's first import the packages we need: + +```@example intro +using Flux, GraphNeuralNetworks +using Flux: onecold, onehotbatch, logitcrossentropy +using MLDatasets +using LinearAlgebra, Random, Statistics +import GraphMakie +import CairoMakie as Makie +using FileIO: FileIO # save cover image + +ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation +rng = Random.seed!(17); # for reproducibility +``` + +Following [Kipf et al. (2017)](https://arxiv.org/abs/1609.02907), let's dive into the world of GNNs by looking at a simple graph-structured example, the well-known [**Zachary's karate club network**](https://en.wikipedia.org/wiki/Zachary%27s_karate_club). This graph describes a social network of 34 members of a karate club and documents links between members who interacted outside the club. 
Here, we are interested in detecting communities that arise from the member's interaction. + +GraphNeuralNetworks.jl provides utilities to convert [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl)'s datasets to its own type: + +```@example intro +dataset = MLDatasets.KarateClub() +``` +After initializing the `KarateClub` dataset, we first can inspect some of its properties. +For example, we can see that this dataset holds exactly **one graph**. +Furthermore, the graph holds exactly **4 classes**, which represent the community each node belongs to. + +```@example intro +karate = dataset[1] + +karate.node_data.labels_comm +``` +Now we convert the single-graph dataset to a `GNNGraph`. Moreover, we add a an array of node features, a **34-dimensional feature vector** for each node which uniquely describes the members of the karate club. We also add a training mask selecting the nodes to be used for training in our semi-supervised node classification task. + +```@example intro +g = mldataset2gnngraph(dataset) # convert a MLDatasets.jl's dataset to a GNNGraphs (or a collection of graphs) + +x = zeros(Float32, g.num_nodes, g.num_nodes) +x[diagind(x)] .= 1 + +train_mask = [true, false, false, false, true, false, false, false, true, + false, false, false, false, false, false, false, false, false, false, false, + false, false, false, false, true, false, false, false, false, false, + false, false, false, false] + +labels = g.ndata.labels_comm +y = onehotbatch(labels, 0:3) + +g = GNNGraph(g, ndata = (; x, y, train_mask)) +``` +Let's now look at the underlying graph in more detail: + +```@example intro +println("Number of nodes: $(g.num_nodes)") +println("Number of edges: $(g.num_edges)") +println("Average node degree: $(g.num_edges / g.num_nodes)") +println("Number of training nodes: $(sum(g.ndata.train_mask))") +println("Training node label rate: $(mean(g.ndata.train_mask))") +println("Has isolated nodes: $(has_isolated_nodes(g))") +println("Has self-loops: $(has_self_loops(g))") +println("Is undirected: $(is_bidirected(g))") +``` +Each graph in GraphNeuralNetworks.jl is represented by a `GNNGraph` object, which holds all the information to describe its graph representation. +We can print the data object anytime via `print(g)` to receive a short summary about its attributes and their shapes. + +The `g` object holds 3 attributes: +- `g.ndata`: contains node-related information. +- `g.edata`: holds edge-related information. +- `g.gdata`: this stores the global data, therefore neither node nor edge-specific features. + +These attributes are `NamedTuples` that can store multiple feature arrays: we can access a specific set of features e.g. `x`, with `g.ndata.x`. + + +In our task, `g.ndata.train_mask` describes for which nodes we already know their community assignments. In total, we are only aware of the ground-truth labels of 4 nodes (one for each community), and the task is to infer the community assignment for the remaining nodes. + +The `g` object also provides some **utility functions** to infer some basic properties of the underlying graph. +For example, we can easily infer whether there exist isolated nodes in the graph (*i.e.* there exists no edge to any node), whether the graph contains self-loops (*i.e.*, $(v, v) \in \mathcal{E}$), or whether the graph is bidirected (*i.e.*, for each edge $(v, w) \in \mathcal{E}$ there also exists the edge $(w, v) \in \mathcal{E}$). 
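+
+As a quick sketch of these utilities (on a hypothetical toy graph, not the karate club network), consider a directed 3-cycle:
+
+```@example intro
+tiny = GNNGraph(([1, 2, 3], [2, 3, 1]))  # hand-made COO graph, for illustration only
+(has_isolated_nodes(tiny), has_self_loops(tiny), is_bidirected(tiny))
+```
+
+Since no reverse edges are present, `is_bidirected` returns `false` here, in contrast to the karate club graph above.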
+ +Let us now inspect the `edge_index` method: + +```@example intro +edge_index(g) +``` +By printing `edge_index(g)`, we can understand how GraphNeuralNetworks.jl represents graph connectivity internally. +We can see that for each edge, `edge_index` holds a tuple of two node indices, where the first value describes the node index of the source node and the second value describes the node index of the destination node of an edge. + +This representation is known as the **COO format (coordinate format)** commonly used for representing sparse matrices. +Instead of holding the adjacency information in a dense representation $\mathbf{A} \in \{ 0, 1 \}^{|\mathcal{V}| \times |\mathcal{V}|}$, GraphNeuralNetworks.jl represents graphs sparsely, which refers to only holding the coordinates/values for which entries in $\mathbf{A}$ are non-zero. + +Importantly, GraphNeuralNetworks.jl does not distinguish between directed and undirected graphs, and treats undirected graphs as a special case of directed graphs in which reverse edges exist for every entry in the `edge_index`. + +Since a `GNNGraph` is an `AbstractGraph` from the `Graphs.jl` library, it supports graph algorithms and visualization tools from the wider julia graph ecosystem: + +```@example intro +# no axis ticks +p = GraphMakie.graphplot(g |> to_unidirected, node_size = 20, node_color = labels, arrow_show = false) +FileIO.save("assets/cover_gnn_intro.png", p) # hide +p +``` +## Implementing Graph Neural Networks + +After learning about GraphNeuralNetworks.jl's data handling, it's time to implement our first Graph Neural Network! + +For this, we will use on of the most simple GNN operators, the **GCN layer** ([Kipf et al. (2017)](https://arxiv.org/abs/1609.02907)), which is defined as + +```math +\mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \sum_{w \in \mathcal{N}(v) \, \cup \, \{ v \}} \frac{1}{c_{w,v}} \cdot \mathbf{x}_w^{(\ell)} +``` + +where $\mathbf{W}^{(\ell + 1)}$ denotes a trainable weight matrix of shape `[num_output_features, num_input_features]` and $c_{w,v}$ refers to a fixed normalization coefficient for each edge. + +GraphNeuralNetworks.jl implements this layer via `GCNConv`, which can be executed by passing in the node feature representation `x` and the COO graph connectivity representation `edge_index`. + +With this, we are ready to create our first Graph Neural Network by defining our network architecture: + +```@example intro +struct GCN + layers::NamedTuple +end + +Flux.@layer GCN # Provides parameter collection, gpu movement and more + +function GCN(num_features, num_classes) + layers = (conv1 = GCNConv(num_features => 4), + conv2 = GCNConv(4 => 4), + conv3 = GCNConv(4 => 2), + classifier = Dense(2, num_classes)) + return GCN(layers) +end; + +function (gcn::GCN)(g::GNNGraph, x::AbstractMatrix) + l = gcn.layers + x = l.conv1(g, x) + x = tanh.(x) + x = l.conv2(g, x) + x = tanh.(x) + x = l.conv3(g, x) + x = tanh.(x) # Final GNN embedding space. + out = l.classifier(x) # Apply a final (linear) classifier. + return out, x +end; +``` + +Here, we first initialize all of our building blocks in the constructor and define the computation flow of our network in the call method. +We first define and stack **three graph convolution layers**, which corresponds to aggregating 3-hop neighborhood information around each node (all nodes up to 3 "hops" away). +In addition, the `GCNConv` layers reduce the node feature dimensionality to ``2``, *i.e.*, $34 \rightarrow 4 \rightarrow 4 \rightarrow 2$. 
Each `GCNConv` layer is enhanced by a `tanh` non-linearity. + +After that, we apply a single linear transformation (`Flux.Dense` that acts as a classifier to map our nodes to 1 out of the 4 classes/communities. + +We return both the output of the final classifier as well as the final node embeddings produced by our GNN. +We proceed to initialize our final model via `GCN()`, and printing our model produces a summary of all its used sub-modules. + +### Embedding the Karate Club Network + +Let's take a look at the node embeddings produced by our GNN. +Here, we pass in the initial node features `x` and the graph information `g` to the model, and visualize its 2-dimensional embedding. + +```@example intro +num_features = 34 +num_classes = 4 +gcn = GCN(num_features, num_classes) +``` + +```@example intro +_, h = gcn(g, g.ndata.x); +``` + + +```@example intro +function visualize_embeddings(h; colors = nothing) + xs = h[1, :] |> vec + ys = h[2, :] |> vec + Makie.scatter(xs, ys, color = labels, markersize = 20) +end + +visualize_embeddings(h, colors = labels) +``` +Remarkably, even before training the weights of our model, the model produces an embedding of nodes that closely resembles the community-structure of the graph. +Nodes of the same color (community) are already closely clustered together in the embedding space, although the weights of our model are initialized **completely at random** and we have not yet performed any training so far! +This leads to the conclusion that GNNs introduce a strong inductive bias, leading to similar embeddings for nodes that are close to each other in the input graph. + +### Training on the Karate Club Network + +But can we do better? Let's look at an example on how to train our network parameters based on the knowledge of the community assignments of 4 nodes in the graph (one for each community). + +Since everything in our model is differentiable and parameterized, we can add some labels, train the model and observe how the embeddings react. +Here, we make use of a semi-supervised or transductive learning procedure: we simply train against one node per class, but are allowed to make use of the complete input graph data. + +Training our model is very similar to any other Flux model. +In addition to defining our network architecture, we define a loss criterion (here, `logitcrossentropy`), and initialize a stochastic gradient optimizer (here, `Adam`). +After that, we perform multiple rounds of optimization, where each round consists of a forward and backward pass to compute the gradients of our model parameters w.r.t. to the loss derived from the forward pass. +If you are not new to Flux, this scheme should appear familiar to you. + +Note that our semi-supervised learning scenario is achieved by the following line: +```julia +loss = logitcrossentropy(ŷ[:,train_mask], y[:,train_mask]) +``` + +While we compute node embeddings for all of our nodes, we **only make use of the training nodes for computing the loss**. +Here, this is implemented by filtering the output of the classifier `out` and ground-truth labels `data.y` to only contain the nodes in the `train_mask`. 
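+
+As a small sanity check (a sketch, not part of the original tutorial), we can confirm that the mask keeps exactly one labeled node per community:
+
+```@example intro
+# only the 4 labeled nodes (one per community) enter the loss
+size(y[:, train_mask])
+```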
+ +Let us now start training and see how our node embeddings evolve over time (best experienced by explicitly running the code): +```@example intro +model = GCN(num_features, num_classes) +opt = Flux.setup(Adam(1e-2), model) +epochs = 2000 + +emb = h +function report(epoch, loss, h) + @info (; epoch, loss) +end + +report(0, 10.0, emb) +for epoch in 1:epochs + loss, grad = Flux.withgradient(model) do model + ŷ, emb = model(g, g.ndata.x) + logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) + end + + Flux.update!(opt, model, grad[1]) + if epoch % 200 == 0 + report(epoch, loss, emb) + end +end; +``` + +```@example intro +ŷ, emb_final = model(g, g.ndata.x) +``` +Train accuracy: + +```@example intro +mean(onecold(ŷ[:, train_mask]) .== onecold(y[:, train_mask])) +``` +Test accuracy: + +```@example intro +mean(onecold(ŷ[:, .!train_mask]) .== onecold(y[:, .!train_mask])) +``` +Final embedding: + +```@example intro +visualize_embeddings(emb_final, colors = labels) +``` +As one can see, our 3-layer GCN model manages to linearly separating the communities and classifying most of the nodes correctly. + +Furthermore, we did this all with a few lines of code, thanks to the GraphNeuralNetworks.jl which helped us out with data handling and GNN implementations. diff --git a/GraphNeuralNetworks/docs/src/tutorials/graph_classification.md b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/graph_classification.md similarity index 59% rename from GraphNeuralNetworks/docs/src/tutorials/graph_classification.md rename to GraphNeuralNetworks/docs/tutorials/beginner_tutorials/graph_classification.md index f30f71273..b2849d566 100644 --- a/GraphNeuralNetworks/docs/src/tutorials/graph_classification.md +++ b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/graph_classification.md @@ -1,11 +1,9 @@ -```@meta -EditURL = "../../src_tutorials/introductory_tutorials/graph_classification.jl" -``` - -# Graph Classification with Graph Neural Networks +# Supervised Graph Classification *This tutorial is a julia adaptation of the Pytorch Geometric tutorials that can be found [here](https://pytorch-geometric.readthedocs.io/en/latest/notes/colabs.html).* +![cover](assets/cover_graph_classification.png) + In this tutorial session we will have a closer look at how to apply **Graph Neural Networks (GNNs) to the task of graph classification**. Graph classification refers to the problem of classifying entire graphs (in contrast to nodes), given a **dataset of graphs**, based on some structural graph properties and possibly on some input node features. Here, we want to embed entire graphs, and we want to embed those graphs in such a way so that they are linearly separable given a task at hand. @@ -16,7 +14,8 @@ A common graph classification task is **molecular property prediction**, in whic The TU Dortmund University has collected a wide range of different graph classification datasets, known as the [**TUDatasets**](https://chrsmrrs.github.io/datasets/), which are also accessible via MLDatasets.jl. Let's import the necessary packages. 
Then we'll load and inspect one of the smaller ones, the **MUTAG dataset**: -````julia + +```@example classification using Flux, GraphNeuralNetworks using Flux: onecold, onehotbatch, logitcrossentropy, DataLoader using MLDatasets, MLUtils @@ -24,115 +23,65 @@ using LinearAlgebra, Random, Statistics ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation Random.seed!(42); # for reproducibility -```` +``` -````julia +```@example classification dataset = TUDataset("MUTAG") -```` - -```` -dataset TUDataset: - name => MUTAG - metadata => Dict{String, Any} with 1 entry - graphs => 188-element Vector{MLDatasets.Graph} - graph_data => (targets = "188-element Vector{Int64}",) - num_nodes => 3371 - num_edges => 7442 - num_graphs => 188 -```` - -````julia -dataset.graph_data.targets |> union -```` +``` -```` -2-element Vector{Int64}: - 1 - -1 -```` -````julia -g1, y1 = dataset[1] # get the first graph and target -```` +```@example classification +dataset.graph_data.targets |> union +``` -```` -(graphs = Graph(17, 38), targets = 1) -```` +```@example classification +g1, y1 = dataset[1] # get the first graph and target +``` -````julia +```@example classification reduce(vcat, g.node_data.targets for (g, _) in dataset) |> union -```` - -```` -7-element Vector{Int64}: - 0 - 1 - 2 - 3 - 4 - 5 - 6 -```` - -````julia -reduce(vcat, g.edge_data.targets for (g, _) in dataset) |> union -```` - -```` -4-element Vector{Int64}: - 0 - 1 - 2 - 3 -```` +``` +```@example classification +reduce(vcat, g.edge_data.targets for (g, _) in dataset) |> union +``` This dataset provides **188 different graphs**, and the task is to classify each graph into **one out of two classes**. By inspecting the first graph object of the dataset, we can see that it comes with **17 nodes** and **38 edges**. It also comes with exactly **one graph label**, and provides additional node labels (7 classes) and edge labels (4 classes). However, for the sake of simplicity, we will not make use of edge labels. 
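+
+Before moving on, it can be useful to check how the two graph classes are distributed. The snippet below is a small sketch added for illustration (the graph labels are `1` and `-1`):
+
+```@example classification
+# number of graphs per class label
+count(==(1), dataset.graph_data.targets), count(==(-1), dataset.graph_data.targets)
+```
+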
-We now convert the `MLDatasets.jl` graph types to our `GNNGraph`s and we also onehot encode both the node labels (which will be used as input features) and the graph labels (what we want to predict): +We now convert the `MLDatasets.jl` graph types to our `GNNGraph`s and we also onehot encode both the node labels (which will be used as input features) and the graph labels (what we want to predict): -````julia +```@example classification graphs = mldataset2gnngraph(dataset) graphs = [GNNGraph(g, ndata = Float32.(onehotbatch(g.ndata.targets, 0:6)), edata = nothing) for g in graphs] y = onehotbatch(dataset.graph_data.targets, [-1, 1]) -```` - -```` -2×188 OneHotMatrix(::Vector{UInt32}) with eltype Bool: - ⋅ 1 1 ⋅ 1 ⋅ 1 ⋅ 1 ⋅ ⋅ ⋅ ⋅ 1 ⋅ ⋅ 1 ⋅ 1 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ 1 ⋅ 1 ⋅ 1 1 1 ⋅ 1 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ 1 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ 1 ⋅ ⋅ 1 1 ⋅ ⋅ ⋅ 1 ⋅ ⋅ 1 ⋅ ⋅ 1 1 1 ⋅ ⋅ ⋅ ⋅ ⋅ 1 ⋅ ⋅ ⋅ 1 1 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ 1 ⋅ 1 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ 1 1 ⋅ 1 1 ⋅ 1 ⋅ ⋅ 1 1 ⋅ ⋅ 1 1 ⋅ ⋅ ⋅ ⋅ 1 1 1 1 1 ⋅ 1 ⋅ ⋅ 1 1 ⋅ 1 1 1 1 ⋅ ⋅ 1 ⋅ ⋅ 1 ⋅ ⋅ ⋅ 1 1 1 ⋅ ⋅ ⋅ 1 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ 1 ⋅ ⋅ ⋅ ⋅ ⋅ ⋅ 1 ⋅ ⋅ ⋅ 1 ⋅ 1 1 ⋅ ⋅ 1 1 ⋅ 1 - 1 ⋅ ⋅ 1 ⋅ 1 ⋅ 1 ⋅ 1 1 1 1 ⋅ 1 1 ⋅ 1 ⋅ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ⋅ 1 ⋅ 1 ⋅ ⋅ ⋅ 1 ⋅ 1 1 1 1 1 1 1 1 1 1 1 1 ⋅ 1 1 1 1 1 1 ⋅ 1 1 ⋅ ⋅ 1 1 1 ⋅ 1 1 ⋅ 1 1 ⋅ ⋅ ⋅ 1 1 1 1 1 ⋅ 1 1 1 ⋅ ⋅ 1 1 1 1 1 1 1 1 ⋅ 1 ⋅ 1 1 1 1 1 1 1 1 1 ⋅ ⋅ 1 ⋅ ⋅ 1 ⋅ 1 1 ⋅ ⋅ 1 1 ⋅ ⋅ 1 1 1 1 ⋅ ⋅ ⋅ ⋅ ⋅ 1 ⋅ 1 1 ⋅ ⋅ 1 ⋅ ⋅ ⋅ ⋅ 1 1 ⋅ 1 1 ⋅ 1 1 1 ⋅ ⋅ ⋅ 1 1 1 ⋅ 1 1 1 1 1 1 1 ⋅ 1 1 1 1 1 1 ⋅ 1 1 1 ⋅ 1 ⋅ ⋅ 1 1 ⋅ ⋅ 1 ⋅ -```` +``` We have some useful utilities for working with graph datasets, *e.g.*, we can shuffle the dataset and use the first 150 graphs as training graphs, while using the remaining ones for testing: -````julia +```@example classification train_data, test_data = splitobs((graphs, y), at = 150, shuffle = true) |> getobs train_loader = DataLoader(train_data, batchsize = 32, shuffle = true) test_loader = DataLoader(test_data, batchsize = 32, shuffle = false) -```` - -```` -2-element DataLoader(::Tuple{Vector{GNNGraph{Tuple{Vector{Int64}, Vector{Int64}, Nothing}}}, OneHotArrays.OneHotMatrix{UInt32, Vector{UInt32}}}, batchsize=32) - with first element: - (32-element Vector{GNNGraph{Tuple{Vector{Int64}, Vector{Int64}, Nothing}}}, 2×32 OneHotMatrix(::Vector{UInt32}) with eltype Bool,) -```` +``` Here, we opt for a `batch_size` of 32, leading to 5 (randomly shuffled) mini-batches, containing all $4 \cdot 32+22 = 150$ graphs. + ## Mini-batching of graphs Since graphs in graph classification datasets are usually small, a good idea is to **batch the graphs** before inputting them into a Graph Neural Network to guarantee full GPU utilization. In the image or language domain, this procedure is typically achieved by **rescaling** or **padding** each example into a set of equally-sized shapes, and examples are then grouped in an additional dimension. The length of this dimension is then equal to the number of examples grouped in a mini-batch and is typically referred to as the `batchsize`. + However, for GNNs the two approaches described above are either not feasible or may result in a lot of unnecessary memory consumption. Therefore, GraphNeuralNetworks.jl opts for another approach to achieve parallelization across a number of examples. Here, adjacency matrices are stacked in a diagonal fashion (creating a giant graph that holds multiple isolated subgraphs), and node and target features are simply concatenated in the node dimension (the last dimension). 
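+
+To make the diagonal batching concrete, here is a toy sketch with two small random graphs (`ga`, `gb` and `gab` are hypothetical names, unrelated to MUTAG):
+
+```@example classification
+ga = rand_graph(4, 6)   # 4 nodes, 6 edges
+gb = rand_graph(3, 4)   # 3 nodes, 4 edges
+gab = MLUtils.batch([ga, gb])
+(num_nodes = gab.num_nodes, num_edges = gab.num_edges, graph_indicator = gab.graph_indicator)
+```
+
+The batched object behaves like a single graph with 7 nodes and 10 edges, while `graph_indicator` records which original graph each node belongs to.
+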
@@ -144,26 +93,15 @@ This procedure has some crucial advantages over other batching procedures: GraphNeuralNetworks.jl can **batch multiple graphs into a single giant graph**: -````julia + +```@example classification vec_gs, _ = first(train_loader) -```` +``` -```` -(GNNGraph{Tuple{Vector{Int64}, Vector{Int64}, Nothing}}[GNNGraph(11, 22) with x: 7×11 data, GNNGraph(22, 50) with x: 7×22 data, GNNGraph(19, 44) with x: 7×19 data, GNNGraph(22, 50) with x: 7×22 data, GNNGraph(16, 36) with x: 7×16 data, GNNGraph(20, 44) with x: 7×20 data, GNNGraph(19, 42) with x: 7×19 data, GNNGraph(20, 44) with x: 7×20 data, GNNGraph(13, 26) with x: 7×13 data, GNNGraph(19, 40) with x: 7×19 data, GNNGraph(25, 56) with x: 7×25 data, GNNGraph(16, 34) with x: 7×16 data, GNNGraph(28, 66) with x: 7×28 data, GNNGraph(19, 40) with x: 7×19 data, GNNGraph(19, 44) with x: 7×19 data, GNNGraph(17, 36) with x: 7×17 data, GNNGraph(12, 24) with x: 7×12 data, GNNGraph(16, 34) with x: 7×16 data, GNNGraph(27, 66) with x: 7×27 data, GNNGraph(17, 38) with x: 7×17 data, GNNGraph(19, 42) with x: 7×19 data, GNNGraph(17, 36) with x: 7×17 data, GNNGraph(12, 26) with x: 7×12 data, GNNGraph(24, 50) with x: 7×24 data, GNNGraph(20, 46) with x: 7×20 data, GNNGraph(19, 42) with x: 7×19 data, GNNGraph(11, 22) with x: 7×11 data, GNNGraph(16, 34) with x: 7×16 data, GNNGraph(13, 28) with x: 7×13 data, GNNGraph(13, 28) with x: 7×13 data, GNNGraph(13, 28) with x: 7×13 data, GNNGraph(16, 36) with x: 7×16 data], Bool[1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0; 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 1]) -```` -````julia +```@example classification MLUtils.batch(vec_gs) -```` - -```` -GNNGraph: - num_nodes: 570 - num_edges: 1254 - num_graphs: 32 - ndata: - x = 7×570 Matrix{Float32} -```` +``` Each batched graph object is equipped with a **`graph_indicator` vector**, which maps each node to its respective graph in the batch: @@ -171,6 +109,7 @@ Each batched graph object is equipped with a **`graph_indicator` vector**, which \textrm{graph\_indicator} = [1, \ldots, 1, 2, \ldots, 2, 3, \ldots ] ``` + ## Training a Graph Neural Network (GNN) Training a GNN for graph classification usually follows a simple recipe: @@ -189,7 +128,7 @@ GraphNeuralNetworks.jl provides this functionality via `GlobalPool(mean)`, which The final architecture for applying GNNs to the task of graph classification then looks as follows and allows for complete end-to-end training: -````julia +```@example classification function create_model(nin, nh, nout) GNNChain(GCNConv(nin => nh, relu), GCNConv(nh => nh, relu), @@ -198,13 +137,15 @@ function create_model(nin, nh, nout) Dropout(0.5), Dense(nh, nout)) end; -```` +``` Here, we again make use of the `GCNConv` with $\mathrm{ReLU}(x) = \max(x, 0)$ activation for obtaining localized node embeddings, before we apply our final classifier on top of a graph readout layer. 
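+
+As a quick illustration of the readout step (a sketch that pools the raw node features rather than learned embeddings; `gbatch` is a new name introduced only here), `GlobalPool(mean)` maps the batched node features to one vector per graph:
+
+```@example classification
+gbatch = MLUtils.batch(vec_gs)
+# one pooled 7-dimensional feature vector per graph in the batch
+size(GlobalPool(mean)(gbatch, gbatch.ndata.x))
+```
+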
Let's train our network for a few epochs to see how well it performs on the training as well as test set: -````julia + + +```@example classification function eval_loss_accuracy(model, data_loader, device) loss = 0.0 acc = 0.0 @@ -223,7 +164,7 @@ end function train!(model; epochs = 200, η = 1e-3, infotime = 10) - # device = Flux.gpu # uncomment this for GPU training + ## device = Flux.gpu # uncomment this for GPU training device = Flux.cpu model = model |> device opt = Flux.setup(Adam(η), model) @@ -254,32 +195,8 @@ nh = 64 nout = 2 model = create_model(nin, nh, nout) train!(model) -```` - -```` -[ Info: (epoch = 0, train = (loss = 0.6975, acc = 50.0), test = (loss = 0.6958, acc = 51.32)) -[ Info: (epoch = 10, train = (loss = 0.6002, acc = 67.33), test = (loss = 0.6181, acc = 63.16)) -[ Info: (epoch = 20, train = (loss = 0.534, acc = 78.67), test = (loss = 0.5339, acc = 68.42)) -[ Info: (epoch = 30, train = (loss = 0.5056, acc = 75.33), test = (loss = 0.4999, acc = 68.42)) -[ Info: (epoch = 40, train = (loss = 0.4989, acc = 74.33), test = (loss = 0.5041, acc = 68.42)) -[ Info: (epoch = 50, train = (loss = 0.4927, acc = 74.67), test = (loss = 0.4985, acc = 72.37)) -[ Info: (epoch = 60, train = (loss = 0.4908, acc = 74.67), test = (loss = 0.4989, acc = 75.0)) -[ Info: (epoch = 70, train = (loss = 0.4876, acc = 75.67), test = (loss = 0.4982, acc = 75.0)) -[ Info: (epoch = 80, train = (loss = 0.4855, acc = 75.67), test = (loss = 0.4984, acc = 75.0)) -[ Info: (epoch = 90, train = (loss = 0.4835, acc = 74.67), test = (loss = 0.497, acc = 76.32)) -[ Info: (epoch = 100, train = (loss = 0.4869, acc = 75.33), test = (loss = 0.5127, acc = 67.11)) -[ Info: (epoch = 110, train = (loss = 0.4805, acc = 75.33), test = (loss = 0.4944, acc = 76.32)) -[ Info: (epoch = 120, train = (loss = 0.4782, acc = 75.33), test = (loss = 0.4971, acc = 76.32)) -[ Info: (epoch = 130, train = (loss = 0.4793, acc = 75.33), test = (loss = 0.5029, acc = 73.68)) -[ Info: (epoch = 140, train = (loss = 0.4747, acc = 76.67), test = (loss = 0.4923, acc = 75.0)) -[ Info: (epoch = 150, train = (loss = 0.4813, acc = 76.33), test = (loss = 0.5151, acc = 71.05)) -[ Info: (epoch = 160, train = (loss = 0.472, acc = 76.0), test = (loss = 0.4968, acc = 75.0)) -[ Info: (epoch = 170, train = (loss = 0.4712, acc = 75.33), test = (loss = 0.4991, acc = 73.68)) -[ Info: (epoch = 180, train = (loss = 0.4711, acc = 75.0), test = (loss = 0.4994, acc = 73.68)) -[ Info: (epoch = 190, train = (loss = 0.4672, acc = 75.33), test = (loss = 0.4956, acc = 73.68)) -[ Info: (epoch = 200, train = (loss = 0.4662, acc = 77.67), test = (loss = 0.4934, acc = 76.32)) - -```` +``` + As one can see, our model reaches around **75% test accuracy**. Reasons for the fluctuations in accuracy can be explained by the rather small dataset (only 38 test graphs), and usually disappear once one applies GNNs to larger datasets. @@ -303,8 +220,3 @@ This should bring you close to **82% test accuracy**. In this chapter, you have learned how to apply GNNs to the task of graph classification. You have learned how graphs can be batched together for better GPU utilization, and how to apply readout layers for obtaining graph embeddings rather than node embeddings. 
- ---- - -*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* - diff --git a/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/node_classification.md b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/node_classification.md new file mode 100644 index 000000000..0e288786c --- /dev/null +++ b/GraphNeuralNetworks/docs/tutorials/beginner_tutorials/node_classification.md @@ -0,0 +1,301 @@ +# Semi-Supervised Node Classification + +![](assets/cover_node_classification.svg) + +In this tutorial, we will be learning how to use Graph Neural Networks (GNNs) for node classification. Given the ground-truth labels of only a small subset of nodes, and want to infer the labels for all the remaining nodes (transductive learning). + +## Import +Let us start off by importing some libraries. We will be using `Flux.jl` and `GraphNeuralNetworks.jl` for our tutorial. + +```@example node_classification +using Flux, GraphNeuralNetworks +using Flux: onecold, onehotbatch, logitcrossentropy +using MLDatasets +using Plots, TSne +using Statistics, Random + +ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation +Random.seed!(17); # for reproducibility +``` + +## Visualize +We want to visualize our results using t-distributed stochastic neighbor embedding (tsne) to project our output onto a 2D plane. + +```@example node_classification + +function visualize_tsne(out, targets) + z = tsne(out, 2) + scatter(z[:, 1], z[:, 2], color = Int.(targets[1:size(z, 1)]), leg = false) +end; +``` + +## Dataset: Cora + +For our tutorial, we will be using the `Cora` dataset. `Cora` is a citation network of 2708 documents categorized into seven classes with 5,429 citation links. Each node represents an article or document, and edges between nodes indicate a citation relationship, where one cites the other. + +Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words. + +This dataset was first introduced by [Yang et al. (2016)](https://arxiv.org/abs/1603.08861) as one of the datasets of the `Planetoid` benchmark suite. We will be using [MLDatasets.jl](https://juliaml.github.io/MLDatasets.jl/stable/) for an easy access to this dataset. + +```@example node_classification +dataset = Cora() +``` + +Datasets in MLDatasets.jl have `metadata` containing information about the dataset itself. +```@example node_classification +dataset.metadata +``` + +The `graphs` variable contains the graph. The `Cora` dataset contains only 1 graph. + + +```@example node_classification +dataset.graphs +``` + +There is only one graph of the dataset. The `node_data` contains `features` indicating if certain words are present or not and `targets` indicating the class for each document. We convert the single-graph dataset to a `GNNGraph`. 
+ +```@example node_classification +g = mldataset2gnngraph(dataset) + +println("Number of nodes: $(g.num_nodes)") +println("Number of edges: $(g.num_edges)") +println("Average node degree: $(g.num_edges / g.num_nodes)") +println("Number of training nodes: $(sum(g.ndata.train_mask))") +println("Training node label rate: $(mean(g.ndata.train_mask))") +println("Has isolated nodes: $(has_isolated_nodes(g))") +println("Has self-loops: $(has_self_loops(g))") +println("Is undirected: $(is_bidirected(g))") +``` + +Overall, this dataset is quite similar to the previously used [`KarateClub`](https://juliaml.github.io/MLDatasets.jl/stable/datasets/graphs/#MLDatasets.KarateClub) network. +We can see that the `Cora` network holds 2,708 nodes and 10,556 edges, resulting in an average node degree of 3.9. +For training this dataset, we are given the ground-truth categories of 140 nodes (20 for each class). +This results in a training node label rate of only 5%. + +We can further see that this network is undirected, and that there exists no isolated nodes (each document has at least one citation). + +```@example node_classification +x = g.ndata.features # we onehot encode both the node labels (what we want to predict): +y = onehotbatch(g.ndata.targets, 1:7) +train_mask = g.ndata.train_mask +num_features = size(x)[1] +hidden_channels = 16 +num_classes = dataset.metadata["num_classes"]; +``` + +## Multi-layer Perception Network (MLP) + +In theory, we should be able to infer the category of a document solely based on its content, *i.e.* its bag-of-words feature representation, without taking any relational information into account. + +Let's verify that by constructing a simple MLP that solely operates on input node features (using shared weights across all nodes): + +```@example node_classification +struct MLP + layers::NamedTuple +end + +Flux.@layer :expand MLP + +function MLP(num_features, num_classes, hidden_channels; drop_rate = 0.5) + layers = (hidden = Dense(num_features => hidden_channels), + drop = Dropout(drop_rate), + classifier = Dense(hidden_channels => num_classes)) + return MLP(layers) +end; + +function (model::MLP)(x::AbstractMatrix) + l = model.layers + x = l.hidden(x) + x = relu(x) + x = l.drop(x) + x = l.classifier(x) + return x +end; +``` + +### Training a Multilayer Perceptron + +Our MLP is defined by two linear layers and enhanced by [ReLU](https://fluxml.ai/Flux.jl/stable/models/nnlib/#NNlib.relu) non-linearity and [Dropout](https://fluxml.ai/Flux.jl/stable/models/layers/#Flux.Dropout). +Here, we first reduce the 1433-dimensional feature vector to a low-dimensional embedding (`hidden_channels=16`), while the second linear layer acts as a classifier that should map each low-dimensional node embedding to one of the 7 classes. + +Let's train our simple MLP by following a similar procedure as described in [the first part of this tutorial](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/tutorials/gnn_intro/). +We again make use of the **cross entropy loss** and **Adam optimizer**. +This time, we also define a **`accuracy` function** to evaluate how well our final model performs on the test node set (which labels have not been observed during training). 
+ +```@example node_classification +function train(model::MLP, data::AbstractMatrix, epochs::Int, opt) + Flux.trainmode!(model) + + for epoch in 1:epochs + loss, grad = Flux.withgradient(model) do model + ŷ = model(data) + logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) + end + + Flux.update!(opt, model, grad[1]) + if epoch % 200 == 0 + @show epoch, loss + end + end +end; + +function accuracy(model::MLP, x::AbstractMatrix, y::Flux.OneHotArray, mask::BitVector) + Flux.testmode!(model) + mean(onecold(model(x))[mask] .== onecold(y)[mask]) +end; + +mlp = MLP(num_features, num_classes, hidden_channels) +opt_mlp = Flux.setup(Adam(1e-3), mlp) +epochs = 2000 +train(mlp, g.ndata.features, epochs, opt_mlp) +``` + +After training the model, we can call the `accuracy` function to see how well our model performs on unseen labels. +Here, we are interested in the accuracy of the model, *i.e.*, the ratio of correctly classified nodes: + +```@example node_classification +accuracy(mlp, g.ndata.features, y, .!train_mask) +``` + +As one can see, our MLP performs rather bad with only about ~50% test accuracy. +But why does the MLP do not perform better? +The main reason for that is that this model suffers from heavy overfitting due to only having access to a **small amount of training nodes**, and therefore generalizes poorly to unseen node representations. + +It also fails to incorporate an important bias into the model: **Cited papers are very likely related to the category of a document**. +That is exactly where Graph Neural Networks come into play and can help to boost the performance of our model. + + + +## Training a Graph Convolutional Neural Network (GNN) + +Following-up on the first part of this tutorial, we replace the `Dense` linear layers by the [`GCNConv`](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/api/conv/#GraphNeuralNetworks.GCNConv) module. +To recap, the **GCN layer** ([Kipf et al. (2017)](https://arxiv.org/abs/1609.02907)) is defined as + +```math +\mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \sum_{w \in \mathcal{N}(v) \, \cup \, \{ v \}} \frac{1}{c_{w,v}} \cdot \mathbf{x}_w^{(\ell)} +``` + +where $\mathbf{W}^{(\ell + 1)}$ denotes a trainable weight matrix of shape `[num_output_features, num_input_features]` and $c_{w,v}$ refers to a fixed normalization coefficient for each edge. +In contrast, a single `Linear` layer is defined as + +```math +\mathbf{x}_v^{(\ell + 1)} = \mathbf{W}^{(\ell + 1)} \mathbf{x}_v^{(\ell)} +``` + +which does not make use of neighboring node information. + +```@example node_classification +struct GCN + layers::NamedTuple +end + +Flux.@layer GCN # provides parameter collection, gpu movement and more + +function GCN(num_features, num_classes, hidden_channels; drop_rate = 0.5) + layers = (conv1 = GCNConv(num_features => hidden_channels), + drop = Dropout(drop_rate), + conv2 = GCNConv(hidden_channels => num_classes)) + return GCN(layers) +end; + +function (gcn::GCN)(g::GNNGraph, x::AbstractMatrix) + l = gcn.layers + x = l.conv1(g, x) + x = relu.(x) + x = l.drop(x) + x = l.conv2(g, x) + return x +end; +``` + +Now let's visualize the node embeddings of our **untrained** GCN network. + +```@example node_classification +gcn = GCN(num_features, num_classes, hidden_channels) +h_untrained = gcn(g, x) |> transpose +visualize_tsne(h_untrained, g.ndata.targets) +``` + +We certainly can do better by training our model. 
+The training and testing procedure is once again the same, but this time we make use of the node features `x` **and** the graph `g` as input to our GCN model. + +```@example node_classification +function train(model::GCN, g::GNNGraph, x::AbstractMatrix, epochs::Int, opt) + Flux.trainmode!(model) + + for epoch in 1:epochs + loss, grad = Flux.withgradient(model) do model + ŷ = model(g, x) + logitcrossentropy(ŷ[:, train_mask], y[:, train_mask]) + end + + Flux.update!(opt, model, grad[1]) + if epoch % 200 == 0 + @show epoch, loss + end + end +end; +``` + +```@example node_classification +mlp = MLP(num_features, num_classes, hidden_channels) +opt_mlp = Flux.setup(Adam(1e-3), mlp) +epochs = 2000 +train(mlp, g.ndata.features, epochs, opt_mlp) +``` + +```@example node_classification +function accuracy(model::GCN, g::GNNGraph, x::AbstractMatrix, y::Flux.OneHotArray, + mask::BitVector) + Flux.testmode!(model) + mean(onecold(model(g, x))[mask] .== onecold(y)[mask]) +end +``` + +```@example node_classification +accuracy(mlp, g.ndata.features, y, .!train_mask) +``` + + +```@example node_classification +opt_gcn = Flux.setup(Adam(1e-2), gcn) +train(gcn, g, x, epochs, opt_gcn) +``` + +Now let's evaluate the loss of our trained GCN. + +```@example node_classification +train_accuracy = accuracy(gcn, g, g.ndata.features, y, train_mask) +test_accuracy = accuracy(gcn, g, g.ndata.features, y, .!train_mask) + +println("Train accuracy: $(train_accuracy)") +println("Test accuracy: $(test_accuracy)") +``` + +**There it is!** +By simply swapping the linear layers with GNN layers, we can reach **76% of test accuracy**! +This is in stark contrast to the 59% of test accuracy obtained by our MLP, indicating that relational information plays a crucial role in obtaining better performance. + +We can also verify that once again by looking at the output embeddings of our trained model, which now produces a far better clustering of nodes of the same category. + + +```@example node_classification +Flux.testmode!(gcn) # inference mode + +out_trained = gcn(g, x) |> transpose +visualize_tsne(out_trained, g.ndata.targets) +``` + +## (Optional) Exercises + +1. To achieve better model performance and to avoid overfitting, it is usually a good idea to select the best model based on an additional validation set. The `Cora` dataset provides a validation node set as `g.ndata.val_mask`, but we haven't used it yet. Can you modify the code to select and test the model with the highest validation performance? This should bring test performance to **82% accuracy**. + +2. How does `GCN` behave when increasing the hidden feature dimensionality or the number of layers? Does increasing the number of layers help at all? + +3. You can try to use different GNN layers to see how model performance changes. What happens if you swap out all `GCNConv` instances with [`GATConv`](https://juliagraphs.org/GraphNeuralNetworks.jl/docs/GraphNeuralNetworks.jl/stable/api/conv/#GraphNeuralNetworks.GATConv) layers that make use of attention? Try to write a 2-layer `GAT` model that makes use of 8 attention heads in the first layer and 1 attention head in the second layer, uses a `dropout` ratio of `0.6` inside and outside each `GATConv` call, and uses a `hidden_channels` dimensions of `8` per head. + + + +## Conclusion +In this tutorial, we have seen how to apply GNNs to real-world problems, and, in particular, how they can effectively be used for boosting a model's performance. 
+## Conclusion
+
+In this tutorial, we have seen how to apply GNNs to real-world problems and, in particular, how they can effectively be used to boost a model's performance.
+In the next tutorial, we will look into how GNNs can be used for the task of graph classification.
diff --git a/GraphNeuralNetworks/docs/src_tutorials/config.json b/GraphNeuralNetworks/docs/tutorials/config.json
similarity index 100%
rename from GraphNeuralNetworks/docs/src_tutorials/config.json
rename to GraphNeuralNetworks/docs/tutorials/config.json
diff --git a/GraphNeuralNetworks/docs/src_tutorials/index.md b/GraphNeuralNetworks/docs/tutorials/index.md
similarity index 70%
rename from GraphNeuralNetworks/docs/src_tutorials/index.md
rename to GraphNeuralNetworks/docs/tutorials/index.md
index e0a02c6e6..801331d8d 100644
--- a/GraphNeuralNetworks/docs/src_tutorials/index.md
+++ b/GraphNeuralNetworks/docs/tutorials/index.md
@@ -14,6 +14,3 @@
 Users are invited to contribute demonstrations of their own.
 If you want to contribute new tutorials and looking for inspiration,
 checkout these tutorials from [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/notes/colabs.html).
-You are expected to use [Pluto.jl](https://github.com/fonsp/Pluto.jl) notebooks
-with [DemoCards.jl](https://github.com/JuliaDocs/DemoCards.jl).
-Please check out existing tutorials for more details.
diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/brain_gnn.gif b/GraphNeuralNetworks/docs/tutorials/intermediate_tutorials/assets/cover_brain_gnn.gif
similarity index 100%
rename from GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/brain_gnn.gif
rename to GraphNeuralNetworks/docs/tutorials/intermediate_tutorials/assets/cover_brain_gnn.gif
diff --git a/GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/traffic.gif b/GraphNeuralNetworks/docs/tutorials/intermediate_tutorials/assets/cover_traffic.gif
similarity index 100%
rename from GraphNeuralNetworks/docs/src_tutorials/introductory_tutorials/assets/traffic.gif
rename to GraphNeuralNetworks/docs/tutorials/intermediate_tutorials/assets/cover_traffic.gif
diff --git a/GraphNeuralNetworks/docs/tutorials/intermediate_tutorials/traffic_prediction.md b/GraphNeuralNetworks/docs/tutorials/intermediate_tutorials/traffic_prediction.md
new file mode 100644
index 000000000..3b3ffd38f
--- /dev/null
+++ b/GraphNeuralNetworks/docs/tutorials/intermediate_tutorials/traffic_prediction.md
@@ -0,0 +1,162 @@
+# Traffic Prediction using Recurrent Temporal GNN
+
+![](assets/cover_traffic.gif)
+
+In this tutorial, we will learn how to use a recurrent Temporal Graph Convolutional Network (TGCN) to predict traffic in a spatio-temporal setting. Traffic forecasting is the problem of predicting future traffic trends on a road network given historical traffic data, such as, in our case, traffic speed and time of day.
+
+## Import
+
+We start by importing the necessary libraries. We use `GraphNeuralNetworks.jl`, `Flux.jl` and `MLDatasets.jl`, among others.
+
+```@example traffic
+using Flux, GraphNeuralNetworks
+using Flux.Losses: mae
+using MLDatasets: METRLA
+using Statistics, Plots, Random
+
+ENV["DATADEPS_ALWAYS_ACCEPT"] = "true" # don't ask for dataset download confirmation
+Random.seed!(42); # for reproducibility
+```
+
+## Dataset: METR-LA
+
+We use the `METR-LA` dataset from the paper [Diffusion Convolutional Recurrent Neural Network: Data-driven Traffic Forecasting](https://arxiv.org/pdf/1707.01926.pdf), which contains traffic speed data collected by loop detectors on the highways of Los Angeles County from March 1, 2012 to June 30, 2012.
+The data is collected every 5 minutes, resulting in 12 observations per hour, from 207 sensors.
+Each sensor is a node in the graph, and the edge weights are the distances between the sensor locations.
+
+```@example traffic
+dataset_metrla = METRLA(; num_timesteps = 3)
+```
+```@example traffic
+g = dataset_metrla[1]
+```
+
+`edge_data` contains the weights of the edges of the graph, and
+`node_data` contains a vector of node feature arrays and a vector of target arrays.
+Each element of these vectors is a batch spanning `num_timesteps` time steps, i.e., it holds the node features and targets of `num_timesteps` consecutive time steps. Two consecutive batches are shifted by one time step.
+The node features are the traffic speed of the sensors and the time of the day, and the targets are the traffic speed of the sensors in the next time step.
+Let's see some examples:
+
+```@example traffic
+features = map(x -> permutedims(x, (1, 3, 2)), g.node_data.features)
+
+size(features[1])
+```
+
+The first dimension corresponds to the two features (the first row is the speed value and the second row the time of day), the second to the number of time steps `num_timesteps`, and the third to the nodes.
+
+```@example traffic
+targets = map(x -> permutedims(x, (1, 3, 2)), g.node_data.targets)
+
+size(targets[1])
+```
+
+In the case of the targets, the first dimension is 1 because they store just the speed value.
+
+```@example traffic
+features[1][:, :, 1]
+```
+```@example traffic
+features[2][:, :, 1]
+```
+```@example traffic
+targets[1][:, :, 1]
+```
+```@example traffic
+function plot_data(data, sensor)
+    p = plot(legend = false, xlabel = "Time (h)", ylabel = "Normalized speed")
+    plotdata = []
+    for i in 1:3:length(data)
+        push!(plotdata, data[i][1, :, sensor])
+    end
+    plotdata = reduce(vcat, plotdata)
+    plot!(p, collect(1:length(data)), plotdata, color = :green,
+          xticks = ([i for i in 0:50:250], ["$(i)" for i in 0:4:20]))
+    return p
+end
+
+plot_data(features[1:288], 1) # plot the speed of the first sensor for the first day
+```
+
+Now let's construct the static graph, the `train_loader`, and the `test_loader`.
+
+```@example traffic
+graph = GNNGraph(g.edge_index; edata = g.edge_data, num_nodes = g.num_nodes);
+
+train_loader = zip(features[1:288], targets[1:288]); # train on 24 hours
+test_loader = zip(features[289:577], targets[289:577]); # test on the next 24 hours
+```
+
+## Model: T-GCN
+
+We use the T-GCN model from the paper [T-GCN: A Temporal Graph Convolutional Network for Traffic Prediction](https://arxiv.org/pdf/1811.05320.pdf), which consists of a graph convolutional network (GCN) and a gated recurrent unit (GRU). The GCN is used to capture spatial features from the graph, and the GRU is used to capture temporal features from the feature time series.
+
+```@example traffic
+model = GNNChain(TGCN(2 => 100; add_self_loops = false), Dense(100, 1))
+```
+
+Let's look at the output of the model for the first batch of the training data.
+
+```@example traffic
+model(graph, features[1])
+```
+
+The output of the model is a tensor of size `(1, 3, 207)`, which corresponds to the dimension of the feature (in this case the speed), the number of time steps, and the number of nodes in the graph, respectively. The model outputs the predicted traffic speed for each sensor at each time step.
+
+![](https://www.researchgate.net/profile/Haifeng-Li-3/publication/335353434/figure/fig4/AS:851870352437249@1580113127759/The-architecture-of-the-Gated-Recurrent-Unit-model.jpg)
+
+## Training
+
+We train the model for 100 epochs, using the Adam optimizer with a learning rate of 0.001, and use the mean absolute error (MAE) as the loss function.
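+For reference, for a prediction $\hat{y}$ and a target $y$ with $N$ entries, the MAE is defined as
+
+```math
+\mathrm{MAE}(\hat{y}, y) = \frac{1}{N} \sum_{i=1}^{N} \left| \hat{y}_i - y_i \right|.
+```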
+
+```@example traffic
+function train(graph, train_loader, model)
+    opt = Flux.setup(Adam(0.001), model)
+
+    for epoch in 1:100
+        for (x, y) in train_loader
+            grads = Flux.gradient(model) do model
+                ŷ = model(graph, x)
+                Flux.mae(ŷ, y)
+            end
+            Flux.update!(opt, model, grads[1])
+        end
+
+        if epoch % 10 == 0
+            loss = mean([Flux.mae(model(graph, x), y) for (x, y) in train_loader])
+            @show epoch, loss
+        end
+    end
+    return model
+end
+
+train(graph, train_loader, model)
+```
+
+```@example traffic
+function plot_predicted_data(graph, features, targets, sensor)
+    p = plot(xlabel = "Time (h)", ylabel = "Normalized speed")
+    prediction = []
+    ground_truth = []
+    for i in 1:3:length(features)
+        push!(ground_truth, targets[i][1, :, sensor])
+        push!(prediction, model(graph, features[i])[1, :, sensor])
+    end
+    prediction = reduce(vcat, prediction)
+    ground_truth = reduce(vcat, ground_truth)
+    plot!(p, collect(1:length(prediction)), prediction, color = :red, label = "Prediction")
+    plot!(p, collect(1:length(ground_truth)), ground_truth, color = :blue, label = "Ground Truth",
+          xticks = ([i for i in 0:50:250], ["$(i)" for i in 0:4:20]))
+    return p
+end
+
+plot_predicted_data(graph, features[289:577], targets[289:577], 1)
+```
+
+We also define a simple score, one minus the relative error in the Euclidean norm, to quantify how close the predictions are to the ground truth.
+
+```@example traffic
+accuracy(ŷ, y) = 1 - sqrt(sum(abs2, y .- ŷ)) / sqrt(sum(abs2, y))
+```
+
+Test accuracy:
+
+```@example traffic
+mean([accuracy(model(graph, x), y) for (x, y) in test_loader])
+```
+
+The accuracy is not very good, but it can be improved by training on more data. We used a small subset of the dataset for this tutorial because of the computational cost of training the model. From the plot of the predictions, we can see that the model is able to capture the general trend of the traffic speed, but it is not able to capture the peaks of the traffic.
+
+## Conclusion
+
+In this tutorial, we learned how to use a recurrent temporal graph convolutional network to predict traffic in a spatio-temporal setting. We used the TGCN model, which consists of a graph convolutional network (GCN) and a gated recurrent unit (GRU). We then trained the model for 100 epochs on a small subset of the METR-LA dataset. The accuracy of the model is not very good, but it can be improved by training on more data.
diff --git a/GraphNeuralNetworks/src/layers/temporalconv.jl b/GraphNeuralNetworks/src/layers/temporalconv.jl
index 5516d8015..8db7bbc38 100644
--- a/GraphNeuralNetworks/src/layers/temporalconv.jl
+++ b/GraphNeuralNetworks/src/layers/temporalconv.jl
@@ -174,7 +174,7 @@ where `h` is the updated hidden state of the GRU cell.
 ```jldoctest
 julia> using GraphNeuralNetworks, Flux
 
-julia> num_nodes, num_edges = 5, 10;
+julia> num_nodes, num_edges = 5, 20;
 
 julia> d_in, d_out = 2, 3;