|
"""
    NeighborLoader(graph; num_neighbors, input_nodes, num_layers, [batch_size])

A data structure for sampling neighbors from a graph for training Graph Neural Networks (GNNs).
It supports multi-layer sampling of neighbors for a batch of input nodes, which is useful for
mini-batch training as originally introduced in the
["Inductive Representation Learning on Large Graphs"](https://arxiv.org/abs/1706.02216) paper.

Iterating over a `NeighborLoader` yields one mini-batch subgraph per batch of input nodes.

# Fields
- `graph::GNNGraph`: The input graph.
- `num_neighbors::Vector{Int}`: A vector specifying the number of neighbors to sample per node
  at each GNN layer (indexed by layer).
- `input_nodes::Vector{Int}`: A vector containing the starting nodes for neighbor sampling.
- `num_layers::Int`: The number of layers for neighborhood expansion (how far to sample neighbors).
- `batch_size::Union{Int, Nothing}`: The size of the batch. If not specified, it defaults to the
  number of `input_nodes`.
- `neighbors_cache::Dict{Int, Vector{Int}}`: Neighbor lists keyed by node id, filled lazily
  while sampling so that the neighbors of a node are computed only once.

# Examples

```julia
julia> loader = NeighborLoader(graph; num_neighbors=[10, 5], input_nodes=[1, 2, 3], num_layers=2)

julia> batch_counter = 0

julia> for mini_batch_gnn in loader
           batch_counter += 1
           println("Batch ", batch_counter, ": Nodes in mini-batch graph: ", nv(mini_batch_gnn))
       end
```
"""
struct NeighborLoader
    graph::GNNGraph                          # The input GNNGraph (graph + features)
    num_neighbors::Vector{Int}               # Number of neighbors to sample per node, for each layer
    input_nodes::Vector{Int}                 # Starting nodes for sampling
    num_layers::Int                          # Number of layers for neighborhood expansion
    batch_size::Union{Int, Nothing}          # Optional batch size; the keyword constructor fills it in
    neighbors_cache::Dict{Int, Vector{Int}}  # Mutable cache of neighbor lists (node id => neighbors)
end
| 36 | + |
# Keyword constructor for `NeighborLoader`.
#
# - `num_neighbors`: number of neighbors to sample per node, one entry per layer.
# - `input_nodes`: seed nodes; when omitted (`nothing`), every node `1:graph.num_nodes` is used.
# - `num_layers`: number of sampling layers; `num_neighbors` must have at least this many entries.
# - `batch_size`: seeds per mini-batch; when omitted, all seed nodes form a single batch.
#
# Throws `ArgumentError` when `num_neighbors` has fewer entries than `num_layers`, or when an
# explicit `batch_size` is not positive (iteration would otherwise never advance).
function NeighborLoader(graph::GNNGraph; num_neighbors::Vector{Int},
                        input_nodes::Union{Vector{Int}, Nothing}=nothing,
                        num_layers::Int, batch_size::Union{Int, Nothing}=nothing)
    if length(num_neighbors) < num_layers
        throw(ArgumentError(
            "num_neighbors must have at least num_layers=$(num_layers) entries, " *
            "got $(length(num_neighbors))"))
    end
    if batch_size !== nothing && batch_size < 1
        throw(ArgumentError("batch_size must be positive, got $(batch_size)"))
    end

    # Resolve the seed nodes first so the batch-size default below is computed from the
    # actual node list and never from `nothing`.
    nodes = input_nodes === nothing ? collect(1:graph.num_nodes) : input_nodes
    resolved_batch_size = batch_size === nothing ? length(nodes) : batch_size

    return NeighborLoader(graph, num_neighbors, nodes, num_layers, resolved_batch_size,
                          Dict{Int, Vector{Int}}())
end
| 42 | + |
"""
    get_neighbors(loader::NeighborLoader, node::Int)

Return the neighbors of `node` in `loader.graph`, queried with `dir = :in`.

The result is memoized in `loader.neighbors_cache`: the graph is queried only the first time
a node is requested, and later calls return the cached vector.
"""
function get_neighbors(loader::NeighborLoader, node::Int)
    cache = loader.neighbors_cache

    # Cache hit: reuse the previously computed neighbor list.
    haskey(cache, node) && return cache[node]

    # Cache miss: query the graph once and remember the answer.
    nbrs = Graphs.neighbors(loader.graph, node, dir = :in)
    cache[node] = nbrs
    return nbrs
end
| 53 | + |
"""
    sample_nbrs(loader::NeighborLoader, node::Int, layer::Int)

Randomly sample neighbors of `node` for the given `layer`.

At most `loader.num_neighbors[layer]` neighbors are returned, and never more than the node
actually has. Sampling is done *without* replacement, so the returned vector contains no
position of the neighbor list more than once. Returns an empty `Int[]` when the node has no
neighbors.
"""
function sample_nbrs(loader::NeighborLoader, node::Int, layer::Int)
    neighbors = get_neighbors(loader, node)
    isempty(neighbors) && return Int[]

    # Never ask for more samples than there are neighbors.
    num_samples = min(loader.num_neighbors[layer], length(neighbors))

    # Work on a private copy: `neighbors` is the vector stored in the loader's cache and
    # must not be permuted in place.
    pool = collect(neighbors)
    n = length(pool)

    # Partial Fisher-Yates shuffle: after step `i`, `pool[1:i]` is a uniformly random
    # selection of `i` distinct entries.
    for i in 1:num_samples
        j = rand(i:n)
        pool[i], pool[j] = pool[j], pool[i]
    end

    return pool[1:num_samples]
end
| 64 | + |
"""
    Base.iterate(loader::NeighborLoader, state=1)

Iterator protocol for `NeighborLoader` with lazy batch construction.

`state` is the index into `loader.input_nodes` of the first seed node of the next batch.
Each step takes up to `batch_size` seed nodes, expands every seed for `loader.num_layers`
layers with `sample_nbrs`, and returns the subgraph induced on all collected nodes (as
produced by `Graphs.induced_subgraph`) together with the next state. Returns `nothing` once
all input nodes have been consumed.

Throws `ArgumentError` if the loader's batch size is not positive, since the iteration could
never advance.
"""
function Base.iterate(loader::NeighborLoader, state=1)
    num_inputs = length(loader.input_nodes)
    state > num_inputs && return nothing  # all input nodes consumed

    # The field type allows `nothing`; treat it as "one batch with every input node".
    requested = something(loader.batch_size, num_inputs)
    requested >= 1 ||
        throw(ArgumentError("batch_size must be positive, got $(requested)"))

    # The last batch may be smaller when the input nodes do not divide evenly.
    batch_size = min(requested, num_inputs - state + 1)
    batch_nodes = loader.input_nodes[state:(state + batch_size - 1)]

    # All nodes that end up in the mini-batch subgraph. It starts with the seeds, so it is
    # never empty here (the batch holds at least one node).
    subgraph_nodes = Set(batch_nodes)

    for node in batch_nodes
        # Nodes reached at the current depth, starting with the seed itself.
        frontier = Set([node])

        for layer in 1:loader.num_layers
            next_frontier = Set{Int}()
            for n in frontier
                # In-place union avoids allocating a new set per visited node.
                union!(next_frontier, sample_nbrs(loader, n, layer))
            end
            frontier = next_frontier
            union!(subgraph_nodes, frontier)  # grow the subgraph with the new neighbors
        end
    end

    subgraph_node_list = collect(subgraph_nodes)
    mini_batch_gnn = Graphs.induced_subgraph(loader.graph, subgraph_node_list)

    # Hand back the batch and the position of the next unprocessed seed node.
    return mini_batch_gnn, state + batch_size
end
| 106 | + |
| 107 | + |
1 | 108 | """
|
2 | 109 | sample_neighbors(g, nodes, K=-1; dir=:in, replace=false, dropnodes=false)
|
3 | 110 |
|
|
0 commit comments