Merge pull request #70 from CarloLucibello/cl/negative

CarloLucibello · web-flow · commit d5c7a8093f7b · 2021-11-05T10:43:43.000+01:00
improvements to link prediction
diff --git a/docs/make.jl b/docs/make.jl
@@ -13,6 +13,7 @@ makedocs(;
              "Graphs" => "gnngraph.md",
              "Message Passing" => "messagepassing.md",
              "Model Building" => "models.md",
+             "Datasets" => "datasets.md",
              "API Reference" =>
                [
                 "GNNGraph" => "api/gnngraph.md",
diff --git a/docs/src/datasets.md b/docs/src/datasets.md
@@ -0,0 +1,4 @@
+# Datasets
+
+GNN.jl doesn't come with its own datasets, but leverages those available in the Julia (and non-Julia) ecosystem. In particular, the [examples in the GNN.jl repository](https://github.com/CarloLucibello/GraphNeuralNetworks.jl/tree/master/examples) make use of the [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) package. There you will find common graph datasets such as Cora, PubMed, and Citeseer.
+MLDatasets also gives access to the [TUDataset](https://chrsmrrs.github.io/datasets/docs/datasets/) repository and its numerous datasets.
diff --git a/examples/link_prediction_pubmed.jl b/examples/link_prediction_pubmed.jl
@@ -49,23 +49,20 @@ function train(; kws...)
     ### LOAD DATA
     data = Cora.dataset()
     # data = PubMed.dataset()
-    g = GNNGraph(data.adjacency_list) |> device
+    g = GNNGraph(data.adjacency_list)
+    @info g
     @show is_bidirected(g)
+    @show has_self_loops(g)
+    @show has_multi_edges(g)
+    @show mean(degree(g))
+    isbidir = is_bidirected(g)  
+
+    g = g |> device
     X = data.node_features |> device
     
     #### SPLIT INTO NEGATIVE AND POSITIVE SAMPLES
-    s, t = edge_index(g)
-    eids = randperm(g.num_edges)
-    test_size = round(Int, g.num_edges * 0.1)
-    
-    test_pos_s, test_pos_t = s[eids[1:test_size]], t[eids[1:test_size]]
-    test_pos_g = GNNGraph(test_pos_s, test_pos_t, num_nodes=g.num_nodes)
-    
-    train_pos_s, train_pos_t = s[eids[test_size+1:end]], t[eids[test_size+1:end]]
-    train_pos_g = GNNGraph(train_pos_s, train_pos_t, num_nodes=g.num_nodes)
-
-    test_neg_g = negative_sample(g, num_neg_edges=test_size)
-    
+    train_pos_g, test_pos_g = rand_edge_split(g, 0.9)
+    test_neg_g = negative_sample(g, num_neg_edges=test_pos_g.num_edges, bidirected=isbidir)
 
     ### DEFINE MODEL #########
     nin, nhidden = size(X,1), args.nhidden
@@ -82,24 +79,30 @@ function train(; kws...)
 
     ### LOSS FUNCTION ############
 
-    function loss(pos_g, neg_g = nothing)
+    function loss(pos_g, neg_g = nothing; with_accuracy=false)
         h = model(X)
         if neg_g === nothing
             # We sample a negative graph at each training step
-            neg_g = negative_sample(pos_g)
+            neg_g = negative_sample(pos_g, bidirected=isbidir)
         end
         pos_score = pred(pos_g, h)
         neg_score = pred(neg_g, h)
         scores = [pos_score; neg_score]
         labels = [fill!(similar(pos_score), 1); fill!(similar(neg_score), 0)]
-        return logitbinarycrossentropy(scores, labels)
+        l = logitbinarycrossentropy(scores, labels)
+        if with_accuracy
+            acc = 0.5 * mean(pos_score .>= 0) + 0.5 * mean(neg_score .< 0)
+            return l, acc
+        else
+            return l
+        end
     end
     
     ### LOGGING FUNCTION
     function report(epoch)
-        train_loss = loss(train_pos_g)
-        test_loss = loss(test_pos_g, test_neg_g)
-        println("Epoch: $epoch   Train: $(train_loss)   Test: $(test_loss)")
+        train_loss, train_acc = loss(train_pos_g, with_accuracy=true)
+        test_loss, test_acc = loss(test_pos_g, test_neg_g, with_accuracy=true)
+        println("Epoch: $epoch  $((; train_loss, train_acc))  $((; test_loss, test_acc))")
     end
     
     ### TRAINING
diff --git a/src/GNNGraphs/GNNGraphs.jl b/src/GNNGraphs/GNNGraphs.jl
@@ -22,26 +22,29 @@ export GNNGraph,
        graph_features
     
 include("query.jl")
-export edge_index, 
-       adjacency_list, 
-       normalized_laplacian, 
-       scaled_laplacian,
+export adjacency_list,
+       edge_index, 
        graph_indicator, 
+       has_multi_edges, 
        is_bidirected,
+       normalized_laplacian, 
+       scaled_laplacian,
        # from Graphs
        adjacency_matrix, 
        degree, 
-       outneighbors, 
-       inneighbors
+       has_self_loops,
+       inneighbors,
+       outneighbors 
 
 include("transform.jl")
 export add_nodes, 
        add_edges, 
-       add_self_loops, 
-       remove_self_loops, 
-       remove_multi_edges,
+       add_self_loops,
        getgraph,
        negative_sample,
+       rand_edge_split,
+       remove_self_loops, 
+       remove_multi_edges,
        # from Flux
        batch, 
        unbatch,
@@ -51,6 +54,9 @@ export add_nodes,
 include("generate.jl")
 export rand_graph
 
+include("operators.jl")
+# Base.intersect
+
 include("convert.jl")
 include("utils.jl")
 
diff --git a/src/GNNGraphs/convert.jl b/src/GNNGraphs/convert.jl
@@ -5,9 +5,10 @@ function to_coo(coo::COO_T; dir=:out, num_nodes=nothing)
     num_nodes = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes 
     @assert isnothing(val) || length(val) == length(s)
     @assert length(s) == length(t)
-    @assert min(minimum(s), minimum(t)) >= 1 
-    @assert max(maximum(s), maximum(t)) <= num_nodes 
-
+    if !isempty(s)
+        @assert min(minimum(s), minimum(t)) >= 1 
+        @assert max(maximum(s), maximum(t)) <= num_nodes 
+    end
     num_edges = length(s)
     return coo, num_nodes, num_edges
 end
diff --git a/src/GNNGraphs/gnngraph.jl b/src/GNNGraphs/gnngraph.jl
@@ -150,6 +150,11 @@ function GNNGraph(data;
             ndata, edata, gdata)
 end
 
+# Edgeless graph with `n` nodes: empty source/target vectors whose eltype matches `n`.
+function GNNGraph(n::T; graph_type=:coo, kws...) where {T<:Integer}
+    return GNNGraph(T[], T[]; graph_type, num_nodes=n, kws...)
+end
+
 # COO convenience constructors
 GNNGraph(s::AbstractVector, t::AbstractVector, v = nothing; kws...) = GNNGraph((s, t, v); kws...)
 GNNGraph((s, t)::NTuple{2}; kws...) = GNNGraph((s, t, nothing); kws...)
diff --git a/src/GNNGraphs/operators.jl b/src/GNNGraphs/operators.jl
@@ -0,0 +1,13 @@
+# 2 or more args graph operators
+function Base.intersect(g1::GNNGraph, g2::GNNGraph)
+    # Only graphs with equal node counts and equal graph type can be intersected.
+    @assert g1.num_nodes == g2.num_nodes
+    @assert graph_type_symbol(g1) == graph_type_symbol(g2)
+    n, gtype = g1.num_nodes, graph_type_symbol(g1)
+
+    # One integer id per edge, so intersecting edge sets is intersecting id vectors.
+    ids1 = first(edge_encoding(edge_index(g1)..., n))
+    ids2 = first(edge_encoding(edge_index(g2)..., n))
+    s, t = edge_decoding(intersect(ids1, ids2), n)
+    return GNNGraph(s, t; num_nodes=n, graph_type=gtype)
+end
diff --git a/src/GNNGraphs/query.jl b/src/GNNGraphs/query.jl
@@ -28,6 +28,10 @@ end
 
 Graphs.has_edge(g::GNNGraph{<:ADJMAT_T}, i::Integer, j::Integer) = g.graph[i,j] != 0
 
+graph_type_symbol(::GNNGraph{<:COO_T}) = :coo  # result is usable as the `graph_type` keyword
+graph_type_symbol(::GNNGraph{<:SPARSE_T}) = :sparse
+graph_type_symbol(::GNNGraph{<:ADJMAT_T}) = :dense
+
 Graphs.nv(g::GNNGraph) = g.num_nodes
 Graphs.ne(g::GNNGraph) = g.num_edges
 Graphs.has_vertex(g::GNNGraph, i::Int) = 1 <= i <= g.num_nodes
@@ -243,10 +247,35 @@ function is_bidirected(g::GNNGraph)
     all((s1 .== s2) .& (t1 .== t2))
 end
 
-@non_differentiable normalized_laplacian(x...)
-@non_differentiable normalized_adjacency(x...)
-@non_differentiable scaled_laplacian(x...)
-@non_differentiable adjacency_matrix(x...)
+"""
+    has_self_loops(g::GNNGraph)
+
+Return `true` if `g` has any self loops.
+"""
+function Graphs.has_self_loops(g::GNNGraph)
+    src, dst = edge_index(g)
+    return any(src .== dst)  # a self loop is an edge whose two endpoints coincide
+end
+
+"""
+    has_multi_edges(g::GNNGraph)
+
+Return `true` if `g` has any multiple edges.
+"""
+function has_multi_edges(g::GNNGraph)
+    s, t = edge_index(g)
+    idxs, _ = edge_encoding(s, t, g.num_nodes)  # returns (ids, maxid): keep only the edge ids
+    return length(union(idxs)) < length(idxs)   # fewer distinct ids than edges => repeated edge
+end
+
+
 @non_differentiable adjacency_list(x...)
+@non_differentiable adjacency_matrix(x...)
 @non_differentiable degree(x...)
 @non_differentiable graph_indicator(x...)
+@non_differentiable has_multi_edges(x...)
+@non_differentiable Graphs.has_self_loops(x...) 
+@non_differentiable is_bidirected(x...)
+@non_differentiable normalized_adjacency(x...)
+@non_differentiable normalized_laplacian(x...)
+@non_differentiable scaled_laplacian(x...)
diff --git a/src/GNNGraphs/transform.jl b/src/GNNGraphs/transform.jl
@@ -321,13 +321,21 @@ function getgraph(g::GNNGraph, i::AbstractVector{Int}; nmap=false)
 end
 
 """
-    negative_sample(g::GNNGraph; num_neg_edges=g.num_edges)
+    negative_sample(g::GNNGraph; 
+                    num_neg_edges = g.num_edges, 
+                    bidirected = is_bidirected(g))
 
 Return a graph containing random negative edges (i.e. non-edges) from graph `g` as edges.
+
+If `bidirected=true`, the output graph will be bidirected and there will be no
+leakage from the original graph. 
+
+See also [`is_bidirected`](@ref).
 """
 function negative_sample(g::GNNGraph; 
         max_trials=3, 
-        num_neg_edges=g.num_edges)
+        num_neg_edges=g.num_edges, 
+        bidirected = is_bidirected(g))
 
     @assert g.num_graphs == 1
     # Consider self-loops as positive edges
@@ -344,8 +352,12 @@ function negative_sample(g::GNNGraph;
         device = Flux.cpu
     end
     idx_pos, maxid = edge_encoding(s, t, n)
-    
-    pneg = 1 - g.num_edges / maxid # prob of selecting negative edge 
+    if bidirected
+        num_neg_edges = num_neg_edges ÷ 2
+        pneg = 1 - g.num_edges / 2maxid # prob of selecting negative edge 
+    else 
+        pneg = 1 - g.num_edges / 2maxid # prob of selecting negative edge 
+    end    
     # pneg * sample_prob * maxid == num_neg_edges  
     sample_prob = min(1, num_neg_edges / (pneg * maxid) * 1.1)
     idx_neg = Int[]
@@ -359,26 +371,44 @@ function negative_sample(g::GNNGraph;
         end
     end
     s_neg, t_neg = edge_decoding(idx_neg, n)
+    if bidirected
+        s_neg, t_neg = [s_neg; t_neg], [t_neg; s_neg] 
+    end
     return GNNGraph(s_neg, t_neg, num_nodes=n) |> device
 end
 
-# each edge is represented by a number in
-# 1:N^2
-function edge_encoding(s, t, n)
-    idx = (s .- 1) .* n .+ t
-    maxid = n^2 
-    return idx, maxid
-end
+"""
+    rand_edge_split(g::GNNGraph, frac) -> g1, g2
+
+Randomly partition the edges in `g` to form two graphs, `g1`
+and `g2`. Both will have the same number of nodes as `g`.
+`g1` will contain a fraction `frac` of the original edges, 
+while `g2` will contain the rest.
+Useful for train/test splits in link prediction tasks.
+"""
+function rand_edge_split(g::GNNGraph, frac)
+    # TODO add bidirected version
+    s, t = edge_index(g)
+    eids = randperm(g.num_edges)
+    size1 = round(Int, g.num_edges * frac)
+    
+    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+    g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)
+
+    s, t = edge_index(g)
+    eids = randperm(g.num_edges)
+    size1 = round(Int, g.num_edges * frac)
+    
+    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+    g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)
 
-# each edge is represented by a number in
-# 1:N^2
-function edge_decoding(idx, n)
-    # g = remove_self_loops(g)
-    s =  (idx .- 1) .÷ n .+ 1
-    t =  (idx .- 1) .% n .+ 1
-    return s, t
+    s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
+    g2 = GNNGraph(s2, t2, num_nodes=g.num_nodes)
+
+    return g1, g2
 end
 
+
 # """
 # Transform vector of cartesian indexes into a tuple of vectors containing integers.
 # """
diff --git a/src/GNNGraphs/utils.jl b/src/GNNGraphs/utils.jl
@@ -70,3 +70,59 @@ end
 ones_like(x::AbstractArray, T=eltype(x), sz=size(x)) = fill!(similar(x, T, sz), 1)
 ones_like(x::SparseMatrixCSC, T=eltype(x), sz=size(x)) = ones(T, sz)
 ones_like(x::CUMAT_T, T=eltype(x), sz=size(x)) = CUDA.ones(T, sz)
+
+
+# Map each edge (s, t) between nodes in 1:n to an integer id and return `(idx, maxid)`:
+# ids lie in 1:n^2 when `directed=true`, in 1:n*(n+1)÷2 (edge direction ignored) otherwise.
+function edge_encoding(s, t, n; directed=true)
+    if directed
+        # directed edges and self-loops allowed
+        idx = (s .- 1) .* n .+ t
+        maxid = n^2
+    else 
+        # Undirected edges and self-loops allowed
+        maxid = n * (n + 1) ÷ 2
+        # Swap endpoints where needed so that s <= t, working on copies of the inputs.
+        mask = s .> t
+        snew = copy(s)
+        tnew = copy(t)
+        snew[mask] .= t[mask]
+        tnew[mask] .= s[mask]
+        s, t = snew, tnew
+        # Writing i = s, j = t: count the pairs (i', j'), i' <= j', up to and including (i, j).
+        # idx = ∑_{i',i'<i} ∑_{j',j'>=i'}^n 1 + ∑_{j',i<=j'<=j} 1 
+        #     = ∑_{i',i'<i} ∑_{j',j'>=i'}^n 1 + (j - i + 1)
+        #     = ∑_{i',i'<i} (n - i' + 1) + (j - i + 1)
+        #     = (i - 1)*(2*(n+1)-i)÷2 + (j - i + 1)
+        idx = @. (s-1)*(2*(n+1)-s)÷2 + (t-s+1)
+    end
+    return idx, maxid
+end
+
+# Inverse of `edge_encoding`: recover the endpoints (s, t) from edge ids `idx`, which lie in
+# 1:n^2 when `directed=true` and in 1:n*(n+1)÷2 (giving s <= t) when `directed=false`.
+function edge_decoding(idx, n; directed=true)
+    if directed
+        # idx - 1 == (s - 1)*n + (t - 1), so quotient and remainder by n give s and t
+        s =  (idx .- 1) .÷ n .+ 1
+        t =  (idx .- 1) .% n .+ 1
+    else
+        # Setting j=n in the encoding formula
+        # idx = (i - 1)*(2*(n+1)-i)÷2 + (j - i + 1) 
+        # gives the largest id whose smaller endpoint is i:
+        # idx = (i - 1)*(2*(n+1)-i)÷2 + (n - i + 1) = i*(2*n+1-i)÷2
+        # (setting j=i instead gives the smallest such id,
+        # idx = (i - 1)*(2*(n+1)-i)÷2 + 1).
+        # Hence s is the smallest i with i*(2*n+1-i)/2 >= idx. Solving this
+        # quadratic inequality for i (smaller root, rounded up to an integer)
+        # and inverting we have
+        s = @. ceil(Int, -sqrt((n + 1/2)^2 - 2*idx) + n + 1/2)
+        t = @. idx - (s-1)*(2*(n+1)-s)÷2 - 1 + s
+        # t follows by solving the encoding formula for j with i = s
+    end
+    return s, t
+end
+
+@non_differentiable edge_encoding(x...)
+@non_differentiable edge_decoding(x...)
+
diff --git a/test/GNNGraphs/operators.jl b/test/GNNGraphs/operators.jl
@@ -0,0 +1,6 @@
+@testset "Operators" begin
+    @testset "intersect" begin
+        g = rand_graph(10, 20, graph_type=GRAPH_T)  # GRAPH_T is defined outside this file (presumably by the test runner; confirm)
+        @test intersect(g, g).num_edges == 20       # self-intersection should return 20 edges; assumes `rand_graph` yields 20 distinct edges
+    end
+end
diff --git a/test/GNNGraphs/transform.jl b/test/GNNGraphs/transform.jl
diff --git a/test/GNNGraphs/utils.jl b/test/GNNGraphs/utils.jl
diff --git a/test/runtests.jl b/test/runtests.jl

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ makedocs(;`
`13`	`13`	`"Graphs" => "gnngraph.md",`
`14`	`14`	`"Message Passing" => "messagepassing.md",`
`15`	`15`	`"Model Building" => "models.md",`
	`16`	`+ "Datasets" => "datasets.md",`
`16`	`17`	`"API Reference" =>`
`17`	`18`	`[`
`18`	`19`	`"GNNGraph" => "api/gnngraph.md",`