working implementation

CarloLucibello · CarloLucibello · commit ece027cf1e75 · 2021-09-05T16:09:49.000+02:00
diff --git a/README.md b/README.md
@@ -27,12 +27,10 @@ Find below a list of available datasets and links to their documentation.
   - [MNIST](https://juliaml.github.io/MLDatasets.jl/dev/datasets/MNIST/)
   - [SVHN2](https://juliaml.github.io/MLDatasets.jl/dev/datasets/SVHN2/)
 
-
 #### Miscellaneous
   - [BostonHousing](https://juliaml.github.io/MLDatasets.jl/dev/datasets/BostonHousing/)
   - [Iris](https://juliaml.github.io/MLDatasets.jl/dev/datasets/Iris/)
 
-
 #### Text
   - [PTBLM](https://juliaml.github.io/MLDatasets.jl/dev/datasets/PTBLM/)
   - [UD_English](https://juliaml.github.io/MLDatasets.jl/dev/datasets/UD_English/)
@@ -41,7 +39,7 @@ Find below a list of available datasets and links to their documentation.
   - [CiteSeer](https://juliaml.github.io/MLDatasets.jl/dev/datasets/CiteSeer/)
   - [Cora](https://juliaml.github.io/MLDatasets.jl/dev/datasets/Cora/)
   - [PubMed](https://juliaml.github.io/MLDatasets.jl/dev/datasets/PubMed/)
-
+  - [TUDataset](https://juliaml.github.io/MLDatasets.jl/dev/datasets/TUDataset/)
 
 
 ## Installation
diff --git a/docs/make.jl b/docs/make.jl
@@ -44,6 +44,7 @@ makedocs(
                 "CiteSeer" => "datasets/CiteSeer.md",
                 "Cora" => "datasets/Cora.md",
                 "PubMed" => "datasets/PubMed.md",
+                "TUDataset" => "datasets/TUDataset.md",      
             ],
 
         ],
diff --git a/docs/src/datasets/TUDataset.md b/docs/src/datasets/TUDataset.md
@@ -0,0 +1,11 @@
+# TUDataset
+
+```@docs
+TUDataset
+```
+
+## API reference
+
+```@docs
+TUDataset.dataset
+```
diff --git a/src/MLDatasets.jl b/src/MLDatasets.jl
@@ -72,6 +72,8 @@ function __init__()
         out = out.todense() if hasattr(out, 'todense') else out
         return out
     """
+
+    __init__tudataset()
 end
 
 end
diff --git a/src/TUDataset/TUDataset.jl b/src/TUDataset/TUDataset.jl
@@ -1,63 +1,84 @@
 export TUDataset
 
-"""
-    TUDataset
-
-A variety of graph kernel benchmark datasets, *.e.g.* "IMDB-BINARY",
-"REDDIT-BINARY" or "PROTEINS", collected from the [TU Dortmund University](https://chrsmrrs.github.io/datasets).
-"""
-module TUDataset
-
 using DataDeps
-using ..MLDatasets: datafile, datadir
+# using ..MLDatasets: datafile, datadir
 using DelimitedFiles: readdlm
 
-using PyCall
-
-const DEPNAME = "TUDataset"
-# LINK = "https://github.com/shchur/gnn-benchmark/raw/master/data/npz"
-# LINK = "https://github.com/abojchevski/graph2gauss/raw/master/data/"
-const LINK = "https://www.chrsmrrs.com/graphkerneldatasets"
-const DOCS = "https://chrsmrrs.github.io/datasets"
-const DATA = "PROTEINS.zip"
+function __init__tudataset()  # registers the "TUDataset" DataDep; called from MLDatasets.__init__
+    DEPNAME = "TUDataset"
+    LINK = "https://www.chrsmrrs.com/graphkerneldatasets"
+    DOCS = ""  # NOTE(review): no longer referenced below
+    DATA = "PROTEINS.zip"
 
-function __init__()
     register(DataDep(
         DEPNAME,
         """
         Dataset: The $DEPNAME dataset.
-        Website: $DOCS
+        Website: $LINK
         """,
         "$LINK/$DATA",
         # "81de017067dc045ebdb8ffd5c0e69a209973ffdb1fe2d5b434e94d3614f3f5c7",  # if checksum omitted, will be generated by DataDeps
         post_fetch_method = unpack
     ))
 end
 
-struct TUData
+struct TUDataset  # container for one TU graph collection, or a subset of it (see getindex)
+    num_nodes::Int  # number of nodes; the loader sets it to length(node_labels)
+    num_edges::Int  # number of edges; the loader sets it to the row count of the `_A.txt` table
+    num_graphs::Int  # number of graphs; the loader sets it to length(graph_labels)
     source::Vector{Int}
     target::Vector{Int}
-    graph_indicator::Vector{Int}
+    graph_indicator  # graph id of each node (field left untyped)
     node_labels::Vector{Int}
     edge_labels::Union{Nothing, Vector{Int}}
-    graph_labels::Vector{Int}
+    graph_labels  # left untyped: getindex stores `data.graph_labels[i]` here as-is
     node_attributes
     edge_attributes
     graph_attributes
 end
 
 """
+    TUDataset
+
+A variety of graph kernel benchmark datasets, *e.g.* "IMDB-BINARY",
+"REDDIT-BINARY" or "PROTEINS", collected from the [TU Dortmund University](https://chrsmrrs.github.io/datasets).
+
-    dataset(name; dir=nothing)
+    TUDataset(name; dir=nothing)
 
-Retrieve the TUDataset dataset. The output is a named tuple with fields
+Retrieve the dataset `name` from the TUDataset collection. The output is a `TUDataset` object with fields
+
+```
+num_nodes
+num_edges
+num_graphs
+source              # source node of each edge
+target              # target node of each edge
+graph_indicator     # graph index of each node
+node_labels
+edge_labels
+graph_labels
+node_attributes
+edge_attributes
+graph_attributes
+```
 
 See [this link](https://chrsmrrs.github.io/datasets/docs/datasets/)
 for a list of the available datasets.
 """
-function dataset(name; dir=nothing) 
-    d = datadir(DEPNAME, dir)
+function TUDataset(name; dir=nothing)
+    d = datadir("TUDataset", dir)
     # See here for the file format https://chrsmrrs.github.io/datasets/docs/format/
     st = readdlm(joinpath(d, name, "$(name)_A.txt"), ',', Int)
+   
+    # Check that the first node is labeled 1.
+    # TODO this will fail if the first node is isolated
+    @assert minimum(st) == 1
+
+    graph_indicator = readdlm(joinpath(d, name, "$(name)_graph_indicator.txt"), Int) |> vec      
+    @assert all(sort(unique(graph_indicator)) .== 1:length(unique(graph_indicator)))
+
+    node_labels = readdlm(joinpath(d, name, "$(name)_node_labels.txt"), Int) |> vec
+    graph_labels = readdlm(joinpath(d, name, "$(name)_graph_labels.txt"), Int) |> vec
 
     # LOAD OPTIONAL FILES IF EXIST
     
@@ -82,16 +103,49 @@ function dataset(name; dir=nothing)
         graph_attributes = nothing
     end
 
-    TUData(st[:,1], st[:,2], 
-            readdlm(joinpath(d, name, "$(name)_graph_indicator.txt"), Int) |> vec, 
-            readdlm(joinpath(d, name, "$(name)_node_labels.txt"), Int) |> vec,
+
+    TUDataset( length(node_labels), size(st, 1), length(graph_labels),
+                st[:,1], st[:,2], 
+                graph_indicator,
+                node_labels,
+                edge_labels,            
+                graph_labels,
+                node_attributes, 
+                edge_attributes,
+                graph_attributes)
+end
+
+
+function Base.getindex(data::TUDataset, i)  # sub-dataset holding only the graph(s) selected by i
+    node_mask = data.graph_indicator .∈ Ref(i)
+    graph_indicator = data.graph_indicator[node_mask]
+    # Renumber the kept nodes 1..n: nodemap maps an old node id to its new position.
+    nodes = (1:data.num_nodes)[node_mask]
+    node_labels = data.node_labels[node_mask]
+    nodemap = Dict(v => i for (i, v) in enumerate(nodes))
+    # Keep edges whose source is a kept node; the Dict lookup avoids scanning `nodes` per edge.
+    edge_mask = haskey.(Ref(nodemap), data.source)
+    source = [nodemap[i] for i in data.source[edge_mask]]
+    target = [nodemap[i] for i in data.target[edge_mask]]
+    edge_labels = isnothing(data.edge_labels) ? nothing : data.edge_labels[edge_mask]
+    # graph_labels keeps whatever indexing with i yields (e.g. a scalar for an integer i).
+    graph_labels = data.graph_labels[i]
+    # Optional attribute matrices are sliced by column: one column per node / edge / graph.
+    node_attributes = isnothing(data.node_attributes) ? nothing : data.node_attributes[:,node_mask]
+    edge_attributes = isnothing(data.edge_attributes) ? nothing : data.edge_attributes[:,edge_mask]
+    graph_attributes = isnothing(data.graph_attributes) ? nothing : data.graph_attributes[:,i]
+    # Sanity checks on the result types before building the subset.
+
+    @assert source isa Vector
+    @assert target isa Vector
+    @assert node_labels isa Vector
+    TUDataset(length(nodes), length(source), length(graph_labels),
+            source, target,
+            graph_indicator,
+            node_labels,
             edge_labels,            
-            readdlm(joinpath(d, name, "$(name)_graph_labels.txt"), Int) |> vec,
+            graph_labels,
             node_attributes, 
             edge_attributes,
             graph_attributes)
 end
-
-
-end #module 
-
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -7,17 +7,21 @@ using DataDeps
 ENV["DATADEPS_ALWAYS_ACCEPT"] = true
 
 tests = [
+    # misc
     "tst_iris.jl",
     "tst_boston_housing.jl",
+    # vision
     "tst_cifar10.jl",
     "tst_cifar100.jl",
     "tst_mnist.jl",
     "tst_fashion_mnist.jl",
     "tst_svhn2.jl",
     "tst_emnist.jl",
-    "tst_cora.jl",
+    # graphs    
     "tst_citeseer.jl",
+    "tst_cora.jl",
     "tst_pubmed.jl",
+    "tst_tudataset.jl",
 ]
 
 for t in tests
diff --git a/test/tst_tudataset.jl b/test/tst_tudataset.jl
@@ -0,0 +1,25 @@
+data_dir = withenv("DATADEPS_ALWAYS_ACCEPT"=>"true") do  # accept the download without prompting
+    datadep"TUDataset"
+end
+# Check the loaded PROTEINS collection against its expected sizes.
+@testset "TUDataset - PROTEINS" begin
+    data  = TUDataset("PROTEINS")
+    # Overall sizes.
+    @test data.num_nodes == 43471
+    @test data.num_edges == 162088
+    @test data.num_graphs === 1113
+    # One source/target entry per edge.
+    @test length(data.source) == data.num_edges
+    @test length(data.target) == data.num_edges
+    # Expected: node attributes only, no edge or graph attributes.
+    @test size(data.node_attributes) == (1, data.num_nodes)
+    @test data.edge_attributes === nothing
+    @test data.graph_attributes === nothing
+    # Expected: node and graph labels present, no edge labels.
+    @test size(data.node_labels) == (data.num_nodes,)
+    @test data.edge_labels === nothing
+    @test size(data.graph_labels) == (data.num_graphs,)
+    # Every node is assigned to a graph, and the graph ids are exactly 1:num_graphs.
+    @test length(data.graph_indicator) == data.num_nodes
+    @test all(sort(unique(data.graph_indicator)) .== 1:data.num_graphs)
+end