1
1
export TUDataset
2
2
3
- """
4
- TUDataset
5
-
6
- A variety of graph kernel benchmark datasets, *.e.g.* "IMDB-BINARY",
7
- "REDDIT-BINARY" or "PROTEINS", collected from the [TU Dortmund University](https://chrsmrrs.github.io/datasets).
8
- """
9
- module TUDataset
10
-
11
3
using DataDeps
12
- using .. MLDatasets: datafile, datadir
4
+ # using ..MLDatasets: datafile, datadir
13
5
using DelimitedFiles: readdlm
14
6
15
- using PyCall
16
-
17
- const DEPNAME = " TUDataset"
18
- # LINK = "https://github.com/shchur/gnn-benchmark/raw/master/data/npz"
19
- # LINK = "https://github.com/abojchevski/graph2gauss/raw/master/data/"
20
- const LINK = " https://www.chrsmrrs.com/graphkerneldatasets"
21
- const DOCS = " https://chrsmrrs.github.io/datasets"
22
- const DATA = " PROTEINS.zip"
7
# Register the "TUDataset" `DataDep` with DataDeps.
#
# Only the `PROTEINS.zip` archive under `LINK` is registered here. No checksum
# is passed to `DataDep` (the hash below is commented out), and the fetched
# archive is handed to `unpack` via `post_fetch_method`.
function __init__tudataset()
    DEPNAME = "TUDataset"
    LINK = "https://www.chrsmrrs.com/graphkerneldatasets"
    DATA = "PROTEINS.zip"

    register(DataDep(
        DEPNAME,
        # Message associated with the data dependency.
        """
        Dataset: The $DEPNAME dataset.
        Website: $LINK
        """,
        "$LINK/$DATA",
        # "81de017067dc045ebdb8ffd5c0e69a209973ffdb1fe2d5b434e94d3614f3f5c7", # if checksum omitted, will be generated by DataDeps
        post_fetch_method = unpack
    ))
end
36
24
37
- struct TUData
25
# In-memory representation of a TU graph dataset (or of a subset of its graphs).
# All graphs are stored together: edges are given as parallel `source`/`target`
# vectors and `graph_indicator` assigns nodes to graphs.
struct TUDataset
    # Sizes.
    num_nodes::Int
    num_edges::Int
    num_graphs::Int

    # Edge list: edge `k` goes from `source[k]` to `target[k]`.
    source::Vector{Int}
    target::Vector{Int}

    # Graph membership of the nodes. Left untyped.
    graph_indicator

    # Labels. `edge_labels` is `nothing` when not available.
    node_labels::Vector{Int}
    edge_labels::Union{Nothing, Vector{Int}}
    # Left untyped; NOTE(review): presumably so that a single label (scalar
    # index into the label vector) can be stored as well — confirm.
    graph_labels

    # Optional attributes, untyped.
    node_attributes
    edge_attributes
    graph_attributes
end
48
39
49
40
"""
41
+ TUDataset
42
+
43
+ A variety of graph kernel benchmark datasets, *e.g.* "IMDB-BINARY",
44
+ "REDDIT-BINARY" or "PROTEINS", collected from the [TU Dortmund University](https://chrsmrrs.github.io/datasets).
45
+
50
46
TUDataset(name; dir=nothing)
51
47
52
- Retrieve the TUDataset dataset. The output is a named tuple with fields
48
+ Retrieve the TUDataset dataset. The output is an object with fields
49
+
50
+ ```
51
+ num_nodes
52
+ num_edges
53
+ num_graphs
54
+ source # source node of each edge
55
+ target # target node of each edge
56
+ graph_indicator # index of the graph each node belongs to
57
+ node_labels
58
+ edge_labels
59
+ graph_labels
60
+ node_attributes
61
+ edge_attributes
62
+ graph_attributes
63
+ ```
53
64
54
65
See [this link](https://chrsmrrs.github.io/datasets/docs/datasets/)
55
66
for a list of the available datasets.
56
67
"""
57
- function dataset (name; dir= nothing )
58
- d = datadir (DEPNAME , dir)
68
+ function TUDataset (name; dir= nothing )
69
+ d = datadir (" TUDataset " , dir)
59
70
# See here for the file format https://chrsmrrs.github.io/datasets/docs/format/
60
71
st = readdlm (joinpath (d, name, " $(name) _A.txt" ), ' ,' , Int)
72
+
73
+ # Check that the first node is labeled 1.
74
+ # TODO this will fail if the first node is isolated
75
+ @assert minimum (st) == 1
76
+
77
+ graph_indicator = readdlm (joinpath (d, name, " $(name) _graph_indicator.txt" ), Int) |> vec
78
+ @assert all (sort (unique (graph_indicator)) .== 1 : length (unique (graph_indicator)))
79
+
80
+ node_labels = readdlm (joinpath (d, name, " $(name) _node_labels.txt" ), Int) |> vec
81
+ graph_labels = readdlm (joinpath (d, name, " $(name) _graph_labels.txt" ), Int) |> vec
61
82
62
83
# LOAD OPTIONAL FILES IF EXIST
63
84
@@ -82,16 +103,49 @@ function dataset(name; dir=nothing)
82
103
graph_attributes = nothing
83
104
end
84
105
85
- TUData (st[:,1 ], st[:,2 ],
86
- readdlm (joinpath (d, name, " $(name) _graph_indicator.txt" ), Int) |> vec,
87
- readdlm (joinpath (d, name, " $(name) _node_labels.txt" ), Int) |> vec,
106
+
107
+ TUDataset ( length (node_labels), size (st, 1 ), length (graph_labels),
108
+ st[:,1 ], st[:,2 ],
109
+ graph_indicator,
110
+ node_labels,
111
+ edge_labels,
112
+ graph_labels,
113
+ node_attributes,
114
+ edge_attributes,
115
+ graph_attributes)
116
+ end
117
+
118
+
119
"""
    getindex(data::TUDataset, i)

Return a new `TUDataset` restricted to the graph(s) selected by `i`.

`i` is matched against `data.graph_indicator` with `∈`, so it can be a single
graph index or a collection of graph indices. The nodes of the selected graphs
are renumbered consecutively starting from 1 (keeping their original order),
and `source`/`target` are rewritten with the new numbering. Edges are selected
by their source node.

Node and edge labels/attributes are subset with the corresponding masks, while
`graph_labels` and `graph_attributes` are indexed directly with `i`. Optional
fields that are `nothing` stay `nothing`. The entries of `graph_indicator`
keep their original graph ids.
"""
function Base.getindex(data::TUDataset, i)
    # Nodes that belong to one of the requested graphs.
    node_mask = data.graph_indicator .∈ Ref(i)
    graph_indicator = data.graph_indicator[node_mask]

    nodes = (1:data.num_nodes)[node_mask]
    node_labels = data.node_labels[node_mask]
    # Old node id => new consecutive id. Distinct variable names avoid
    # shadowing the `i` argument inside the generator.
    nodemap = Dict(old => new for (new, old) in enumerate(nodes))

    # An edge is kept when its source node is kept. Looking the node up in
    # `nodemap` avoids scanning `nodes` linearly for every edge.
    edge_mask = haskey.(Ref(nodemap), data.source)
    source = [nodemap[s] for s in data.source[edge_mask]]
    target = [nodemap[t] for t in data.target[edge_mask]]
    edge_labels = isnothing(data.edge_labels) ? nothing : data.edge_labels[edge_mask]

    graph_labels = data.graph_labels[i]

    # Attributes are sliced along their second dimension.
    node_attributes = isnothing(data.node_attributes) ? nothing : data.node_attributes[:, node_mask]
    edge_attributes = isnothing(data.edge_attributes) ? nothing : data.edge_attributes[:, edge_mask]
    graph_attributes = isnothing(data.graph_attributes) ? nothing : data.graph_attributes[:, i]

    return TUDataset(length(nodes), length(source), length(graph_labels),
                     source, target,
                     graph_indicator,
                     node_labels,
                     edge_labels,
                     graph_labels,
                     node_attributes,
                     edge_attributes,
                     graph_attributes)
end
94
-
95
-
96
- end # module
97
-
0 commit comments