Skip to content

Commit 5f5495f

Browse files
update cora
add PubMed and CiteSeer
1 parent aff7d26 commit 5f5495f

File tree

15 files changed

+407
-45
lines changed

15 files changed

+407
-45
lines changed

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "MLDatasets"
22
uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458"
3-
version = "0.5.8"
3+
version = "0.5.9"
44

55
[deps]
66
BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
@@ -10,6 +10,7 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
1010
FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
1111
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
1212
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
13+
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
1314
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
1415

1516
[compat]
@@ -20,6 +21,7 @@ FixedPointNumbers = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
2021
GZip = "0.5"
2122
ImageCore = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
2223
MAT = "0.7, 0.8, 0.9, 0.10"
24+
PyCall = "1"
2325
Requires = "1"
2426
julia = "1"
2527

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Each dataset has its own dedicated sub-module.
2020
Find below a list of available datasets and links to their documentation.
2121

2222
#### Vision
23-
- [CIFAR10](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/)
23+
- [CIFAR10](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR10/)
2424
- [CIFAR100](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CIFAR100/)
2525
- [EMNIST](https://juliaml.github.io/MLDatasets.jl/latest/datasets/EMNIST/)
2626
- [FashionMNIST](https://juliaml.github.io/MLDatasets.jl/latest/datasets/FashionMNIST/)
@@ -38,7 +38,9 @@ Find below a list of available datasets and links to their documentation.
3838
- [UD_English](https://juliaml.github.io/MLDatasets.jl/latest/datasets/UD_English/)
3939

4040
#### Graphs
41+
- [CiteSeer](https://juliaml.github.io/MLDatasets.jl/latest/datasets/CiteSeer/)
4142
- [Cora](https://juliaml.github.io/MLDatasets.jl/latest/datasets/Cora/)
43+
- [PubMed](https://juliaml.github.io/MLDatasets.jl/latest/datasets/PubMed/)
4244

4345

4446

docs/src/datasets/CiteSeer.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# CiteSeer
2+
3+
```@docs
4+
CiteSeer
5+
```
6+
7+
## API reference
8+
9+
```@docs
10+
CiteSeer.dataset
11+
```

docs/src/datasets/Cora.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,5 @@ Cora
77
## API reference
88

99
```@docs
10-
Cora.alldata
10+
Cora.dataset
1111
```

docs/src/datasets/PubMed.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# PubMed
2+
3+
```@docs
4+
PubMed
5+
```
6+
7+
## API reference
8+
9+
```@docs
10+
PubMed.dataset
11+
```

src/CiteSeer/CiteSeer.jl

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
export CiteSeer
2+
3+
4+
"""
5+
CiteSeer
6+
7+
The CiteSeer citation network dataset from Ref. [1].
8+
Nodes represent documents and edges represent citation links.
9+
The dataset is designed for the node classification task.
10+
The task is to predict the category of a given paper.
11+
The dataset is retrieved from Ref. [2].
12+
13+
## Interface
14+
15+
- [`CiteSeer.dataset`](@ref)
16+
17+
## References
18+
19+
[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815)
20+
[2]: [Planetoid](https://github.com/kimiyoung/planetoid)
21+
"""
22+
module CiteSeer
23+
24+
using DataDeps
25+
using ..MLDatasets: datafile, read_planetoid_data
26+
using DelimitedFiles: readdlm
27+
28+
using PyCall
29+
30+
const DEPNAME = "CiteSeer"
31+
const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data"
32+
const DOCS = "https://github.com/kimiyoung/planetoid"
33+
const DATA = "ind.citeseer." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"]
34+
35+
# Module initializer: registers the CiteSeer `DataDep` (one remote file per
# entry of `DATA`) so that the dataset files can be resolved by name later.
function __init__()
    # Build the download URLs by plain string concatenation. `joinpath` is a
    # filesystem helper and uses `\` as separator on Windows, which would
    # yield invalid URLs. On POSIX systems the resulting strings are identical
    # to what `joinpath.(LINK, DATA)` produced.
    urls = string.(LINK, "/", DATA)

    register(DataDep(
        DEPNAME,
        """
        Dataset: The $DEPNAME dataset.
        Website: $DOCS
        """,
        urls,
        "7f7ec4df97215c573eee316de35754d89382011dfd9fb2b954a4a491057e3eb3", # if checksum omitted, will be generated by DataDeps
        # No `post_fetch_method` is passed: the files are used as downloaded.
    ))
end
47+
"""
    dataset(; dir=nothing, reverse_edges=true)

Load the CiteSeer dataset and return it as a named tuple with the fields

```julia-repl
julia> keys(CiteSeer.dataset())
(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed)
```

The `adjacency_list` field is a vector of vectors:
`adjacency_list[i]` holds the neighbors of node `i`
reached through outgoing edges.

With `reverse_edges=true`, the reverse of each edge is added to the graph
as well, which makes the graph undirected.

Both keyword arguments are forwarded unchanged to `read_planetoid_data`.

See also [`CiteSeer`](@ref).

## Usage Examples

```julia
using MLDatasets: CiteSeer
data = CiteSeer.dataset()
train_labels = data.node_labels[data.train_indices]
```
"""
function dataset(; dir=nothing, reverse_edges=true)
    return read_planetoid_data(DEPNAME; dir=dir, reverse_edges=reverse_edges)
end
76+
77+
78+
end #module
79+

src/Cora/Cora.jl

Lines changed: 57 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,52 @@
11
export Cora
22

3+
34
"""
45
Cora
56
6-
The full Cora citation network dataset from the
7-
`"Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via
8-
Ranking" <https://arxiv.org/abs/1707.03815>`_ paper.
7+
The Cora citation network dataset from Ref. [1].
98
Nodes represent documents and edges represent citation links.
9+
Each node has a predefined feature with 1433 dimensions.
10+
The dataset is designed for the node classification task.
11+
The task is to predict the category of a given paper.
12+
The dataset is retrieved from Ref. [2].
13+
14+
## Statistics
15+
16+
- Nodes: 2708
17+
- Edges: 10556
18+
- Number of Classes: 7
19+
- Label split:
20+
- Train: 140
21+
- Val: 500
22+
- Test: 1000
23+
24+
The split is the one used in the original paper [1] and
25+
doesn't consider all nodes.
1026
1127
## Interface
1228
13-
- [`Cora.alldata`](@ref)
29+
- [`Cora.dataset`](@ref)
30+
31+
## References
32+
33+
[1]: [Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via Ranking](https://arxiv.org/abs/1707.03815)
34+
[2]: [Planetoid](https://github.com/kimiyoung/planetoid)
1435
"""
1536
module Cora
1637

1738
using DataDeps
18-
using ..MLDatasets: datafile
39+
using ..MLDatasets: datafile, read_planetoid_data
1940
using DelimitedFiles: readdlm
2041

21-
const DEPNAME = "Cora"
22-
const LINK = "http://nrvis.com/download/data/labeled/cora.zip"
23-
const DOCS = "http://networkrepository.com/cora.php"
42+
using PyCall
2443

44+
const DEPNAME = "Cora"
45+
# LINK = "https://github.com/shchur/gnn-benchmark/raw/master/data/npz"
46+
# LINK = "https://github.com/abojchevski/graph2gauss/raw/master/data/"
47+
const LINK = "https://github.com/kimiyoung/planetoid/raw/master/data"
48+
const DOCS = "https://github.com/kimiyoung/planetoid"
49+
const DATA = "ind.cora." .* ["x", "y", "tx", "allx", "ty", "ally", "graph", "test.index"]
2550

2651
function __init__()
2752
register(DataDep(
@@ -30,40 +55,41 @@ function __init__()
3055
Dataset: The $DEPNAME dataset.
3156
Website: $DOCS
3257
""",
33-
LINK,
34-
"a3e3a37c34c9385fe8089bbc7c17ef78ecc3bdf8a4b03b80d02aaa080d9501c8", # if checksum omitted, will be generated by DataDeps
35-
post_fetch_method = unpack
58+
joinpath.(LINK, DATA),
59+
"81de017067dc045ebdb8ffd5c0e69a209973ffdb1fe2d5b434e94d3614f3f5c7", # if checksum omitted, will be generated by DataDeps
60+
# post_fetch_method = unpack
3661
))
3762
end
3863

3964
"""
40-
alldata(; dir=nothing)
65+
dataset(; dir=nothing, reverse_edges=true)
4166
4267
Retrieve the Cora dataset. The output is a named tuple with fields
68+
```julia-repl
69+
julia> keys(Cora.dataset())
70+
(:node_features, :node_labels, :adjacency_list, :train_indices, :val_indices, :test_indices, :num_classes, :num_nodes, :num_edges, :directed)
71+
```
72+
In particular, `adjacency_list` is a vector of vectors,
73+
where `adjacency_list[i]` will contain the neighbors of node `i`
74+
through outgoing edges.
4375
44-
- `edges`
45-
- `node_labels`
46-
- `directed`
76+
If `reverse_edges=true`, the graph will contain
77+
the reverse of each edge and the graph will be undirected.
78+
79+
See also [`Cora`](@ref).
4780
4881
## Usage Examples
82+
4983
```julia
50-
julia> using MLDatasets: Cora
84+
using MLDatasets: Cora
5185
52-
julia> data = Cora.alldata()
53-
(edges = [1 9; 1 436; … ; 2708 1390; 2708 2345], node_labels = [3, 6, 5, 5, 4, 4, 7, 3, 3, 7 … 4, 4, 4, 3, 2, 2, 2, 2, 1, 3], directed = true)
86+
data = Cora.dataset()
87+
train_labels = data.node_labels[data.train_indices]
5488
```
5589
"""
56-
function alldata(; dir=nothing)
57-
edges = readdlm(datafile(DEPNAME, "cora.edges", dir), ',', Int)
58-
@assert all(edges[:,3] .== 1)
59-
edges = edges[:,1:2]
60-
61-
node_labels = readdlm(datafile(DEPNAME, "cora.node_labels", dir), ',', Int)
62-
node_labels = node_labels[:,2] # first column is just 1:n
63-
64-
return (; edges=edges,
65-
node_labels=node_labels,
66-
directed=true)
67-
end
90+
dataset(; dir=nothing, reverse_edges=true) =
91+
read_planetoid_data(DEPNAME, dir=dir, reverse_edges=reverse_edges)
92+
93+
94+
end #module
6895

69-
end

src/MLDatasets.jl

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
module MLDatasets
22

33
using Requires
4+
using DelimitedFiles: readdlm
45
using FixedPointNumbers, ColorTypes
6+
using PyCall
57

68
bytes_to_type(::Type{UInt8}, A::Array{UInt8}) = A
79
bytes_to_type(::Type{N0f8}, A::Array{UInt8}) = reinterpret(N0f8, A)
@@ -45,13 +47,29 @@ include("PTBLM/PTBLM.jl")
4547
include("UD_English/UD_English.jl")
4648

4749
# Graphs
48-
include("Cora/Cora.jl")
50+
include("planetoid.jl")
51+
include("Cora/Cora.jl")
52+
include("PubMed/PubMed.jl")
53+
include("CiteSeer/CiteSeer.jl")
4954

5055
# Module initializer. Enables the optional ImageCore integration and defines
# the Python helper `pyread_planetoid_file` through PyCall. The helper is
# defined here, at load time, rather than at module top level.
function __init__()
    # initialize optional dependencies
    @require ImageCore="a09fc81d-aa75-5fe9-8630-4744c3626534" begin
        global __images_supported__ = true
    end

    # `pyread_planetoid_file(path, name)` unpickles one dataset file.
    # The `graph` file is returned as loaded; for every other file, an object
    # exposing `todense` is converted with it before being returned.
    py"""
    import numpy as np
    import pickle

    def pyread_planetoid_file(path, name):
        # NOTE(security): unpickling can execute arbitrary code, so callers
        # should only pass files that come from a trusted source.
        # The `with` block closes the file handle even if unpickling fails;
        # previously the handle was left open.
        with open(path, "rb") as f:
            out = pickle.load(f, encoding="latin1")
        if name == 'graph':
            return out
        out = out.todense() if hasattr(out, 'todense') else out
        return out
    """
end
5674

5775
end

0 commit comments

Comments
 (0)