Skip to content

Commit 50e6f1c

Browse files
add Cora Dataset (#69)
* add Cora Dataset
1 parent 24ccdc1 commit 50e6f1c

File tree

11 files changed

+137
-36
lines changed

11 files changed

+137
-36
lines changed

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,8 @@ Find below a list of available datasets and links to their documentation.
3838
- [UD_English](https://juliaml.github.io/MLDatasets.jl/latest/datasets/UD_English/)
3939

4040
#### Graphs
41-
- To be added.
41+
- [Cora](https://juliaml.github.io/MLDatasets.jl/latest/datasets/Cora/)
4242

43-
#### Audio
44-
- To be added.
4543

4644

4745
## Installation
@@ -55,6 +53,11 @@ import Pkg
5553
Pkg.add("MLDatasets")
5654
```
5755

56+
## Contributing to MLDatasets
57+
58+
New dataset contributions are warmly welcome. See `src/Cora/Cora.jl` for an example
59+
of a minimal implementation.
60+
5861
## License
5962

6063
This code is free to use under the terms of the MIT license.

docs/make.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using Documenter, MLDatasets
22

3-
DocMeta.setdocmeta!(MLDatasets, :DocTestSetup, :(using MLDatasets); recursive=true)
3+
## Commented out because it triggers a warning
4+
# DocMeta.setdocmeta!(MLDatasets, :DocTestSetup, :(using MLDatasets); recursive=true)
45

56
# Build documentation.
67
# ====================
@@ -38,9 +39,14 @@ makedocs(
3839
"UD_English" => "datasets/UD_English.md",
3940
],
4041

42+
"Graphs" => Any[
43+
"Cora" => "datasets/Cora.md",
44+
],
45+
4146
],
4247
"LICENSE.md",
43-
]
48+
],
49+
strict = true
4450
)
4551

4652

docs/src/datasets/Cora.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Cora
2+
3+
```@docs
4+
Cora
5+
```
6+
7+
## API reference
8+
9+
```@docs
10+
Cora.alldata
11+
```

src/Cora/Cora.jl

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
export Cora
2+
3+
"""
4+
Cora
5+
6+
The full Cora citation network dataset from the
7+
`"Deep Gaussian Embedding of Graphs: Unsupervised Inductive Learning via
8+
Ranking" <https://arxiv.org/abs/1707.03815>`_ paper.
9+
Nodes represent documents and edges represent citation links.
10+
11+
## Interface
12+
13+
- [`Cora.alldata`](@ref)
14+
"""
15+
module Cora
16+
17+
using DataDeps
18+
using ..MLDatasets: datafile
19+
using DelimitedFiles: readdlm
20+
21+
const DEPNAME = "Cora"
22+
const LINK = "http://nrvis.com/download/data/labeled/cora.zip"
23+
const DOCS = "http://networkrepository.com/cora.php"
24+
25+
26+
function __init__()
27+
register(DataDep(
28+
DEPNAME,
29+
"""
30+
Dataset: The $DEPNAME dataset.
31+
Website: $DOCS
32+
""",
33+
LINK,
34+
"a3e3a37c34c9385fe8089bbc7c17ef78ecc3bdf8a4b03b80d02aaa080d9501c8", # if checksum omitted, will be generated by DataDeps
35+
post_fetch_method = unpack
36+
))
37+
end
38+
39+
"""
40+
alldata(; dir=nothing)
41+
42+
Retrieve the Cora dataset. The output is a named tuple with fields
43+
44+
- `edges`
45+
- `node_labels`
46+
- `directed`
47+
48+
## Usage Examples
49+
```juliarepl
50+
julia> using MLDatasets: Cora
51+
52+
julia> data = Cora.alldata()
53+
(edges = [1 9; 1 436; … ; 2708 1390; 2708 2345], node_labels = [3, 6, 5, 5, 4, 4, 7, 3, 3, 7 … 4, 4, 4, 3, 2, 2, 2, 2, 1, 3], directed = true)
54+
```
55+
"""
56+
function alldata(; dir=nothing)
57+
edges = readdlm(datafile(DEPNAME, "cora.edges", dir), ',', Int)
58+
@assert all(edges[:,3] .== 1)
59+
edges = edges[:,1:2]
60+
61+
node_labels = readdlm(datafile(DEPNAME, "cora.node_labels", dir), ',', Int)
62+
node_labels = node_labels[:,2] # first column is just 1:n
63+
64+
return (; edges=edges,
65+
node_labels=node_labels,
66+
directed=true)
67+
end
68+
69+
end

src/EMNIST/EMNIST.jl

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,6 @@ using ..MNIST: convert2image
99

1010
const DEPNAME = "EMNIST"
1111

12-
"""
13-
download([dir]; [i_accept_the_terms_of_use])
14-
15-
Trigger the (interactive) download of the full dataset into
16-
"`dir`". If no `dir` is provided the dataset will be
17-
downloaded into "~/.julia/datadeps/$DEPNAME".
18-
19-
This function will display an interactive dialog unless
20-
either the keyword parameter `i_accept_the_terms_of_use` or
21-
the environment variable `DATADEPS_ALWAY_ACCEPT` is set to
22-
`true`. Note that using the data responsibly and respecting
23-
copyright/terms-of-use remains your responsibility.
24-
"""
2512
download(args...; kw...) = download_dep(DEPNAME, args...; kw...)
2613

2714
function __init__()

src/MLDatasets.jl

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,27 @@ end
2525

2626

2727
include("download.jl")
28-
include("CoNLL.jl")
28+
29+
30+
# Misc.
2931
include("BostonHousing/BostonHousing.jl")
3032
include("Iris/Iris.jl")
33+
34+
# Vision
3135
include("CIFAR10/CIFAR10.jl")
3236
include("CIFAR100/CIFAR100.jl")
3337
include("MNIST/MNIST.jl")
3438
include("FashionMNIST/FashionMNIST.jl")
3539
include("SVHN2/SVHN2.jl")
40+
include("EMNIST/EMNIST.jl")
41+
42+
# Text
43+
include("CoNLL.jl")
3644
include("PTBLM/PTBLM.jl")
3745
include("UD_English/UD_English.jl")
38-
include("EMNIST/EMNIST.jl")
46+
47+
# Graphs
48+
include("Cora/Cora.jl")
3949

4050
function __init__()
4151
# initialize optional dependencies

src/download.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ function datafile(depname, filename, dir = nothing; recurse = true, kw...)
3232
path = joinpath(datadir(depname, dir; kw...), filename)
3333
if !isfile(path)
3434
@warn "The file \"$path\" does not exist, even though the dataset-specific folder does. This is an unusual situation that may have been caused by a manual creation of an empty folder, or manual deletion of the given file \"$filename\"."
35-
if dir == nothing
35+
if dir === nothing
3636
@info "Retriggering DataDeps.jl for \"$depname\""
3737
download_dep(depname; kw...)
3838
else

test/runtests.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
using Test
22
using MLDatasets
33
using ImageCore
4+
using DataDeps
5+
6+
7+
ENV["DATADEPS_ALWAYS_ACCEPT"] = true
48

59
tests = [
610
"tst_iris.jl",
@@ -11,6 +15,7 @@ tests = [
1115
"tst_fashion_mnist.jl",
1216
"tst_svhn2.jl",
1317
"tst_emnist.jl",
18+
"tst_cora.jl"
1419
]
1520

1621
for t in tests

test/tst_boston_housing.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ using DataDeps
44
using MLDatasets
55

66

7-
@testset "Iris" begin
7+
@testset "Boston Housing" begin
88
X = BostonHousing.features()
99
Y = BostonHousing.targets()
1010
names = BostonHousing.feature_names()

test/tst_cora.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Resolve the "Cora" data dependency up front with the interactive DataDeps
# prompt disabled. The variable DataDeps reads is DATADEPS_ALWAYS_ACCEPT.
data_dir = withenv("DATADEPS_ALWAYS_ACCEPT" => "true") do
    datadep"Cora"
end

@testset "Cora" begin
    data = Cora.alldata()
    @test data isa NamedTuple

    @test data.edges isa Matrix{Int}
    @test size(data.edges) == (5429, 2)
    @test data.node_labels isa Vector{Int}
    @test size(data.node_labels) == (2708,)
    @test data.directed
end

0 commit comments

Comments
 (0)