Skip to content

Commit 524be35

Browse files
Merge pull request #79 from CTUAvastLab/master
Added mutagenesis dataset
2 parents 8656032 + d267f1f commit 524be35

File tree

8 files changed

+122
-0
lines changed

8 files changed

+122
-0
lines changed

Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
99
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
1010
FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
1111
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
12+
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
1213
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
1314
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
1415
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
@@ -20,6 +21,7 @@ DataDeps = "0.3, 0.4, 0.5, 0.6, 0.7"
2021
FixedPointNumbers = "0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
2122
GZip = "0.5"
2223
ImageCore = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
24+
JSON3 = "1"
2325
MAT = "0.7, 0.8, 0.9, 0.10"
2426
PyCall = "1"
2527
Requires = "1"

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Find below a list of available datasets and links to their documentation.
3030
#### Miscellaneous
3131
- [BostonHousing](https://juliaml.github.io/MLDatasets.jl/dev/datasets/BostonHousing/)
3232
- [Iris](https://juliaml.github.io/MLDatasets.jl/dev/datasets/Iris/)
33+
- [Mutagenesis](https://juliaml.github.io/MLDatasets.jl/dev/datasets/Mutagenesis/) ([source](https://relational.fit.cvut.cz/dataset/Mutagenesis))
3334

3435
#### Text
3536
- [PTBLM](https://juliaml.github.io/MLDatasets.jl/dev/datasets/PTBLM/)

docs/make.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ makedocs(
3333
"Miscellaneous" => Any[
3434
"Iris" => "datasets/Iris.md",
3535
"Boston Housing" => "datasets/BostonHousing.md",
36+
"Mutagenesis" => "datasets/Mutagenesis.md",
3637
],
3738

3839
"Text" => Any[

docs/src/datasets/Mutagenesis.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Mutagenesis
2+
3+
```@docs
4+
Mutagenesis
5+
```

src/MLDatasets.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ include("download.jl")
3939
# Misc.
4040
include("BostonHousing/BostonHousing.jl")
4141
include("Iris/Iris.jl")
42+
include("Mutagenesis/Mutagenesis.jl")
4243

4344
# Vision
4445
include("CIFAR10/CIFAR10.jl")

src/Mutagenesis/Mutagenesis.jl

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
export Mutagenesis
2+
3+
"""
4+
Mutagenesis
5+
6+
Website: https://relational.fit.cvut.cz/dataset/Mutagenesis
7+
License: CC0
8+
9+
The `Mutagenesis` dataset comprises 188 molecules trialed for mutagenicity on Salmonella typhimurium, available from
10+
[relational.fit.cvut.cz](https://relational.fit.cvut.cz/dataset/Mutagenesis) and
11+
[CTUAvastLab/datasets](https://github.com/CTUAvastLab/datasets/tree/main/mutagenesis).
12+
13+
Train, test and validation data can be loaded using the following code.
14+
Wrapping the first call in `withenv("DATADEPS_ALWAYS_ACCEPT"=>"true")` suppresses the download confirmation prompt.
15+
```jldoctest
16+
julia> using MLDatasets: Mutagenesis
17+
18+
julia> train_x, train_y = withenv("DATADEPS_ALWAYS_ACCEPT"=>"true") do; Mutagenesis.traindata(); end;
19+
20+
julia> test_x, test_y = Mutagenesis.testdata();
21+
22+
julia> val_x, val_y = Mutagenesis.valdata();
23+
24+
julia> train_x[1]
25+
JSON3.Object{Base.CodeUnits{UInt8, String}, SubArray{UInt64, 1, Vector{UInt64}, Tuple{UnitRange{Int64}}, true}} with 6 entries:
26+
:ind1 => 1
27+
:inda => 0
28+
:logp => 4.23
29+
:lumo => -1.246
30+
:mutagenic => 1
31+
:atoms => JSON3.Object[{…
32+
33+
julia> train_y[1]
34+
1
35+
```
36+
"""
37+
module Mutagenesis

using DataDeps, JSON3
using ..MLDatasets: datafile

# Name under which the dataset is registered with DataDeps.
const DEPNAME = "Mutagenesis"
# File containing the samples.
const DATA = "data.json"
# File containing the label key and the validation/test split sizes.
const METADATA = "meta.json"

# Register the data dependency when the module is loaded, so that the files
# can be resolved (and fetched on first use) through DataDeps.
function __init__()
    ORIGINAL_LINK = "https://relational.fit.cvut.cz/dataset/Mutagenesis"
    DATA_LINK = "https://raw.githubusercontent.com/CTUAvastLab/datasets/main/mutagenesis"

    register(DataDep(
        DEPNAME,
        """
        Dataset: The $DEPNAME dataset.
        Website: $ORIGINAL_LINK
        License: CC0
        """,
        # One remote URL per file: the samples and the metadata.
        "$DATA_LINK/" .* [DATA, METADATA],
        # Checksum handed to DataDeps for verifying the download.
        "80ec1716217135e1f2e0b5a61876c65184e2014e64551103c41e174775ca207c"
    ))
end

# Keyword-argument entry points. `dir = nothing` is forwarded unchanged to
# `datafile`.
# NOTE(review): presumably `nothing` makes `datafile` fall back to the
# registered DataDep location -- confirm against MLDatasets.datafile.
traindata(; dir = nothing) = traindata(dir)
testdata(; dir = nothing) = testdata(dir)
valdata(; dir = nothing) = valdata(dir)

# Return `(samples, targets)` restricted to the training split.
function traindata(dir)
    samples, targets, train_idxs, _, _ = load_data(dir)
    return samples[train_idxs], targets[train_idxs]
end

# Return `(samples, targets)` restricted to the test split.
function testdata(dir)
    samples, targets, _, _, test_idxs = load_data(dir)
    return samples[test_idxs], targets[test_idxs]
end

# Return `(samples, targets)` restricted to the validation split.
function valdata(dir)
    samples, targets, _, val_idxs, _ = load_data(dir)
    return samples[val_idxs], targets[val_idxs]
end

# Read the samples and metadata and derive the targets and the three index
# ranges.
#
# Returns `(samples, targets, train_idxs, val_idxs, test_idxs)`. The splits
# are contiguous and ordered train, validation, test: the last `test_samples`
# entries form the test split, the `val_samples` entries before them form the
# validation split, and everything before that is training data.
#
# Throws `ArgumentError` when the split sizes in the metadata are negative or
# exceed the number of samples.
function load_data(dir)
    data_path = datafile(DEPNAME, DATA, dir)
    metadata_path = datafile(DEPNAME, METADATA, dir)
    samples = read_data(data_path)
    metadata = read_data(metadata_path)

    # The metadata names the field of each sample that holds its label.
    labelkey = metadata["label"]
    targets = map(sample -> sample[labelkey], samples)

    n = length(samples)
    val_num = metadata["val_samples"]
    test_num = metadata["test_samples"]

    # Fail loudly on inconsistent metadata rather than returning an empty
    # training set or hitting a BoundsError later when a split is indexed.
    if val_num < 0 || test_num < 0 || val_num + test_num > n
        throw(ArgumentError(
            "invalid split sizes in $METADATA: val_samples=$val_num, " *
            "test_samples=$test_num, but the dataset has $n samples"
        ))
    end

    train_end = n - val_num - test_num
    val_end = n - test_num
    train_idxs = 1:train_end
    val_idxs = (train_end + 1):val_end
    test_idxs = (val_end + 1):n
    return samples, targets, train_idxs, val_idxs, test_idxs
end

# Parse the JSON file at `path`; `open` with a function argument closes the
# file once `JSON3.read` has returned.
read_data(path) = open(JSON3.read, path)

end # module

test/runtests.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ tests = [
1010
# misc
1111
"tst_iris.jl",
1212
"tst_boston_housing.jl",
13+
"tst_mutagenesis.jl",
1314
# vision
1415
"tst_cifar10.jl",
1516
"tst_cifar100.jl",

test/tst_mutagenesis.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Resolve the data dependency up front. DATADEPS_ALWAYS_ACCEPT suppresses the
# download confirmation prompt so the suite can run unattended.
data_dir = withenv("DATADEPS_ALWAYS_ACCEPT" => "true") do
    datadep"Mutagenesis"
end

@testset "Mutagenesis" begin
    train_x, train_y = Mutagenesis.traindata()
    test_x, test_y = Mutagenesis.testdata()
    val_x, val_y = Mutagenesis.valdata()

    # Each split must pair every sample with a target and have the expected
    # size (100 train / 44 test / 44 validation).
    @test length(train_x) == length(train_y) == 100
    @test length(test_x) == length(test_y) == 44
    @test length(val_x) == length(val_y) == 44
end

0 commit comments

Comments
 (0)