Skip to content

Commit 321bb61

Browse files
committed
Added mutagenesis dataset
1 parent 8656032 commit 321bb61

File tree

8 files changed

+96
-0
lines changed

8 files changed

+96
-0
lines changed

Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
99
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
1010
FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
1111
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
12+
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
1213
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
1314
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
1415
Requires = "ae029012-a4dd-5104-9daa-d747884805df"

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Find below a list of available datasets and links to their documentation.
3030
#### Miscellaneous
3131
- [BostonHousing](https://juliaml.github.io/MLDatasets.jl/dev/datasets/BostonHousing/)
3232
- [Iris](https://juliaml.github.io/MLDatasets.jl/dev/datasets/Iris/)
33+
- [Mutagenesis](https://juliaml.github.io/MLDatasets.jl/dev/datasets/Mutagenesis/)
3334

3435
#### Text
3536
- [PTBLM](https://juliaml.github.io/MLDatasets.jl/dev/datasets/PTBLM/)

docs/make.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ makedocs(
3333
"Miscellaneous" => Any[
3434
"Iris" => "datasets/Iris.md",
3535
"Boston Housing" => "datasets/BostonHousing.md",
36+
"Mutagenesis" => "datasets/Mutagenesis.md",
3637
],
3738

3839
"Text" => Any[

docs/src/datasets/Mutagenesis.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Mutagenesis
2+
3+
The `Mutagenesis` dataset comprises 188 molecules trialed for mutagenicity on Salmonella typhimurium, available from
4+
[relational.fit.cvut.cz](https://relational.fit.cvut.cz/dataset/Mutagenesis) and
5+
[CTUAvastLab/datasets](https://github.com/CTUAvastLab/datasets/tree/main/mutagenesis).
6+
7+
Train, test and validation data can be loaded using:
8+
```julia
9+
train_x, train_y = Mutagenesis.traindata()
10+
test_x, test_y = Mutagenesis.testdata()
11+
val_x, val_y = Mutagenesis.valdata()
12+
```
13+
14+
```@docs
15+
Mutagenesis
16+
```

src/MLDatasets.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ include("download.jl")
3939
# Misc.
4040
include("BostonHousing/BostonHousing.jl")
4141
include("Iris/Iris.jl")
42+
include("Mutagenesis/Mutagenesis.jl")
4243

4344
# Vision
4445
include("CIFAR10/CIFAR10.jl")

src/Mutagenesis/Mutagenesis.jl

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
export Mutagenesis
2+
"""
    Mutagenesis

Loader for the Mutagenesis dataset (molecules trialed for mutagenicity on
Salmonella typhimurium).

The data is fetched through DataDeps as two JSON files, `data.json` (the samples)
and `meta.json` (the label key and the sizes of the validation and test splits).

# Interface

- [`Mutagenesis.traindata`](@ref)
- [`Mutagenesis.testdata`](@ref)
- [`Mutagenesis.valdata`](@ref)

Each of them returns a `(samples, targets)` tuple and accepts an optional `dir`
that is forwarded to `MLDatasets.datafile`.
"""
module Mutagenesis

using DataDeps, JSON
using ..MLDatasets: datafile

# These names are needed both when the DataDep is registered (`__init__`) and when the
# files are looked up (`load_data`), so they must live at module scope rather than as
# locals of `__init__`.
const DEPNAME = "Mutagenesis"
const ORIGINAL_LINK = "https://relational.fit.cvut.cz/dataset/Mutagenesis"
const DATA_LINK = "https://raw.githubusercontent.com/CTUAvastLab/datasets/main/mutagenesis"
const DATA = "data.json"
const METADATA = "meta.json"

# Register the DataDep when the module is loaded; both JSON files are fetched from
# `DATA_LINK`.
function __init__()
    register(DataDep(
        DEPNAME,
        """
        Dataset: The $DEPNAME dataset.
        Website: $ORIGINAL_LINK
        License: CC0
        """,
        "$DATA_LINK/" .* [DATA, METADATA],
    ))
end

traindata(; dir = nothing) = traindata(dir)
testdata(; dir = nothing) = testdata(dir)
valdata(; dir = nothing) = valdata(dir)

"""
    traindata(; dir = nothing)
    traindata(dir)

Return `(samples, targets)` for the training split, i.e. all samples that precede
the validation and test samples in the data file.
"""
function traindata(dir)
    samples, targets, train_idxs, _, _ = load_data(dir)
    return samples[train_idxs], targets[train_idxs]
end

"""
    testdata(; dir = nothing)
    testdata(dir)

Return `(samples, targets)` for the test split, i.e. the last
`metadata["test_samples"]` samples of the data file.
"""
function testdata(dir)
    samples, targets, _, _, test_idxs = load_data(dir)
    return samples[test_idxs], targets[test_idxs]
end

"""
    valdata(; dir = nothing)
    valdata(dir)

Return `(samples, targets)` for the validation split, i.e. the
`metadata["val_samples"]` samples located directly before the test split.
"""
function valdata(dir)
    samples, targets, _, val_idxs, _ = load_data(dir)
    return samples[val_idxs], targets[val_idxs]
end

# Read both JSON files and derive the targets and the index ranges of the three splits.
# Returns `(samples, targets, train_idxs, val_idxs, test_idxs)`.
function load_data(dir)
    data_path = datafile(DEPNAME, DATA, dir)
    metadata_path = datafile(DEPNAME, METADATA, dir)
    samples = read_data(data_path)
    metadata = read_metadata(metadata_path)

    # The metadata names the key under which every sample stores its label.
    labelkey = metadata["label"]
    targets = map(sample -> sample[labelkey], samples)

    # Samples are laid out as [train | validation | test]; only the sizes of the last
    # two parts are stored, the training part is whatever remains at the front.
    val_num = metadata["val_samples"]
    test_num = metadata["test_samples"]
    total = length(samples)
    train_idxs = 1:total-val_num-test_num
    val_idxs = total-val_num-test_num+1:total-test_num
    test_idxs = total-test_num+1:total
    return samples, targets, train_idxs, val_idxs, test_idxs
end

# Parse the sample file; the top-level JSON value is converted to a `Vector{Dict}`.
read_data(path) = Vector{Dict}(open(JSON.parse, path))
# Parse the metadata file and return whatever `JSON.parse` produces for it.
read_metadata(path) = open(JSON.parse, path)

end # module

test/runtests.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ tests = [
1010
# misc
1111
"tst_iris.jl",
1212
"tst_boston_housing.jl",
13+
"tst_mutagenesis.jl",
1314
# vision
1415
"tst_cifar10.jl",
1516
"tst_cifar100.jl",

test/tst_mutagenesis.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Resolve (and, if necessary, download) the dataset up front. DataDeps only skips its
# interactive confirmation prompt when `DATADEPS_ALWAYS_ACCEPT` is set, so the variable
# name must be spelled exactly like this.
data_dir = withenv("DATADEPS_ALWAYS_ACCEPT" => "true") do
    datadep"Mutagenesis"
end

@testset "Mutagenesis" begin
    train_x, train_y = Mutagenesis.traindata()
    test_x, test_y = Mutagenesis.testdata()
    val_x, val_y = Mutagenesis.valdata()

    # Every split must return as many targets as samples, with the expected split sizes.
    @test length(train_x) == length(train_y) == 100
    @test length(test_x) == length(test_y) == 44
    @test length(val_x) == length(val_y) == 44
end

0 commit comments

Comments
 (0)