Skip to content

Commit 8498c72

Browse files
committed
rework the NLP datasets for DataDeps
1 parent 84f96c0 commit 8498c72

File tree

7 files changed

+169
-62
lines changed

7 files changed

+169
-62
lines changed
File renamed without changes.

src/MLDatasets.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ bytes_to_type(::Type{T}, A::Array{UInt8}) where T<:Integer = convert(Array{T}, A
99
# Convert raw bytes to floating point values of type `T` by dividing every
# element of `A` by 255 (expressed in `T`).
function bytes_to_type(::Type{T}, A::Array{UInt8}) where T<:AbstractFloat
    scale = T(255)
    return A ./ scale
end
1010
# Convert raw bytes to an array with element type `T`, going through an
# `N0f8` reinterpretation of `A` first.
function bytes_to_type(::Type{T}, A::Array{UInt8}) where T<:Number
    normed = reinterpret(N0f8, A)
    return convert(Array{T}, normed)
end
1111

12-
include("io/download.jl")
13-
include("io/CoNLL.jl")
12+
include("download.jl")
13+
include("CoNLL.jl")
1414

1515
include("CIFAR10/CIFAR10.jl")
1616
include("CIFAR100/CIFAR100.jl")
1717
include("MNIST/MNIST.jl")
1818
include("FashionMNIST/FashionMNIST.jl")
19-
include("PTBLM.jl")
20-
include("UD_English.jl")
19+
include("PTBLM/PTBLM.jl")
20+
include("UD_English/UD_English.jl")
2121

2222
end

src/PTBLM.jl

Lines changed: 0 additions & 35 deletions
This file was deleted.

src/PTBLM/PTBLM.jl

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
export PTBLM
2+
module PTBLM
3+
4+
using DataDeps
5+
using ..datafile
6+
using ..download_dep
7+
8+
export
9+
10+
traindata,
11+
testdata,
12+
13+
download
14+
15+
const DEPNAME = "PTBLM"
16+
const TRAINFILE = "ptb.train.txt"
17+
const TESTFILE = "ptb.test.txt"
18+
19+
# Forward every positional and keyword argument to `download_dep`, with this
# module's dependency name (`DEPNAME`) prepended.
function download(args...; kw...)
    return download_dep(DEPNAME, args...; kw...)
end
20+
21+
# Keyword entry point: hand `dir` (default `nothing`) to the positional
# `traindata(dir)` method.
function traindata(; dir = nothing)
    return traindata(dir)
end
22+
# Keyword entry point: hand `dir` (default `nothing`) to the positional
# `testdata(dir)` method.
function testdata(; dir = nothing)
    return testdata(dir)
end
23+
24+
# Load the training split: resolve `TRAINFILE` through `datafile`, read it
# with `readdata`, and pair the result with the targets built by `makeys`.
# Returns the tuple `(xs, ys)`.
function traindata(dir)
    sentences = readdata(datafile(DEPNAME, TRAINFILE, dir))
    targets = makeys(sentences)
    return sentences, targets
end
30+
31+
# Load the test split: resolve `TESTFILE` through `datafile`, read it with
# `readdata`, and pair the result with the targets built by `makeys`.
# Returns the tuple `(xs, ys)`.
function testdata(dir)
    sentences = readdata(datafile(DEPNAME, TESTFILE, dir))
    targets = makeys(sentences)
    return sentences, targets
end
37+
38+
# Read the text file at `path` and return one `Vector{String}` per line,
# obtained by splitting the (chomped) line on whitespace.
function readdata(path)
    return open(path) do io
        [Vector{String}(split(chomp(line))) for line in readlines(io)]
    end
end
42+
43+
# Build the target sequences for language modelling.
#
# For every sentence `x` in `xs` the target is `x` without its first token
# and with "<eos>" appended, so the target has the same length as a non-empty
# input. The input vectors are not modified.
#
# The first token is dropped by slicing (`x[2:end]`), which yields a fresh
# vector and is also well-defined for an empty sentence (e.g. a blank line in
# the data file): the target is then just ["<eos>"] instead of an error.
function makeys(xs::Vector{Vector{String}})
    return map(xs) do x
        y = x[2:end]
        push!(y, "<eos>")
    end
end
50+
51+
# Module initialiser: registers the PTBLM data dependency with DataDeps via
# `RegisterDataDep`, passing the dependency name, the description text, the
# list of remote file URLs and the checksum string.
function __init__()
    description = """
    Dataset: Penn Treebank sentences for language modeling
    Website: https://github.com/tomsercu/lstm

    -----------------------------------------------------
    WARNING: EXPERIMENTAL STATUS
    Please be aware that this dataset is from a secondary
    source. The provided interface by this package is not
    as developed as those for other datasets. We would
    welcome any contribution to provide this dataset in a
    more mature manner.
    ------------------------------------------------------

    The PTBLM dataset consists of Penn Treebank sentences
    for language modeling, available from tomsercu/lstm.
    The unknown words are replaced with <unk> so that the
    total vocaburary size becomes 10000.

    The files are available for download at the github
    repository linked above. Note that using the data
    responsibly and respecting copyright remains your
    responsibility.
    """
    # One URL per file, built by prefixing each file name with the base URL.
    urls = "https://raw.githubusercontent.com/tomsercu/lstm/master/data/" .* [TRAINFILE, TESTFILE]
    checksum = "218f4e6c7288bb5efeb03cc4cb8ae9c04ecd8462ebfba8e13e3549fab69dc25f"
    RegisterDataDep(DEPNAME, description, urls, checksum)
end
81+
end

src/UD_English.jl

Lines changed: 0 additions & 21 deletions
This file was deleted.

src/UD_English/UD_English.jl

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
export UD_English
2+
module UD_English
3+
4+
using DataDeps
5+
using ..MLDatasets
6+
using ..datafile
7+
using ..download_dep
8+
9+
# Public API of this module. `devdata` is defined with the same call
# signatures as `traindata` and `testdata`, so it is exported with them.
export

    traindata,
    devdata,
    testdata,

    download
15+
16+
const DEPNAME = "UD_English"
17+
const TRAINFILE = "en-ud-train.conllu"
18+
const DEVFILE = "en-ud-dev.conllu"
19+
const TESTFILE = "en-ud-test.conllu"
20+
21+
# Forward every positional and keyword argument to `download_dep`, with this
# module's dependency name (`DEPNAME`) prepended.
function download(args...; kw...)
    return download_dep(DEPNAME, args...; kw...)
end
22+
23+
# Keyword entry point: hand `dir` (default `nothing`) to the positional
# `traindata(dir)` method.
function traindata(; dir = nothing)
    return traindata(dir)
end
24+
# Keyword entry point: hand `dir` (default `nothing`) to the positional
# `devdata(dir)` method.
function devdata(; dir = nothing)
    return devdata(dir)
end
25+
# Keyword entry point: hand `dir` (default `nothing`) to the positional
# `testdata(dir)` method.
function testdata(; dir = nothing)
    return testdata(dir)
end
26+
27+
# Read the training file (`TRAINFILE`) of the dependency located via `dir`.
function traindata(dir)
    return readdata(dir, TRAINFILE)
end
28+
# Read the development file (`DEVFILE`) of the dependency located via `dir`.
function devdata(dir)
    return readdata(dir, DEVFILE)
end
29+
# Read the test file (`TESTFILE`) of the dependency located via `dir`.
function testdata(dir)
    return readdata(dir, TESTFILE)
end
30+
31+
# Resolve `filename` for this dependency through `datafile` (honouring `dir`)
# and return whatever `CoNLL.read` produces for that path.
function readdata(dir, filename)
    return CoNLL.read(datafile(DEPNAME, filename, dir))
end
35+
36+
# Module initialiser: registers the UD_English data dependency with DataDeps
# via `RegisterDataDep`, passing the dependency name, the description text,
# the list of remote file URLs and the checksum string.
function __init__()
    description = """
    Dataset: Universal Dependencies - English Dependency Treebank Universal Dependencies English Web Treebank
    Authors: Natalia Silveira and Timothy Dozat and
    Marie-Catherine de Marneffe and Samuel
    Bowman and Miriam Connor and John Bauer and
    Christopher D. Manning
    Website: https://github.com/UniversalDependencies/UD_English

    A Gold Standard Universal Dependencies Corpus for
    English, built over the source material of the
    English Web Treebank LDC2012T13
    (https://catalog.ldc.upenn.edu/LDC2012T13).

    You are encouraged to cite this paper if you use the
    Universal Dependencies English Web Treebank:

    @inproceedings{silveira14gold,
    year = {2014},
    author = {Natalia Silveira and Timothy Dozat
    and Marie-Catherine de Marneffe and Samuel
    Bowman and Miriam Connor and John Bauer and
    Christopher D. Manning},
    title = {A Gold Standard Dependency Corpus for {E}nglish},
    booktitle = {Proceedings of the Ninth
    International Conference on Language Resources
    and Evaluation (LREC-2014)}
    }

    The files are available for download at the github
    repository linked above. Note that using the data
    responsibly and respecting copyright remains your
    responsibility. Copyright and License is discussed in
    detail on the Website.
    """
    # One URL per file, built by prefixing each file name with the base URL.
    urls = "https://raw.githubusercontent.com/UniversalDependencies/UD_English/master/" .* [TRAINFILE, DEVFILE, TESTFILE]
    checksum = "2311e260488453d5ba170cfd94e58ac4bd536263ea9545c7b25f0804e87b28a2"
    RegisterDataDep(DEPNAME, description, urls, checksum)
end
77+
end

src/io/download.jl renamed to src/download.jl

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,13 @@ function datafile(depname, filename, dir = nothing; recurse = true, kw...)
3232
path = joinpath(datadir(depname, dir; kw...), filename)
3333
if !isfile(path)
3434
warn("The file \"$path\" does not exist, even though the dataset-specific folder does. This is an unusual situation that may have been caused by a manual creation of an empty folder, or manual deletion of the given file \"$filename\".")
35-
info("Retriggering DataDeps.jl for \"$depname\" to \"$dir\".")
36-
download_dep(depname, dir; kw...)
35+
if dir == nothing
36+
info("Retriggering DataDeps.jl for \"$depname\"")
37+
download_dep(depname; kw...)
38+
else
39+
info("Retriggering DataDeps.jl for \"$depname\" to \"$dir\".")
40+
download_dep(depname, dir; kw...)
41+
end
3742
if recurse
3843
datafile(depname, filename, dir; recurse = false, kw...)
3944
else

0 commit comments

Comments
 (0)