
Commit 67d4d89

add doc for some v2/dataset

1 parent 9f417f1 commit 67d4d89

File tree

13 files changed: +197 -41 lines changed

doc/api/v2/data.rst

Lines changed: 9 additions & 4 deletions
@@ -49,7 +49,6 @@ mnist
     :members:
     :noindex:

-
 cifar
 +++++

@@ -61,7 +60,7 @@ conll05
 +++++++

 .. automodule:: paddle.v2.dataset.conll05
-    :members:
+    :members: get_dict,get_embedding,test
     :noindex:

 imdb
@@ -79,12 +78,18 @@ imikolov
     :noindex:

 movielens
-+++++++++
++++++++++

 .. automodule:: paddle.v2.dataset.movielens
     :members:
     :noindex:

+.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
 sentiment
 +++++++++

@@ -102,7 +107,7 @@ uci_housing
 wmt14
 +++++

-.. automodule:: paddle.v2.dataset.uci_housing
+.. automodule:: paddle.v2.dataset.wmt14
     :members:
     :noindex:

doc/api/v2/run_logic.rst

Lines changed: 3 additions & 10 deletions
@@ -13,25 +13,18 @@ Trainer
 =======

 .. automodule:: paddle.v2.trainer
-    :members: Trainer
+    :members: SGD
     :noindex:

 Event
 =====

 .. automodule:: paddle.v2.event
-    :members: Event
+    :members:
     :noindex:

 Inference
 =========

-.. automodule:: paddle.v2.inference
-    :members: Inference
-    :noindex:
-
 .. autofunction:: paddle.v2.infer
-    :members:
-    :noindex:
-
-
+    :noindex:
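
As a usage sketch of the documented entry points: paddle.v2.trainer.SGD drives training, paddle.v2.event members are delivered to the event handler, and paddle.v2.infer runs inference. The constructor and call arguments below (cost, parameters, update_equation, reader, output_layer, input) are assumptions from the surrounding v2 API, not shown in this diff; cost, parameters, optimizer, train_reader, predict and test_data are placeholders for a concrete model configuration.

    import paddle.v2 as paddle

    # Placeholders: a concrete model would define these via paddle.v2 layers,
    # paddle.parameters.create(cost) and a paddle.optimizer instance.
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer)

    def event_handler(event):
        # Training emits paddle.v2.event objects; react to end-of-pass events.
        if isinstance(event, paddle.event.EndPass):
            print("Pass %d finished" % event.pass_id)

    trainer.train(reader=train_reader,
                  event_handler=event_handler,
                  num_passes=10)

    # Inference goes through the free function rather than an Inference class:
    probs = paddle.infer(output_layer=predict,
                         parameters=parameters,
                         input=test_data)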

python/paddle/v2/data_feeder.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ class DataFeeder(DataProviderConverter):
         #   [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
         #   [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
         # ]
-        arg = feeder(minibatch_data)
+        arg = feeder.convert(minibatch_data)

     .. note::
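
To make the corrected call concrete, a minimal sketch around DataFeeder.convert; the data_types and reader_dict constructor arguments are assumptions based on the surrounding docstring, not part of this change:

    import paddle.v2 as paddle
    from paddle.v2.data_feeder import DataFeeder

    # Three slots per sample: a dense vector of size 4, an integer label and an
    # integer sequence (assumed types, matching the sample layout above).
    data_types = [('a', paddle.data_type.dense_vector(4)),
                  ('b', paddle.data_type.integer_value(10)),
                  ('c', paddle.data_type.integer_value_sequence(10))]
    feeder = DataFeeder(data_types=data_types,
                        reader_dict={'a': 0, 'b': 1, 'c': 2})

    minibatch_data = [
        [[1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]],  # first sample
        [[1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]],  # second sample
    ]
    arg = feeder.convert(minibatch_data)  # arguments consumable by the core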

python/paddle/v2/dataset/cifar.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 CIFAR dataset.

 This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and
-parse train set and test set into paddle reader creators.
+parse train/test set into paddle reader creators.

 The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000
 images per class. There are 50000 training images and 10000 test images.
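
A minimal sketch of consuming the resulting reader creators; the train10/test10 creator names are assumptions, since this diff only touches the module docstring:

    import paddle.v2.dataset.cifar as cifar

    # train10() is assumed to return a reader creator for CIFAR-10.
    for i, (image, label) in enumerate(cifar.train10()()):
        # image: flattened 3x32x32 pixel values; label: integer in [0, 9]
        print(len(image), label)
        if i == 2:
            break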

python/paddle/v2/dataset/conll05.py

Lines changed: 21 additions & 6 deletions
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Conll 2005 dataset. Paddle semantic role labeling Book and demo use this
-dataset as an example. Because Conll 2005 is not free in public, the default
-downloaded URL is test set of Conll 2005 (which is public). Users can change
-URL and MD5 to their Conll dataset.
-
-TODO(yuyang18): Complete comments.
+Conll05 dataset.
+Paddle's semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not freely available to the public, the default download URL
+points to the Conll05 test set (which is public). Users can change the URL and
+MD5 to use their own Conll dataset. A pre-trained Wikipedia word-vector model is used to initialize the SRL model.
 """

 import tarfile
@@ -180,17 +179,33 @@ def reader():


 def get_dict():
+    """
+    Get the word, verb and label dictionaries of the Wikipedia corpus.
+    """
     word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
     verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
     label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
     return word_dict, verb_dict, label_dict


 def get_embedding():
+    """
+    Get the word vectors trained on the Wikipedia corpus.
+    """
     return download(EMB_URL, 'conll05st', EMB_MD5)


 def test():
+    """
+    Conll05 test set creator.
+
+    Because the train dataset is not free, the test dataset is used for
+    training. It returns a reader creator; each sample in the reader holds
+    nine features: sentence sequence, predicate, predicate context, predicate
+    context flag and tagged sequence.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     word_dict, verb_dict, label_dict = get_dict()
     reader = corpus_reader(
         download(DATA_URL, 'conll05st', DATA_MD5),
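
A minimal sketch of the three documented functions; it assumes the usual v2 pattern in which the creator returned by test() is called to obtain the reader:

    import paddle.v2.dataset.conll05 as conll05

    word_dict, verb_dict, label_dict = conll05.get_dict()
    emb_path = conll05.get_embedding()  # path of the downloaded embedding file

    for sample in conll05.test()():
        # sample: nine features (sentence sequence, predicate, predicate
        # contexts, context flags and the tagged sequence)
        print(len(sample))
        break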

python/paddle/v2/dataset/imdb.py

Lines changed: 41 additions & 3 deletions
@@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+IMDB dataset.

-TODO(yuyang18): Complete comments.
+This module downloads the IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of 25,000
+highly polar movie reviews for training, and 25,000 for testing. Besides, this
+module also provides an API for building a dictionary and parsing the
+train/test sets into paddle reader creators.
 """

 import paddle.v2.dataset.common
@@ -30,8 +34,11 @@
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'


-# Read files that match pattern. Tokenize and yield each file.
 def tokenize(pattern):
+    """
+    Read files that match the pattern. Tokenize and yield each file.
+    """
+
     with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
                                                         MD5)) as tarf:
         # Note that we should use tarfile.next(), which does
@@ -48,6 +55,9 @@ def tokenize(pattern):


 def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary; the key is the word and the value is its index.
+    """
     word_freq = {}
     for doc in tokenize(pattern):
         for word in doc:
@@ -109,18 +119,46 @@ def reader():


 def train(word_idx):
+    """
+    IMDB train set creator.
+
+    It returns a reader creator; each sample in the reader is an index
+    sequence and a label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Train reader creator
+    :rtype: callable
+    """
     return reader_creator(
         re.compile("aclImdb/train/pos/.*\.txt$"),
         re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)


 def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator; each sample in the reader is an index
+    sequence and a label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator(
         re.compile("aclImdb/test/pos/.*\.txt$"),
         re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)


 def word_dict():
+    """
+    Build the word dictionary.
+
+    :return: Word dictionary
+    :rtype: dict
+    """
     return build_dict(
         re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
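
A minimal sketch tying the documented functions together, using only the API named in this diff:

    import paddle.v2.dataset.imdb as imdb

    word_idx = imdb.word_dict()          # dict: word -> index
    train_reader = imdb.train(word_idx)  # reader creator

    for seq, label in train_reader():
        # seq: index sequence; label: 0 (negative) or 1 (positive)
        print(len(seq), label)
        break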

python/paddle/v2/dataset/imikolov.py

Lines changed: 32 additions & 2 deletions
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
+imikolov's simple dataset.

-Complete comments.
+This module will download the dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/
+and parse the train/test set into paddle reader creators.
 """
 import paddle.v2.dataset.common
 import tarfile
@@ -40,6 +41,9 @@ def word_count(f, word_freq=None):


 def build_dict():
+    """
+    Build a word dictionary; the key is the word and the value is its index.
+    """
     train_filename = './simple-examples/data/ptb.train.txt'
     test_filename = './simple-examples/data/ptb.valid.txt'
     with tarfile.open(
@@ -84,10 +88,36 @@ def reader():


 def train(word_idx, n):
+    """
+    imikolov train set creator.
+
+    It returns a reader creator; each sample in the reader is an index
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size
+    :type n: int
+    :return: Train reader creator
+    :rtype: callable
+    """
     return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)


 def test(word_idx, n):
+    """
+    imikolov test set creator.
+
+    It returns a reader creator; each sample in the reader is an index
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size
+    :type n: int
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
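
A minimal sketch of the sliding-window readers documented above, using only the API named in this diff:

    import paddle.v2.dataset.imikolov as imikolov

    word_idx = imikolov.build_dict()  # dict: word -> index

    # n = 5: each sample is a tuple of 5 consecutive word indices.
    for sample in imikolov.train(word_idx, 5)():
        print(sample)
        break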

python/paddle/v2/dataset/mnist.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 MNIST dataset.

 This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
-parse train set and test set into paddle reader creators.
+parse train/test set into paddle reader creators.
 """
 import paddle.v2.dataset.common
 import subprocess
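
A minimal sketch of the resulting readers; the train()/test() creator names are assumptions, since this diff only touches the docstring:

    import paddle.v2.dataset.mnist as mnist

    # train() is assumed to return a reader creator for the train set.
    for image, label in mnist.train()():
        # image: flattened 28x28 pixel values; label: digit in [0, 9]
        print(len(image), label)
        break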
