Commit e329655

Merge pull request #1719 from QiJune/feature/add_v2_api_doc

Add more docs for paddle v2 api

2 parents 7384966 + 398d50d
17 files changed: +419 −40 lines

doc/api/v2/config/optimizer.rst

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-.. _api_v2.optimizer:
-
 ==========
 Optimizer
 ==========

doc/api/v2/data.rst

Lines changed: 11 additions & 6 deletions
@@ -1,6 +1,6 @@
-========
-Datasets
-========
+==================================
+Data Reader Interface and DataSets
+==================================
 
 
 DataTypes
@@ -49,7 +49,6 @@ mnist
    :members:
    :noindex:
 
-
 cifar
 +++++
 
@@ -61,7 +60,7 @@ conll05
 +++++++
 
 .. automodule:: paddle.v2.dataset.conll05
-   :members:
+   :members: get_dict,get_embedding,test
    :noindex:
 
 imdb
@@ -85,6 +84,12 @@ movielens
    :members:
    :noindex:
 
+.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
+   :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.UserInfo
+   :noindex:
+
 sentiment
 +++++++++
 
@@ -102,7 +107,7 @@ uci_housing
 wmt14
 +++++
 
-.. automodule:: paddle.v2.dataset.uci_housing
+.. automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:
 
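All of the dataset modules listed on this retitled page follow the same reader interface: a reader creator is a function that returns a reader, and a reader is a function that yields one sample per iteration. A minimal illustrative sketch of that convention (not code from this commit):

    def reader_creator(samples):
        def reader():
            for sample in samples:
                yield sample
        return reader

    train = reader_creator([([0.1, 0.2], 0), ([0.3, 0.4], 1)])
    for features, label in train():  # call the creator to get a fresh iterator
        pass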

doc/api/v2/run_logic.rst

Lines changed: 4 additions & 0 deletions
@@ -6,22 +6,26 @@ Parameters
 ==========
 
 .. automodule:: paddle.v2.parameters
+   :members: Parameters
    :noindex:
 
 Trainer
 =======
 
 .. automodule:: paddle.v2.trainer
+   :members: SGD
    :noindex:
 
 Event
 =====
 
 .. automodule:: paddle.v2.event
+   :members:
    :noindex:
 
 Inference
 =========
 
 .. autofunction:: paddle.v2.infer
    :noindex:
+
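The members documented here compose into the usual v2 training loop. A sketch assuming a cost topology `cost` and a sample-level `train_reader` are defined elsewhere (neither is part of this commit):

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)
    parameters = paddle.parameters.create(cost)  # Parameters built from the topology
    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Momentum(momentum=0.9))

    def event_handler(event):
        # Event objects report training progress.
        if isinstance(event, paddle.event.EndIteration):
            print "pass %d, batch %d, cost %f" % (
                event.pass_id, event.batch_id, event.cost)

    trainer.train(reader=paddle.batch(train_reader, batch_size=128),
                  event_handler=event_handler,
                  num_passes=10)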

python/paddle/v2/data_feeder.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ class DataFeeder(DataProviderConverter):
 #   [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
 #   [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
 # ]
-arg = feeder(minibatch_data)
+arg = feeder.convert(minibatch_data)
 
 .. note::
 
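For context, DataFeeder converts a minibatch of Python-native samples into Paddle Arguments; the corrected method name is convert. A hedged sketch, in which the data_types layout and the index mapping are assumptions chosen to match the minibatch shown in the docstring above:

    import paddle.v2 as paddle
    from paddle.v2.data_feeder import DataFeeder

    # One dense slot, one integer label, one sparse slot, matching
    # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] above (names are hypothetical).
    data_types = [('image', paddle.data_type.dense_vector(4)),
                  ('label', paddle.data_type.integer_value(10)),
                  ('ids', paddle.data_type.sparse_binary_vector(10))]
    feeder = DataFeeder(data_types, {'image': 0, 'label': 1, 'ids': 2})

    minibatch_data = [[[1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]],
                      [[1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]]]
    arg = feeder.convert(minibatch_data)  # the fixed call: convert, not feeder(...)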

python/paddle/v2/dataset/cifar.py

Lines changed: 49 additions & 2 deletions
@@ -12,9 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html
+CIFAR dataset.
+
+This module will download the dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse the train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
 
-TODO(yuyang18): Complete the comments.
 """
 
 import cPickle
@@ -54,20 +65,56 @@ def reader():
 
 
 def train100():
+    """
+    CIFAR-100 training set creator.
+
+    It returns a reader creator; each sample in the reader is image pixels in
+    [0, 1] and a label in [0, 99].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
 
 
 def test100():
+    """
+    CIFAR-100 test set creator.
+
+    It returns a reader creator; each sample in the reader is image pixels in
+    [0, 1] and a label in [0, 99].
+
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
 
 
 def train10():
+    """
+    CIFAR-10 training set creator.
+
+    It returns a reader creator; each sample in the reader is image pixels in
+    [0, 1] and a label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
 
 
 def test10():
+    """
+    CIFAR-10 test set creator.
+
+    It returns a reader creator; each sample in the reader is image pixels in
+    [0, 1] and a label in [0, 9].
+
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch')
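A usage sketch for the documented creators; the (pixels, label) unpacking mirrors what the docstrings state:

    import paddle.v2.dataset.cifar as cifar

    reader = cifar.train10()  # creator -> reader
    for i, (pixels, label) in enumerate(reader()):
        # pixels are floats in [0, 1]; CIFAR-10 labels are in [0, 9]
        assert 0 <= label <= 9
        if i == 2:
            break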

python/paddle/v2/dataset/conll05.py

Lines changed: 25 additions & 8 deletions
@@ -11,19 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Conll05 dataset.
+Paddle's semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not freely available, the default download URL is the test
+set of Conll05 (which is publicly available). Users can change the URL and MD5
+to their own Conll dataset. A pre-trained word-vector model based on the
+Wikipedia corpus is used to initialize the SRL model.
+"""
 
 import tarfile
 import gzip
 import itertools
 from common import download
-"""
-Conll 2005 dataset. Paddle semantic role labeling Book and demo use this
-dataset as an example. Because Conll 2005 is not free in public, the default
-downloaded URL is test set of Conll 2005 (which is public). Users can change
-URL and MD5 to their Conll dataset.
-
-TODO(yuyang18): Complete comments.
-"""
 
 __all__ = ['test', 'get_dict', 'get_embedding']
 
@@ -179,17 +179,34 @@ def reader():
 
 
 def get_dict():
+    """
+    Get the word, verb and label dictionaries of the Wikipedia corpus.
+    """
     word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
     verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
     label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
     return word_dict, verb_dict, label_dict
 
 
 def get_embedding():
+    """
+    Get the trained word vectors based on the Wikipedia corpus.
+    """
     return download(EMB_URL, 'conll05st', EMB_MD5)
 
 
 def test():
+    """
+    Conll05 test set creator.
+
+    Because the training dataset is not free, the test dataset is used for
+    training. It returns a reader creator; each sample in the reader is nine
+    features, including sentence sequence, predicate, predicate context,
+    predicate context flag and tagged sequence.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     word_dict, verb_dict, label_dict = get_dict()
     reader = corpus_reader(
         download(DATA_URL, 'conll05st', DATA_MD5),
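The helpers above are typically used together; a sketch based only on the documented behavior:

    import paddle.v2.dataset.conll05 as conll05

    word_dict, verb_dict, label_dict = conll05.get_dict()
    emb_path = conll05.get_embedding()  # local path of the downloaded vectors

    # test() stands in for a training set because the real one is not free.
    for sample in conll05.test()():
        # nine features: sentence sequence, predicate, predicate context,
        # predicate context flag and tagged sequence
        break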

python/paddle/v2/dataset/imdb.py

Lines changed: 41 additions & 3 deletions
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+IMDB dataset.
 
-TODO(yuyang18): Complete comments.
+This module downloads the IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. The dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+Besides, this module also provides an API for building the word dictionary.
 """
 
 import paddle.v2.dataset.common
@@ -31,8 +34,11 @@
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
 
 
-# Read files that match pattern. Tokenize and yield each file.
 def tokenize(pattern):
+    """
+    Read files that match the given pattern. Tokenize and yield each file.
+    """
+
     with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
                                                         MD5)) as tarf:
         # Note that we should use tarfile.next(), which does
@@ -49,6 +55,10 @@ def tokenize(pattern):
 
 
 def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
     word_freq = collections.defaultdict(int)
     for doc in tokenize(pattern):
         for word in doc:
@@ -110,18 +120,46 @@ def reader():
 
 
 def train(word_idx):
+    """
+    IMDB training set creator.
+
+    It returns a reader creator; each sample in the reader is a zero-based ID
+    sequence and a label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator(
         re.compile("aclImdb/train/pos/.*\.txt$"),
         re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
 
 
 def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator; each sample in the reader is a zero-based ID
+    sequence and a label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator(
         re.compile("aclImdb/test/pos/.*\.txt$"),
         re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
 
 
 def word_dict():
+    """
+    Build a word dictionary from the corpus.
+
+    :return: Word dictionary
+    :rtype: dict
+    """
     return build_dict(
         re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
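A usage sketch tying the three documented functions together:

    import paddle.v2.dataset.imdb as imdb

    w = imdb.word_dict()  # built over train and test with a cutoff of 150
    for ids, label in imdb.train(w)():
        # ids is a zero-based word-ID sequence; label is 0 or 1
        break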

python/paddle/v2/dataset/imikolov.py

Lines changed: 34 additions & 2 deletions
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
+imikolov's simple dataset.
 
-Complete comments.
+This module will download the dataset from
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse the training set and test
+set into paddle reader creators.
 """
 import paddle.v2.dataset.common
 import collections
@@ -40,6 +42,10 @@ def word_count(f, word_freq=None):
 
 
 def build_dict():
+    """
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
     train_filename = './simple-examples/data/ptb.train.txt'
     test_filename = './simple-examples/data/ptb.valid.txt'
     with tarfile.open(
@@ -84,10 +90,36 @@ def reader():
 
 
 def train(word_idx, n):
+    """
+    imikolov training set creator.
+
+    It returns a reader creator; each sample in the reader is a word ID
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size
+    :type n: int
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)
 
 
 def test(word_idx, n):
+    """
+    imikolov test set creator.
+
+    It returns a reader creator; each sample in the reader is a word ID
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size
+    :type n: int
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
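A usage sketch for the documented creators; n is the sliding-window size, so each sample is an n-word ID tuple:

    import paddle.v2.dataset.imikolov as imikolov

    word_idx = imikolov.build_dict()
    for ngram in imikolov.train(word_idx, 5)():
        assert len(ngram) == 5  # five word IDs per sample
        break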
