follow comments

QiJune · QiJune · commit 1e29b124254a · 2017-04-13T16:57:40.000+08:00
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
@@ -14,14 +14,17 @@
 """
 CIFAR dataset.
 
-This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and
-parse train/test set into paddle reader creators.
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
 
-The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000
-images per class. There are 50000 training images and 10000 test images.
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
 
-The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes containing
-600 images each. There are 500 training images and 100 testing images per class.
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
 
 """
 
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 """
 Conll05 dataset.
-Paddle semantic role labeling Book and demo use this dataset as an example. Because
-Conll05 is not free in public, the default downloaded URL is test set of
-Conll05 (which is public). Users can change URL and MD5 to their Conll dataset.
-And a pre-trained word vector model based on Wikipedia corpus is used to initialize SRL model.
+Paddle semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not free in public, the default downloaded URL is test set
+of Conll05 (which is public). Users can change URL and MD5 to their Conll
+dataset. And a pre-trained word vector model based on Wikipedia corpus is used
+to initialize SRL model.
 """
 
 import tarfile
@@ -198,9 +199,10 @@ def test():
     """
     Conll05 test set creator.
 
-    Because the train dataset is not free, the test dataset is used for training.
-    It returns a reader creator, each sample in the reader is nine features, including sentence
-    sequence, predicate, predicate context, predicate context flag and tagged sequence.
+    Because the train dataset is not free, the test dataset is used for
+    training. It returns a reader creator, each sample in the reader is nine
+    features, including sentence sequence, predicate, predicate context,
+    predicate context flag and tagged sequence.
 
     :return: Train reader creator
     :rtype: callable
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
@@ -14,11 +14,10 @@
 """
 IMDB dataset.
 
-This module download IMDB dataset from
-http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of 25,000
-highly polar movie reviews for training, and 25,000 for testing. Besides, this
-module also provides API for build dictionary and parse train set and test set
-into paddle reader creators.
+This module downloads IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+Besides, this module also provides API for building dictionary.
 """
 
 import paddle.v2.dataset.common
@@ -37,7 +36,7 @@
 
 def tokenize(pattern):
     """
-    Read files that match pattern.  Tokenize and yield each file.
+    Read files that match the given pattern.  Tokenize and yield each file.
     """
 
     with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
@@ -57,7 +56,8 @@ def tokenize(pattern):
 
 def build_dict(pattern, cutoff):
     """
-    Build a word dictionary, the key is word, and the value is index.
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
     """
     word_freq = collections.defaultdict(int)
     for doc in tokenize(pattern):
@@ -123,7 +123,7 @@ def train(word_idx):
     """
     IMDB train set creator.
 
-    It returns a reader creator, each sample in the reader is an index
+    It returns a reader creator, each sample in the reader is a zero-based ID
     sequence and label in [0, 1].
 
     :param word_idx: word dictionary
@@ -140,7 +140,7 @@ def test(word_idx):
     """
     IMDB test set creator.
 
-    It returns a reader creator, each sample in the reader is an index
+    It returns a reader creator, each sample in the reader is a zero-based ID
     sequence and label in [0, 1].
 
     :param word_idx: word dictionary
@@ -155,7 +155,7 @@ def test(word_idx):
 
 def word_dict():
     """
-    Build word dictionary.
+    Build a word dictionary from the corpus.
 
     :return: Word dictionary
     :rtype: dict
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
@@ -14,8 +14,9 @@
 """
 imikolov's simple dataset.
 
-This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and
-parse train/test set into paddle reader creators.
+This module will download dataset from
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse train/test set into paddle
+reader creators.
 """
 import paddle.v2.dataset.common
 import collections
@@ -42,7 +43,8 @@ def word_count(f, word_freq=None):
 
 def build_dict():
     """
-    Build a word dictionary, the key is word, and the value is index.
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
     """
     train_filename = './simple-examples/data/ptb.train.txt'
     test_filename = './simple-examples/data/ptb.valid.txt'
@@ -91,7 +93,7 @@ def train(word_idx, n):
     """
     imikolov train set creator.
 
-    It returns a reader creator, each sample in the reader is an index
+    It returns a reader creator, each sample in the reader is a word ID
     tuple.
 
     :param word_idx: word dictionary
@@ -108,7 +110,7 @@ def test(word_idx, n):
     """
     imikolov test set creator.
 
-    It returns a reader creator, each sample in the reader is an index
+    It returns a reader creator, each sample in the reader is a word ID
     tuple.
 
     :param word_idx: word dictionary
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
@@ -14,10 +14,11 @@
 """
 Movielens 1-M dataset.
 
-Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 movies, which was
-collected by GroupLens Research. This module will download Movielens 1-M dataset from
-http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test set
-into paddle reader creators.
+Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
+movies, which was collected by GroupLens Research. This module will download
+Movielens 1-M dataset from
+http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test
+set into paddle reader creators.
 
 """
 
@@ -50,7 +51,7 @@ def __init__(self, index, categories, title):
 
     def value(self):
         """
-        Get information of a movie.
+        Get information from a movie.
         """
         return [
             self.index, [CATEGORIES_DICT[c] for c in self.categories],
@@ -78,7 +79,7 @@ def __init__(self, index, gender, age, job_id):
 
     def value(self):
         """
-        Get information of a user.
+        Get information from a user.
         """
         return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
 
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
@@ -75,8 +75,8 @@ def train():
     """
     UCI_HOUSING train set creator.
 
-    It returns a reader creator, each sample in the reader is features after normalization
-    and price number.
+    It returns a reader creator, each sample in the reader is features after
+    normalization and price number.
 
     :return: Train reader creator
     :rtype: callable
@@ -95,8 +95,8 @@ def test():
     """
     UCI_HOUSING test set creator.
 
-    It returns a reader creator, each sample in the reader is features after normalization
-    and price number.
+    It returns a reader creator, each sample in the reader is features after
+    normalization and price number.
 
     :return: Test reader creator
     :rtype: callable
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 """
 WMT14 dataset.
-The original WMT14 dataset is too large and a small set of data for set is provided.
-This module will download dataset from
+The original WMT14 dataset is too large, so a small subset of the data is
+provided. This module will download dataset from
 http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
 parse train/test set into paddle reader creators.
 
@@ -107,8 +107,9 @@ def train(dict_size):
     """
     WMT14 train set creator.
 
-    It returns a reader creator, each sample in the reader is source language word index
-    sequence, target language word index sequence and next word index sequence.
+    It returns a reader creator, each sample in the reader is source language
+    word ID sequence, target language word ID sequence and next word ID
+    sequence.
 
     :return: Train reader creator
     :rtype: callable
@@ -121,8 +122,9 @@ def test(dict_size):
     """
     WMT14 test set creator.
 
-    It returns a reader creator, each sample in the reader is source language word index
-    sequence, target language word index sequence and next word index sequence.
+    It returns a reader creator, each sample in the reader is source language
+    word ID sequence, target language word ID sequence and next word ID
+    sequence.
 
     :return: Train reader creator
     :rtype: callable
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
@@ -1,5 +1,5 @@
 """
-Trainer package
+Module Trainer
 """
 import collections