12
12
# See the License for the specific language governing permissions and
13
13
# limitations under the License.
14
14
"""
15
- IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
15
+ IMDB dataset.
16
16
17
- TODO(yuyang18): Complete comments.
17
+ This module downloads the IMDB dataset from
18
+ http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of 25,000
19
+ highly polar movie reviews for training, and 25,000 for testing. Besides, this
20
+ module also provides APIs for building a dictionary and parsing the train and test sets
21
+ into paddle reader creators.
18
22
"""
19
23
20
24
import paddle .v2 .dataset .common
30
34
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
31
35
32
36
33
- # Read files that match pattern. Tokenize and yield each file.
34
37
def tokenize (pattern ):
38
+ """
39
+ Read files that match pattern. Tokenize and yield each file.
40
+ """
41
+
35
42
with tarfile .open (paddle .v2 .dataset .common .download (URL , 'imdb' ,
36
43
MD5 )) as tarf :
37
44
# Note that we should use tarfile.next(), which does
@@ -48,6 +55,9 @@ def tokenize(pattern):
48
55
49
56
50
57
def build_dict (pattern , cutoff ):
58
+ """
59
+ Build a word dictionary in which each key is a word and each value is its index.
60
+ """
51
61
word_freq = {}
52
62
for doc in tokenize (pattern ):
53
63
for word in doc :
@@ -109,18 +119,46 @@ def reader():
109
119
110
120
111
121
def train(word_idx):
    """
    IMDB train set creator.

    It returns a reader creator. Each sample in the reader is an index
    sequence and a label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Train reader creator
    :rtype: callable
    """
    # Raw strings keep ``\.`` as a regex escape for a literal dot instead of
    # an invalid string escape sequence; the compiled pattern is unchanged.
    # NOTE(review): 1000 is forwarded as the last argument of reader_creator;
    # presumably a buffer size -- confirm against reader_creator.
    return reader_creator(
        re.compile(r"aclImdb/train/pos/.*\.txt$"),
        re.compile(r"aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
115
136
116
137
117
138
def test(word_idx):
    """
    IMDB test set creator.

    It returns a reader creator. Each sample in the reader is an index
    sequence and a label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    """
    # Raw strings keep ``\.`` as a regex escape for a literal dot instead of
    # an invalid string escape sequence; the compiled pattern is unchanged.
    # NOTE(review): 1000 is forwarded as the last argument of reader_creator;
    # presumably a buffer size -- confirm against reader_creator.
    return reader_creator(
        re.compile(r"aclImdb/test/pos/.*\.txt$"),
        re.compile(r"aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
121
153
122
154
123
155
def word_dict():
    """
    Build word dictionary.

    Calls ``build_dict`` with a pattern that matches every ``.txt`` member
    under the ``pos`` and ``neg`` directories of both the ``train`` and
    ``test`` splits, and with a ``cutoff`` of 150.

    :return: Word dictionary
    :rtype: dict
    """
    # Raw string keeps ``\.`` as a regex escape for a literal dot instead of
    # an invalid string escape sequence; the compiled pattern is unchanged.
    return build_dict(
        re.compile(r"aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
126
164
0 commit comments