12
12
# See the License for the specific language governing permissions and
13
13
# limitations under the License.
14
14
"""
15
- IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
15
+ IMDB dataset.
16
16
17
- TODO(yuyang18): Complete comments.
17
+ This module downloads the IMDB dataset from
18
+ http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
19
+ of 25,000 highly polar movie reviews for training, and 25,000 for testing.
20
+ In addition, this module provides an API for building a word dictionary.
18
21
"""
19
22
20
23
import paddle .v2 .dataset .common
31
34
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
32
35
33
36
34
- # Read files that match pattern. Tokenize and yield each file.
35
37
def tokenize (pattern ):
38
+ """
39
+ Read files that match the given pattern. Tokenize and yield each file.
40
+ """
41
+
36
42
with tarfile .open (paddle .v2 .dataset .common .download (URL , 'imdb' ,
37
43
MD5 )) as tarf :
38
44
# Note that we should use tarfile.next(), which does
@@ -49,6 +55,10 @@ def tokenize(pattern):
49
55
50
56
51
57
def build_dict (pattern , cutoff ):
58
+ """
59
+ Build a word dictionary from the corpus. Keys of the dictionary are words,
60
+ and values are zero-based IDs of these words.
61
+ """
52
62
word_freq = collections .defaultdict (int )
53
63
for doc in tokenize (pattern ):
54
64
for word in doc :
@@ -110,18 +120,46 @@ def reader():
110
120
111
121
112
122
def train(word_idx):
    """
    IMDB training set creator.

    It returns a reader creator; each sample in the reader is a zero-based ID
    sequence and a label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    """
    # Raw string literals hand the backslash-dot to the regex engine verbatim,
    # instead of relying on Python passing an unrecognized string escape
    # through unchanged (which emits a warning on modern interpreters).
    return reader_creator(
        re.compile(r"aclImdb/train/pos/.*\.txt$"),
        re.compile(r"aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
116
137
117
138
118
139
def test(word_idx):
    """
    IMDB test set creator.

    It returns a reader creator; each sample in the reader is a zero-based ID
    sequence and a label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    """
    # Raw string literals hand the backslash-dot to the regex engine verbatim,
    # instead of relying on Python passing an unrecognized string escape
    # through unchanged (which emits a warning on modern interpreters).
    return reader_creator(
        re.compile(r"aclImdb/test/pos/.*\.txt$"),
        re.compile(r"aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
122
154
123
155
124
156
def word_dict():
    """
    Build a word dictionary from the corpus.

    The pattern selects the ``.txt`` members under both the train and test
    splits, in both the pos and neg directories; it is handed to
    ``build_dict`` together with a cutoff of 150.

    :return: Word dictionary
    :rtype: dict
    """
    # A raw string literal keeps the backslash-dot as a regex escape rather
    # than an (invalid) Python string escape.
    return build_dict(
        re.compile(r"aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
127
165
0 commit comments