Skip to content

Commit 7275e0a

Browse files
committed
In response to comments from Helin
1 parent a2cec42 commit 7275e0a

File tree

2 files changed

+12
-11
lines changed

2 files changed

+12
-11
lines changed

python/paddle/v2/dataset/common.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,3 +32,10 @@ def download(url, module_name, md5sum):
3232
shutil.copyfileobj(r.raw, f)
3333

3434
return filename
35+
36+
37+
def dict_add(a_dict, ele):
    """Increment the occurrence count of ``ele`` in ``a_dict`` in place.

    If ``ele`` is not yet a key of ``a_dict`` it is inserted with a count
    of 1; otherwise its existing count is increased by 1.

    :param a_dict: mutable mapping from element to its integer count.
    :param ele: hashable element whose count should be incremented.
    :return: None. ``a_dict`` is modified in place.
    """
    # A missing key counts as zero, so one expression covers both the
    # "first occurrence" and the "seen before" cases.
    a_dict[ele] = a_dict.get(ele, 0) + 1

python/paddle/v2/dataset/imikolov.py

Lines changed: 5 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,8 @@
1010
MD5 = '30177ea32e27c525793142b6bf2c8e2d'
1111

1212

13-
def add(a_dict, ele):
14-
if ele in a_dict:
15-
a_dict[ele] += 1
16-
else:
17-
a_dict[ele] = 1
18-
19-
2013
def word_count(f, word_freq=None):
14+
add = paddle.v2.dataset.common.dict_add
2115
if word_freq == None:
2216
word_freq = {}
2317

@@ -45,7 +39,7 @@ def build_dict(train_filename, test_filename):
4539
dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
4640
words, _ = list(zip(*dictionary))
4741
word_idx = dict(zip(words, xrange(len(words))))
48-
word_idx['<any>'] = len(words)
42+
word_idx['<unk>'] = len(words)
4943

5044
return word_idx
5145

@@ -66,13 +60,13 @@ def reader():
6660
paddle.v2.dataset.imikolov.MD5)) as tf:
6761
f = tf.extractfile(filename)
6862

69-
ANY = word_idx['<any>']
63+
UNK = word_idx['<unk>']
7064
for l in f:
7165
l = ['<s>'] + l.strip().split() + ['<e>']
7266
if len(l) >= n:
73-
l = [word_idx.get(w, ANY) for w in l]
67+
l = [word_idx.get(w, UNK) for w in l]
7468
for i in range(n, len(l) + 1):
75-
yield l[i - n:i]
69+
yield tuple(l[i - n:i])
7670

7771
return reader
7872

0 commit comments

Comments
 (0)