
Commit 406f26a: support py2
1 parent 68cc383 commit 406f26a

models/contentunderstanding/tagspace/data/text2paddle.py

Lines changed: 17 additions & 16 deletions
@@ -18,6 +18,7 @@
 import os
 import csv
 import re
+import io
 import sys
 if six.PY2:
     reload(sys)
@@ -45,11 +46,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
     word_freq = collections.defaultdict(int)
     files = os.listdir(train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     files = os.listdir(test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)

     word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
@@ -65,51 +66,51 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
-            with open(
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+            with io.open(
                     os.path.join(output_train_dir, fi), "w",
                     encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
                     pos_index = tag_idx.get(tag_raw[0])
-                    wf.write(str(pos_index) + ",")
+                    wf.write(u"{},".format(str(pos_index)))
                     text_raw = re.split(r'\W+', row[2].strip())
                     l = [text_idx.get(w) for w in text_raw]
                     for w in l:
-                        wf.write(str(w) + " ")
-                    wf.write("\n")
+                        wf.write(u"{} ".format(str(w)))
+                    wf.write(u"\n")

     files = os.listdir(test_dir)
     if not os.path.exists(output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
-            with open(
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+            with io.open(
                     os.path.join(output_test_dir, fi), "w",
                     encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
                     pos_index = tag_idx.get(tag_raw[0])
-                    wf.write(str(pos_index) + ",")
+                    wf.write(u"{},".format(str(pos_index)))
                     text_raw = re.split(r'\W+', row[2].strip())
                     l = [text_idx.get(w) for w in text_raw]
                     for w in l:
-                        wf.write(str(w) + " ")
-                    wf.write("\n")
+                        wf.write(u"{} ".format(str(w)))
+                    wf.write(u"\n")


 def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
                 output_vocab_text, output_vocab_tag):
     print("start constuct word dict")
     vocab_text = build_dict(2, 0, train_dir, test_dir)
-    with open(output_vocab_text, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_text)) + "\n")
+    with io.open(output_vocab_text, "w", encoding='utf-8') as wf:
+        wf.write(u"{}\n".format(str(len(vocab_text))))

     vocab_tag = build_dict(0, 0, train_dir, test_dir)
-    with open(output_vocab_tag, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_tag)) + "\n")
+    with io.open(output_vocab_tag, "w", encoding='utf-8') as wf:
+        wf.write(u"{}\n".format(str(len(vocab_tag))))

     print("construct word dict done\n")
     write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
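
The idiom behind this change, as a minimal sketch: on Python 2 the builtin open() has no encoding parameter, while io.open() accepts one on both Python 2 and 3; in text mode io.open() expects unicode, so the writes switch from byte-string concatenation to u"..." format strings. The file name vocab.txt below is hypothetical, used only for illustration.

# Minimal sketch of the py2/py3-compatible I/O idiom adopted in this commit.
# "vocab.txt" is a hypothetical file name for illustration only.
import io

count = 123

# Builtin open("vocab.txt", "w", encoding='utf-8') raises TypeError on
# Python 2 because its open() takes no encoding keyword; io.open() accepts
# it on both interpreters.
with io.open("vocab.txt", "w", encoding='utf-8') as wf:
    # io.open() in text mode expects unicode, so write u"..." strings
    # instead of concatenating byte strings like str(count) + "\n".
    wf.write(u"{}\n".format(count))

with io.open("vocab.txt", "r", encoding='utf-8') as f:
    print(f.read())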
