1818import os
1919import csv
2020import re
21+ import io
2122import sys
2223if six .PY2 :
2324 reload (sys )
@@ -45,11 +46,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
4546 word_freq = collections .defaultdict (int )
4647 files = os .listdir (train_dir )
4748 for fi in files :
48- with open (os .path .join (train_dir , fi ), "r" , encoding = 'utf-8' ) as f :
49+ with io . open (os .path .join (train_dir , fi ), "r" , encoding = 'utf-8' ) as f :
4950 word_freq = word_count (column_num , f , word_freq )
5051 files = os .listdir (test_dir )
5152 for fi in files :
52- with open (os .path .join (test_dir , fi ), "r" , encoding = 'utf-8' ) as f :
53+ with io . open (os .path .join (test_dir , fi ), "r" , encoding = 'utf-8' ) as f :
5354 word_freq = word_count (column_num , f , word_freq )
5455
5556 word_freq = [x for x in six .iteritems (word_freq ) if x [1 ] > min_word_freq ]
@@ -65,51 +66,51 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
6566 if not os .path .exists (output_train_dir ):
6667 os .mkdir (output_train_dir )
6768 for fi in files :
68- with open (os .path .join (train_dir , fi ), "r" , encoding = 'utf-8' ) as f :
69- with open (
69+ with io . open (os .path .join (train_dir , fi ), "r" , encoding = 'utf-8' ) as f :
70+ with io . open (
7071 os .path .join (output_train_dir , fi ), "w" ,
7172 encoding = 'utf-8' ) as wf :
7273 data_file = csv .reader (f )
7374 for row in data_file :
7475 tag_raw = re .split (r'\W+' , row [0 ].strip ())
7576 pos_index = tag_idx .get (tag_raw [0 ])
76- wf .write (str (pos_index ) + "," )
77+ wf .write (u"{}," . format ( str (pos_index )) )
7778 text_raw = re .split (r'\W+' , row [2 ].strip ())
7879 l = [text_idx .get (w ) for w in text_raw ]
7980 for w in l :
80- wf .write (str (w ) + " " )
81- wf .write ("\n " )
81+ wf .write (u"{} " . format ( str (w )) )
82+ wf .write (u "\n " )
8283
8384 files = os .listdir (test_dir )
8485 if not os .path .exists (output_test_dir ):
8586 os .mkdir (output_test_dir )
8687 for fi in files :
87- with open (os .path .join (test_dir , fi ), "r" , encoding = 'utf-8' ) as f :
88- with open (
88+ with io . open (os .path .join (test_dir , fi ), "r" , encoding = 'utf-8' ) as f :
89+ with io . open (
8990 os .path .join (output_test_dir , fi ), "w" ,
9091 encoding = 'utf-8' ) as wf :
9192 data_file = csv .reader (f )
9293 for row in data_file :
9394 tag_raw = re .split (r'\W+' , row [0 ].strip ())
9495 pos_index = tag_idx .get (tag_raw [0 ])
95- wf .write (str (pos_index ) + "," )
96+ wf .write (u"{}," . format ( str (pos_index )) )
9697 text_raw = re .split (r'\W+' , row [2 ].strip ())
9798 l = [text_idx .get (w ) for w in text_raw ]
9899 for w in l :
99- wf .write (str (w ) + " " )
100- wf .write ("\n " )
100+ wf .write (u"{} " . format ( str (w )) )
101+ wf .write (u "\n " )
101102
102103
103104def text2paddle (train_dir , test_dir , output_train_dir , output_test_dir ,
104105 output_vocab_text , output_vocab_tag ):
105106 print ("start constuct word dict" )
106107 vocab_text = build_dict (2 , 0 , train_dir , test_dir )
107- with open (output_vocab_text , "w" , encoding = 'utf-8' ) as wf :
108- wf .write (str (len (vocab_text )) + " \n " )
108+ with io . open (output_vocab_text , "w" , encoding = 'utf-8' ) as wf :
109+ wf .write (u"{} \n " . format ( str (len (vocab_text ))) )
109110
110111 vocab_tag = build_dict (0 , 0 , train_dir , test_dir )
111- with open (output_vocab_tag , "w" , encoding = 'utf-8' ) as wf :
112- wf .write (str (len (vocab_tag )) + " \n " )
112+ with io . open (output_vocab_tag , "w" , encoding = 'utf-8' ) as wf :
113+ wf .write (u"{} \n " . format ( str (len (vocab_tag ))) )
113114
114115 print ("construct word dict done\n " )
115116 write_paddle (vocab_text , vocab_tag , train_dir , test_dir , output_train_dir ,
0 commit comments