@@ -44,20 +44,21 @@ def preprocess_data(self):
44
44
if self .opt .skip_preprocess :
45
45
return
46
46
iou = IoUtils ()
47
- if not self .opt .data_dir :
48
- self .opt .data_dir = tempfile .TemporaryDirectory ().name
47
+ if not self .opt .processed_data_dir :
48
+ self .opt .processed_data_dir = tempfile .TemporaryDirectory ().name
49
49
iou .convert_stream_to_h5 (self .opt .data_path , self .opt .word_min_count ,
50
- self .opt .data_dir )
50
+ self .opt .processed_data_dir )
51
51
52
52
def init_model (self ):
53
53
# load voca
54
- self .logger .info ("load key from %s" , pjoin (self .opt .data_dir , "keys.txt" ))
55
- with open (pjoin (self .opt .data_dir , "keys.txt" ), "rb" ) as fin :
54
+ data_dir = self .opt .processed_data_dir
55
+ self .logger .info ("load key from %s" , pjoin (data_dir , "keys.txt" ))
56
+ with open (pjoin (data_dir , "keys.txt" ), "rb" ) as fin :
56
57
self .words = [line .strip () for line in fin ]
57
58
self .num_words = len (self .words )
58
59
59
60
# count number of docs
60
- h5f = h5py .File (pjoin (self . opt . data_dir , "token.h5" ), "r" )
61
+ h5f = h5py .File (pjoin (data_dir , "token.h5" ), "r" )
61
62
self .num_docs = h5f ["indptr" ].shape [0 ] - 1
62
63
h5f .close ()
63
64
@@ -88,7 +89,7 @@ def init_model(self):
88
89
def train_model (self ):
89
90
self .preprocess_data ()
90
91
self .init_model ()
91
- h5f = h5py .File (pjoin (self .opt .data_dir , "token.h5" ), "r" )
92
+ h5f = h5py .File (pjoin (self .opt .processed_data_dir , "token.h5" ), "r" )
92
93
for epoch in range (1 , self .opt .epochs + 1 ):
93
94
self .logger .info ("Epoch %d / %d" , epoch , self .opt .epochs )
94
95
self ._train_e_step (h5f )
0 commit comments