@@ -123,7 +123,6 @@ def load(self, offset=0, stride=1):
123123 with exfile_open (self .src , mode = 'rb' ) as fs ,\
124124 exfile_open (self .tgt , mode = 'rb' ) as ft ,\
125125 exfile_open (self .align , mode = 'rb' ) as fa :
126- logger .info (f"Loading { repr (self )} ..." )
127126 for i , (sline , tline , align ) in enumerate (zip (fs , ft , fa )):
128127 if (i % stride ) == offset :
129128 sline = sline .decode ('utf-8' )
@@ -136,7 +135,7 @@ def load(self, offset=0, stride=1):
136135 example ['align' ] = align .decode ('utf-8' )
137136 yield example
138137
139- def __repr__ (self ):
138+ def __str__ (self ):
140139 cls_name = type (self ).__name__
141140 return '{}({}, {}, align={})' .format (
142141 cls_name , self .src , self .tgt , self .align )
@@ -169,19 +168,17 @@ class ParallelCorpusIterator(object):
169168
170169 Args:
171170 corpus (ParallelCorpus): corpus to iterate;
172- transform (Transform): transforms to be applied to corpus;
173- infinitely (bool): True to iterate endlessly;
171+ transform (TransformPipe): transforms to be applied to corpus;
174172 skip_empty_level (str): security level when encouter empty line;
175173 stride (int): iterate corpus with this line stride;
176174 offset (int): iterate corpus with this line offset.
177175 """
178176
179- def __init__ (self , corpus , transform , infinitely = False ,
177+ def __init__ (self , corpus , transform ,
180178 skip_empty_level = 'warning' , stride = 1 , offset = 0 ):
181179 self .cid = corpus .id
182180 self .corpus = corpus
183181 self .transform = transform
184- self .infinitely = infinitely
185182 if skip_empty_level not in ['silent' , 'warning' , 'error' ]:
186183 raise ValueError (
187184 f"Invalid argument skip_empty_level={ skip_empty_level } " )
@@ -208,8 +205,11 @@ def _transform(self, stream):
208205 yield item
209206 report_msg = self .transform .stats ()
210207 if report_msg != '' :
211- logger .info ("Transform statistics for {}:\n {}" .format (
212- self .cid , report_msg ))
208+ logger .info (
209+ "* Transform statistics for {}({:.2f}%):\n {}\n " .format (
210+ self .cid , 100 / self .stride , report_msg
211+ )
212+ )
213213
214214 def _add_index (self , stream ):
215215 for i , item in enumerate (stream ):
@@ -227,24 +227,17 @@ def _add_index(self, stream):
227227 continue
228228 yield item
229229
230- def _iter_corpus (self ):
230+ def __iter__ (self ):
231231 corpus_stream = self .corpus .load (
232- stride = self .stride , offset = self .offset )
232+ stride = self .stride , offset = self .offset
233+ )
233234 tokenized_corpus = self ._tokenize (corpus_stream )
234235 transformed_corpus = self ._transform (tokenized_corpus )
235236 indexed_corpus = self ._add_index (transformed_corpus )
236237 yield from indexed_corpus
237238
238- def __iter__ (self ):
239- if self .infinitely :
240- while True :
241- _iter = self ._iter_corpus ()
242- yield from _iter
243- else :
244- yield from self ._iter_corpus ()
245-
246239
247- def build_corpora_iters (corpora , transforms , corpora_info , is_train = False ,
240+ def build_corpora_iters (corpora , transforms , corpora_info ,
248241 skip_empty_level = 'warning' , stride = 1 , offset = 0 ):
249242 """Return `ParallelCorpusIterator` for all corpora defined in opts."""
250243 corpora_iters = dict ()
@@ -256,7 +249,7 @@ def build_corpora_iters(corpora, transforms, corpora_info, is_train=False,
256249 transform_pipe = TransformPipe .build_from (corpus_transform )
257250 logger .info (f"{ c_id } 's transforms: { str (transform_pipe )} " )
258251 corpus_iter = ParallelCorpusIterator (
259- corpus , transform_pipe , infinitely = is_train ,
252+ corpus , transform_pipe ,
260253 skip_empty_level = skip_empty_level , stride = stride , offset = offset )
261254 corpora_iters [c_id ] = corpus_iter
262255 return corpora_iters
@@ -294,7 +287,7 @@ def build_sub_vocab(corpora, transforms, opts, n_sample, stride, offset):
294287 sub_counter_src = Counter ()
295288 sub_counter_tgt = Counter ()
296289 datasets_iterables = build_corpora_iters (
297- corpora , transforms , opts .data , is_train = False ,
290+ corpora , transforms , opts .data ,
298291 skip_empty_level = opts .skip_empty_level ,
299292 stride = stride , offset = offset )
300293 for c_name , c_iter in datasets_iterables .items ():
@@ -380,7 +373,7 @@ def save_transformed_sample(opts, transforms, n_sample=3):
380373
381374 corpora = get_corpora (opts , is_train = True )
382375 datasets_iterables = build_corpora_iters (
383- corpora , transforms , opts .data , is_train = False ,
376+ corpora , transforms , opts .data ,
384377 skip_empty_level = opts .skip_empty_level )
385378 sample_path = os .path .join (
386379 os .path .dirname (opts .save_data ), CorpusName .SAMPLE )
0 commit comments