@@ -26,6 +26,7 @@ def get_tokenizer(tokenizer, lang=None):
         raise NotImplementedError
 
 
+
 def check_both_latin1(src_sentence: str, tgt_sentence: str) -> bool:
     """Check whether the sentence pair can all be encoded in latin1
 
@@ -48,6 +49,28 @@ def check_both_latin1(src_sentence: str, tgt_sentence: str) -> bool:
         return True
 
 
+def check_latin1(sentence: str) -> bool:
+    """Check whether the sentence can be encoded in latin1
+
+    This is used in
+    https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/scripts/filter_dataset.py
+
+    The idea is to filter out sentences that contain rare unicode glyphs
+
+    Returns
+    -------
+    ret
+        Whether the sentence can be encoded in latin1
+    """
+    try:
+        sentence.encode('latin1')
+    except UnicodeEncodeError:
+        return False
+    else:
+        return True
+
+
 def get_line_byte_start(corpus_path: str) -> np.ndarray:
     """Get the start position of each line in terms of bytes so that we can use seek + read to
     load an arbitrary line.
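
For reference, the new `check_latin1` helper can be sanity-checked in isolation; a minimal sketch (the sample strings are made up, not from the commit):

    # latin1 covers Western European glyphs only, so CJK text is filtered out.
    assert check_latin1('Ein Haus aus Holz')        # latin1-encodable, kept
    assert check_latin1('fa\u00e7ade')              # 'façade' also fits in latin1
    assert not check_latin1('\u65e5\u672c\u8a9e')   # CJK glyphs cannot be encoded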
@@ -240,7 +263,137 @@ def chunk_iterator(step=10):
     return filtered_line_count
 
 
-def get_parser():
+class MonoCorpusProcessor:
+    """Process sentences of a monolingual corpus.
+
+    This largely recovers the functionality of 'clean-corpus-n.perl' in mosesdecoder.
+    The difference is that it is customizable with pure python.
+
+    By default, we will perform the following pre-processing pipeline.
+    Each stage can be turned on/off and specialized based on the input arguments.
+    Also, you may directly revise the code and write your own processing script.
+
+    1. Normalize the sentence
+    2. Pre-filter (e.g., discard sentences that cannot be encoded in latin1)
+    3. Tokenize the sentence
+    4. Filter the sentence based on different rules:
+       4.1 Remove empty sentences
+       4.2 Remove sentences where not `min_num_words <= len(tokens) <= max_num_words`
+    """
+    def __init__(self, lang: str,
+                 normalize: bool = True,
+                 tokenizer: Union[str, BaseTokenizer] = 'whitespace',
+                 min_num_words: Optional[int] = None,
+                 max_num_words: Optional[int] = None,
+                 discard_non_latin1: bool = False):
+        self._lang = lang
+        # Keep a None normalizer when normalization is disabled so that
+        # process_chunk can skip it instead of raising AttributeError.
+        self._normalizer = MosesNormalizer(lang=lang) if normalize else None
+        self._tokenizer = get_tokenizer(tokenizer, lang)
+        self._min_num_words = min_num_words
+        self._max_num_words = max_num_words
+        self._discard_non_latin1 = discard_non_latin1
+
+    def process_chunk(self, args):
+        path, chunk_start, chunk_size = args
+        processed_lines = []
+        with open(path, 'rb') as in_f:
+            # Read the whole chunk and split it into lines
+            in_f.seek(chunk_start)
+            lines = in_f.read(chunk_size)
+            lines = lines.splitlines()
+            unfiltered_line_num = len(lines)
+            for line in lines:
+                line = line.decode('utf-8').strip()
+                # 1. Normalize
+                if self._normalizer is not None:
+                    line = self._normalizer(line)
+                # 2. Filter after normalization.
+                if self._discard_non_latin1 and not check_latin1(line):
+                    continue
+                # 3. Tokenize the sentence
+                tokens = self._tokenizer.encode(line)
+                # 4. Filter after tokenization with multiple rules
+                if len(tokens) == 0:
+                    continue
+                if self._max_num_words is not None and len(tokens) > self._max_num_words:
+                    continue
+                if self._min_num_words is not None and len(tokens) < self._min_num_words:
+                    continue
+                processed_lines.append(' '.join(tokens))
+        return processed_lines, unfiltered_line_num
+
+    def process_mono_corpus(self,
+                            corpus_paths: List[str],
+                            out_path: str,
+                            chunk_size: int = 1024 * 1024,
+                            num_process: int = 8) -> int:
+        """Preprocess the mono corpus
+
+        Parameters
+        ----------
+        corpus_paths
+            Corpus paths
+        out_path
+            Write the results to the output path
+        chunk_size
+            Approximate size in bytes of the chunks that the corpus files are split into
+        num_process
+            The number of processes
+
+        Returns
+        -------
+        line_count
+            The number of lines in the final filtered file
+        """
+        start = time.time()
+        total_line_count = 0
+        filtered_line_count = 0
+
+        def chunk_iterator(step=10):
+            for path in corpus_paths:
+                line_pos = get_line_byte_start(path)
+                line_size = line_pos[1:] - line_pos[:-1]
+                num_lines = line_pos.shape[0] - 1
+                budget = chunk_size
+                chunk_start = 0
+                cur_chunk_size = 0
+                for i in range(0, num_lines, step):
+                    line_batch_num = min(num_lines - i, step)
+                    batch_line_size = line_size[i:(i + line_batch_num)].sum()
+                    budget -= batch_line_size
+                    cur_chunk_size += batch_line_size
+                    if budget <= 0 or i + step >= num_lines:
+                        yield path, chunk_start, cur_chunk_size
+                        chunk_start += cur_chunk_size
+                        cur_chunk_size = 0
+                        budget = chunk_size
+
+        with open(out_path, 'w', encoding='utf-8', newline='\n') as out_f:
+            with multiprocessing.Pool(num_process) as pool:
+                for i, (processed_lines, unfiltered_line_num) in \
+                        enumerate(pool.imap(self.process_chunk, chunk_iterator())):
+                    out_f.write('\n'.join(processed_lines) + '\n')
+                    filtered_line_count += len(processed_lines)
+                    total_line_count += unfiltered_line_num
+                    if (i + 1) % 100 == 0:
+                        print('Chunk {}, #Lines Processed: {}, Filtered: {}, Remain: {}'
+                              .format(i + 1, total_line_count,
+                                      total_line_count - filtered_line_count,
+                                      filtered_line_count))
+        end = time.time()
+        print('Done, #Lines {}/{}, Time spent {}'.format(filtered_line_count,
+                                                         total_line_count,
+                                                         end - start))
+        return filtered_line_count
+
+
+def get_parser_para():
     parser = argparse.ArgumentParser(
         description='Clean parallel corpus used in machine translation.')
     parser.add_argument('--src-corpus', type=str, nargs='+', required=True)
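
The nested `chunk_iterator` above scans `step` lines at a time and yields a `(path, byte_offset, size)` tuple whenever the byte budget is exhausted. A standalone toy run of that accounting, with made-up line sizes (not from the commit):

    import numpy as np

    line_size = np.array([100, 200, 150, 50, 400])  # bytes per line of a fake corpus
    chunk_size, step = 400, 2                       # byte budget, lines scanned per step
    budget, chunk_start, cur = chunk_size, 0, 0
    for i in range(0, len(line_size), step):
        batch = line_size[i:i + step].sum()
        budget -= batch
        cur += batch
        if budget <= 0 or i + step >= len(line_size):
            print('chunk at byte', chunk_start, 'of size', cur)
            chunk_start += cur
            cur, budget = 0, chunk_size
    # -> chunk at byte 0 of size 500
    # -> chunk at byte 500 of size 400

Chunks therefore start and end on line boundaries, so `process_chunk` can safely decode each chunk independently in a worker process.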
@@ -268,7 +421,30 @@ def get_parser():
     return parser
 
 
-def main(args):
+def get_parser_mono():
+    parser = argparse.ArgumentParser(
+        description='Clean mono corpus used in machine translation.')
+    parser.add_argument('--corpus', type=str, nargs='+', required=True)
+    parser.add_argument('--lang', type=str, required=True)
+    parser.add_argument('--save-path', type=str, default=None,
+                        help='Path to save the cleaned and tokenized corpus. If not set, '
+                             'the default is "corpus.tok.{lang}"')
+    parser.add_argument('--tokenizer', type=str, default='moses')
+    parser.add_argument('--min-num-words', type=int, default=None)
+    parser.add_argument('--max-num-words', type=int, default=None)
+    parser.add_argument('--discard-non-latin1', action='store_true',
+                        help='Whether to discard the sentence if it cannot be '
+                             'encoded in latin1.')
+    parser.add_argument('--num-process', type=int, default=os.cpu_count(),
+                        help='The number of processes')
+    parser.add_argument('--overwrite', action='store_true')
+    return parser
+
+
+def main_para(args):
     src_lang, tgt_lang = args.src_lang, args.tgt_lang
     corpus_processor = ParallelCorpusProcessor(src_lang=src_lang,
                                                tgt_lang=tgt_lang,
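
A quick programmatic check of the mono parser added above; a sketch only (the corpus file name is made up):

    parser = get_parser_mono()
    args = parser.parse_args(['--corpus', 'news.2019.en',
                              '--lang', 'en',
                              '--max-num-words', '250',
                              '--discard-non-latin1'])
    print(args.tokenizer)   # 'moses' (the default)
    print(args.save_path)   # None -> main_mono falls back to 'corpus.tok.en'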
@@ -303,11 +479,43 @@ def main(args):
                                                num_process=args.num_process)
 
 
+def main_mono(args):
+    corpus_processor = MonoCorpusProcessor(lang=args.lang,
+                                           tokenizer=args.tokenizer,
+                                           min_num_words=args.min_num_words,
+                                           max_num_words=args.max_num_words,
+                                           discard_non_latin1=args.discard_non_latin1)
+    print('Clean the mono corpus:')
+    print('   {}: {}'.format(args.lang, args.corpus))
+    if args.save_path is None:
+        save_path = 'corpus.tok.{}'.format(args.lang)
+    else:
+        save_path = args.save_path
+    print('Save to {} -> {}\n'.format(args.lang, save_path))
+    if os.path.exists(save_path) and not args.overwrite:
+        warnings.warn('{} exists, skip. If you need to overwrite this file, '
+                      'rerun the script with --overwrite.'.format(save_path))
+    else:
+        corpus_processor.process_mono_corpus(
+            corpus_paths=args.corpus,
+            out_path=save_path,
+            num_process=args.num_process)
+
+
 def cli_main():
-    parser = get_parser()
-    args = parser.parse_args()
-    main(args)
+    # Try the parallel-corpus flags first. argparse raises SystemExit when the
+    # required parallel flags such as --src-corpus are missing, in which case we
+    # fall back to the mono-corpus parser. Catching SystemExit explicitly (instead
+    # of a bare except) avoids masking real errors raised inside main_para.
+    try:
+        parser_para = get_parser_para()
+        args_para = parser_para.parse_args()
+        main_para(args_para)
+    except SystemExit:
+        parser_mono = get_parser_mono()
+        args_mono = parser_mono.parse_args()
+        main_mono(args_mono)
 
 
 if __name__ == '__main__':
     cli_main()
+
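
To illustrate the fallback dispatch, a sketch with made-up argument values and script name (note the para parser still prints its usage error to stderr before the mono parser takes over):

    import sys

    sys.argv = ['preprocess.py', '--corpus', 'news.2019.en',
                '--lang', 'en', '--tokenizer', 'moses']
    cli_main()   # parser_para exits over the missing parallel flags (SystemExit),
                 # so the except branch re-parses the same argv as a mono job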