This repository was archived by the owner on Jan 15, 2024. It is now read-only.

Commit 5d4bc9e

Fixed #1490 and #1520 (#1579)
1 parent 83780ab commit 5d4bc9e

File tree

3 files changed (+218 additions, -263 deletions)

scripts/processing/__main__.py

Lines changed: 5 additions & 6 deletions
@@ -2,8 +2,7 @@
 import textwrap
 
 from . import (
-    clean_tok_para_corpus,
-    clean_tok_mono_corpus,
+    clean_tok_corpus,
     learn_subword,
     apply_subword
 )
@@ -24,13 +23,13 @@ def cli_main():
                              'Choices are {}.'.format(SUBCOMMANDS))
     args, other_args = parser.parse_known_args()
     if args.command == 'clean_tok_para_corpus':
-        parser = clean_tok_para_corpus.get_parser()
+        parser = clean_tok_corpus.get_parser_para()
         sub_args = parser.parse_args(other_args)
-        clean_tok_para_corpus.main(sub_args)
+        clean_tok_corpus.main_para(sub_args)
     elif args.command == 'clean_tok_mono_corpus':
-        parser = clean_tok_mono_corpus.get_parser()
+        parser = clean_tok_corpus.get_parser_mono()
         sub_args = parser.parse_args(other_args)
-        clean_tok_mono_corpus.main(sub_args)
+        clean_tok_corpus.main_mono(sub_args)
     elif args.command == 'learn_subword':
         parser = learn_subword.get_parser()
         sub_args = parser.parse_args(other_args)
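
For reference, a minimal sketch of what the dispatch above now resolves to for the clean_tok_mono_corpus subcommand (the import path scripts.processing and the flag values are assumptions for illustration, not part of the commit):

from scripts.processing import clean_tok_corpus  # import path assumed

# Equivalent of: clean_tok_mono_corpus --corpus train.raw.en --lang en ...
parser = clean_tok_corpus.get_parser_mono()
sub_args = parser.parse_args(['--corpus', 'train.raw.en',   # illustrative input file
                              '--lang', 'en',
                              '--tokenizer', 'moses',
                              '--max-num-words', '250'])
clean_tok_corpus.main_mono(sub_args)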

scripts/processing/clean_tok_para_corpus.py renamed to scripts/processing/clean_tok_corpus.py

Lines changed: 213 additions & 5 deletions
@@ -26,6 +26,7 @@ def get_tokenizer(tokenizer, lang=None):
     raise NotImplementedError
 
 
+
 def check_both_latin1(src_sentence: str, tgt_sentence: str) -> bool:
     """Check whether the sentence pair can all be encoded in latin1
 
@@ -48,6 +49,28 @@ def check_both_latin1(src_sentence: str, tgt_sentence: str) -> bool:
     return True
 
 
+
+def check_latin1(sentence: str) -> bool:
+    """Check whether the sentence can be encoded in latin1
+
+    This is used in
+    https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/scripts/filter_dataset.py
+
+    The idea is to filter out sentences that contain rare unicode glyphs
+
+    Returns
+    -------
+    ret
+        Whether the sentence is latin1
+    """
+    try:
+        sentence.encode('latin1')
+    except UnicodeEncodeError:
+        return False
+    else:
+        return True
+
+
 def get_line_byte_start(corpus_path: str) -> np.ndarray:
     """Get the start position of each line in terms of bytes so that we can use seek + read to
     load an arbitrary line.
@@ -240,7 +263,137 @@ def chunk_iterator(step=10):
     return filtered_line_count
 
 
-def get_parser():
+
+
+class MonoCorpusProcessor:
+    """Process the sentences of a monolingual corpus.
+
+    This largely recovers the functionality of 'clean-corpus-n.perl' in mosesdecoder.
+    The difference is that it is customizable with pure python.
+
+    By default, we will perform the following pre-processing pipeline.
+    Each stage can be turned on/off and specialized based on the input arguments.
+    Also, you may directly revise the code and write your own processing script.
+
+    1. Normalize the sentence
+    2. Pre-filter
+    3. Tokenize the sentence
+    4. Filter the sentence based on different rules:
+        4.1 Remove sentences with no tokens after tokenization
+        4.2 Remove sentences where not
+            `min_num_words <= len(tokens) <= max_num_words`
+    """
+    def __init__(self, lang: str,
+                 normalize: bool = True,
+                 tokenizer: Union[str, BaseTokenizer] = 'whitespace',
+                 min_num_words: Optional[int] = None,
+                 max_num_words: Optional[int] = None,
+                 discard_non_latin1: bool = False):
+        self._lang = lang
+        if normalize:
+            self._normalizer = MosesNormalizer(lang=lang)
+        self._tokenizer = get_tokenizer(tokenizer, lang)
+        self._min_num_words = min_num_words
+        self._max_num_words = max_num_words
+        self._discard_non_latin1 = discard_non_latin1
+
+    def process_chunk(self, args):
+        path, chunk_start, chunk_size = args
+        processed_lines = []
+        with open(path, 'rb') as in_f:
+            # Read chunk
+            in_f.seek(chunk_start)
+            lines = in_f.read(chunk_size)
+            lines = lines.splitlines()
+            unfiltered_line_num = len(lines)
+            for line in lines:
+                line = line.decode('utf-8').strip()
+                # 1. Normalize
+                line = self._normalizer(line)
+                # 2. Filter after normalization.
+                if self._discard_non_latin1:
+                    if not check_latin1(line):
+                        continue
+                # 3. Tokenize the sentence
+                tokens = self._tokenizer.encode(line)
+                # 4. Filter after tokenization. Filter with multiple rules
+                if len(tokens) == 0:
+                    continue
+                if self._max_num_words is not None:
+                    if len(tokens) > self._max_num_words:
+                        continue
+                if self._min_num_words is not None:
+                    if len(tokens) < self._min_num_words:
+                        continue
+                processed_lines.append(' '.join(tokens))
+        return processed_lines, unfiltered_line_num
+
+    def process_mono_corpus(self,
+                            corpus_paths: List[str],
+                            out_path: str,
+                            chunk_size: int = 1024 * 1024,
+                            num_process: int = 8) -> int:
+        """Preprocess the mono corpus
+
+        Parameters
+        ----------
+        corpus_paths
+            Corpus paths
+        out_path
+            Write the results to the output path
+        chunk_size
+            Approximately split the corpus files into multiple chunks
+        num_process
+            The number of processes
+
+        Returns
+        -------
+        line_count
+            The number of lines in the final filtered file
+        """
+        start = time.time()
+        total_line_count = 0
+        filtered_line_count = 0
+
+        def chunk_iterator(step=10):
+            for path in corpus_paths:
+                line_pos = get_line_byte_start(path)
+                line_size = line_pos[1:] - line_pos[:-1]
+                num_lines = line_pos.shape[0] - 1
+                budget = chunk_size
+                chunk_start = 0
+                cur_chunk_size = 0
+                for i in range(0, num_lines, step):
+                    line_batch_num = min(num_lines - i, step)
+                    batch_line_size = line_size[i:(i + line_batch_num)].sum()
+                    budget -= batch_line_size
+                    cur_chunk_size += batch_line_size
+                    if budget <= 0 or i + step >= num_lines:
+                        yield path, chunk_start, cur_chunk_size
+                        chunk_start += cur_chunk_size
+                        cur_chunk_size = 0
+                        budget = chunk_size
+
+        with open(out_path, 'w', encoding='utf-8', newline='\n') as out_f:
+            with multiprocessing.Pool(num_process) as pool:
+                for i, (processed_lines, unfiltered_line_num) in \
+                        enumerate(pool.imap(self.process_chunk, chunk_iterator())):
+                    out_f.write('\n'.join(processed_lines) + '\n')
+                    filtered_line_count += len(processed_lines)
+                    total_line_count += unfiltered_line_num
+                    if (i + 1) % 100 == 0:
+                        print('Chunk {}, #Lines Processed: {}, Filtered: {}, Remain: {}'
+                              .format(i + 1, total_line_count,
+                                      total_line_count - filtered_line_count,
+                                      filtered_line_count))
+        end = time.time()
+        print('Done, #Lines {}/{}, Time spent {}'.format(filtered_line_count,
+                                                         total_line_count,
+                                                         end - start))
+        return filtered_line_count
+
+
+def get_parser_para():
     parser = argparse.ArgumentParser(
         description='Clean parallel corpus used in machine translation.')
     parser.add_argument('--src-corpus', type=str, nargs='+', required=True)
@@ -268,7 +421,30 @@ def get_parser():
     return parser
 
 
-def main(args):
+
+def get_parser_mono():
+    parser = argparse.ArgumentParser(
+        description='Clean mono corpus used in machine translation.')
+    parser.add_argument('--corpus', type=str, nargs='+', required=True)
+    parser.add_argument('--lang', type=str, required=True)
+    parser.add_argument('--save-path', type=str, default=None,
+                        help='Path to save the cleaned and tokenized corpus. If not set, '
+                             'the default is "corpus.tok.{lang}"')
+    parser.add_argument('--tokenizer', type=str, default='moses')
+    parser.add_argument('--min-num-words', type=int, default=None)
+    parser.add_argument('--max-num-words', type=int, default=None)
+    parser.add_argument('--discard-non-latin1', action='store_true',
+                        help='Whether to discard the sentence if it cannot be '
+                             'encoded into latin1.')
+    parser.add_argument('--num-process', type=int, default=os.cpu_count(),
+                        help='Number of processes')
+    parser.add_argument('--overwrite', action='store_true')
+
+    return parser
+
+
+
+def main_para(args):
     src_lang, tgt_lang = args.src_lang, args.tgt_lang
     corpus_processor = ParallelCorpusProcessor(src_lang=src_lang,
                                                tgt_lang=tgt_lang,
@@ -303,11 +479,43 @@ def main(args):
            num_process=args.num_process)
 
 
+
+def main_mono(args):
+    corpus_processor = MonoCorpusProcessor(lang=args.lang,
+                                           tokenizer=args.tokenizer,
+                                           min_num_words=args.min_num_words,
+                                           max_num_words=args.max_num_words,
+                                           discard_non_latin1=args.discard_non_latin1)
+    print('Clean the mono corpus:')
+    print('   {}: {}'.format(args.lang, args.corpus))
+    if args.save_path is None:
+        save_path = 'corpus.tok.{}'.format(args.lang)
+    else:
+        save_path = args.save_path
+    print('Save to {} -> {} \n'.format(args.lang, save_path))
+    if os.path.exists(save_path) and not args.overwrite:
+        warnings.warn('{} exists, skip. If you need to overwrite this file, '
+                      'rerun the script with --overwrite.'.format(save_path))
+    else:
+        corpus_processor.process_mono_corpus(
+            corpus_paths=args.corpus,
+            out_path=save_path,
+            num_process=args.num_process)
+
+
 def cli_main():
-    parser = get_parser()
-    args = parser.parse_args()
-    main(args)
+
+    try:
+        parser_para = get_parser_para()
+        args_para = parser_para.parse_args()
+        main_para(args_para)
+
+    except:
+        parser_mono = get_parser_mono()
+        args_mono = parser_mono.parse_args()
+        main_mono(args_mono)
 
 
 if __name__ == '__main__':
     cli_main()
+
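
Below is a minimal sketch of using the new MonoCorpusProcessor directly, bypassing the CLI (the class, method, and parameter names are taken from the diff above; the import path and file names are assumptions for illustration):

from scripts.processing.clean_tok_corpus import MonoCorpusProcessor  # import path assumed

processor = MonoCorpusProcessor(lang='en',
                                tokenizer='moses',
                                min_num_words=1,
                                max_num_words=250,
                                discard_non_latin1=True)
# Chunk the inputs, clean them in a multiprocessing pool, and write one tokenized
# output file; the return value is the number of lines kept after filtering.
kept = processor.process_mono_corpus(corpus_paths=['part0.en', 'part1.en'],
                                     out_path='corpus.tok.en',
                                     num_process=4)
print('Lines kept after filtering:', kept)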
