@@ -423,33 +423,91 @@ def check_number_words(
423423 return cond
424424
425425 @staticmethod
426- def compute_repetitions_ratio (document , repetitions_length ):
427- def get_freq_ngrams (document , n ):
428- ngrams = [document [i : i + n ] for i in range (len (document ) - n + 1 )]
429- freq_ngrams = {}
430- for ngram in ngrams :
431- freq_ngrams [ngram ] = freq_ngrams .get (ngram , 0 ) + 1
432- return freq_ngrams
433-
434- freq_ngrams = get_freq_ngrams (document , repetitions_length )
435- if len (freq_ngrams ) == 0 :
def compute_character_repetition_ratio(document, character_repetition_length):
    """Return the fraction of character n-grams covered by the most repeated ones.

    Every substring of ``document`` of length ``character_repetition_length``
    is counted. The frequencies of the top ``k`` n-grams are summed and
    divided by the total number of n-grams, where ``k`` is the smaller of
    ``int(sqrt(number of distinct n-grams))`` and the number of distinct
    n-grams that occur more than once.

    Args:
        document: The text to analyse (any sliceable, hashable-slice sequence
            such as ``str``).
        character_repetition_length: Length ``n`` of the character n-grams.

    Returns:
        ``0`` when the document yields no n-gram at all, otherwise a float in
        ``[0, 1]``; a document without any repeated n-gram scores ``0.0``.
    """

    def get_freq_character_ngrams(document, n):
        """Count the occurrences of every length-``n`` substring."""
        freq_character_ngrams = {}
        for i in range(len(document) - n + 1):
            character_ngram = document[i : i + n]
            freq_character_ngrams[character_ngram] = (
                freq_character_ngrams.get(character_ngram, 0) + 1
            )
        return freq_character_ngrams

    freq_character_ngrams = get_freq_character_ngrams(
        document, character_repetition_length
    )
    if not freq_character_ngrams:
        return 0
    frequencies = sorted(freq_character_ngrams.values(), reverse=True)
    # Only n-grams seen more than once are repetitions. Capping by this count
    # (and not by the number of non-repeated n-grams, as before) keeps unique
    # n-grams out of the numerator and lets a fully repetitive document
    # reach a ratio of 1.
    num_repeated_ngrams = sum(1 for freq in frequencies if freq > 1)
    num_rep_character_ngrams = min(
        int(np.sqrt(len(frequencies))),
        num_repeated_ngrams,
    )
    character_repetition_ratio = sum(
        frequencies[:num_rep_character_ngrams]
    ) / sum(frequencies)
    return character_repetition_ratio
454+
455+ @staticmethod
def check_character_repetition_removal(
    document,
    character_repetition_length,
    character_repetition_max_cutoff,
):
    """Tell whether the character repetition ratio stays within the cutoff.

    The ratio is obtained from
    ``Filtering.compute_character_repetition_ratio`` for n-grams of length
    ``character_repetition_length``.

    Returns:
        True when the ratio is less than or equal to
        ``character_repetition_max_cutoff``, False otherwise.
    """
    ratio = Filtering.compute_character_repetition_ratio(
        document, character_repetition_length
    )
    return ratio <= character_repetition_max_cutoff
466+
467+ @staticmethod
def compute_word_repetition_ratio(
    document, sentencepiece_model_tok, strip_characters, word_repetition_length
):
    """Return the share of word n-gram occurrences that belong to repeated n-grams.

    The document is split into words with
    ``ModifyingDocuments.get_words_from_document`` (called with
    ``lower_case=True`` and the given ``strip_characters``). Consecutive
    windows of ``word_repetition_length`` words are joined with a single
    space and counted.

    Returns:
        ``0`` when no n-gram can be formed, otherwise the sum of the counts
        of n-grams seen more than once divided by the sum of all counts.
    """
    tokens = ModifyingDocuments.get_words_from_document(
        document,
        sentencepiece_model_tok,
        lower_case=True,
        strip_characters=strip_characters,
    )

    # Tally each window of `word_repetition_length` consecutive words.
    ngram_counts = {}
    for start in range(len(tokens) - word_repetition_length + 1):
        key = " ".join(tokens[start : start + word_repetition_length])
        ngram_counts[key] = ngram_counts.get(key, 0) + 1

    if len(ngram_counts) == 0:
        return 0

    counts = list(ngram_counts.values())
    repeated_total = sum(count for count in counts if count > 1)
    return repeated_total / sum(counts)
442498
443499 @staticmethod
444- def check_repetitions_removal (
def check_word_repetition_removal(
    document,
    sentencepiece_model_tok,
    strip_characters,
    word_repetition_length,
    word_repetition_max_cutoff,
):
    """Tell whether the word repetition ratio stays within the cutoff.

    The ratio is obtained from ``Filtering.compute_word_repetition_ratio``
    with the same document, tokenizer, strip characters and n-gram length.

    Returns:
        True when the ratio is less than or equal to
        ``word_repetition_max_cutoff``, False otherwise.
    """
    ratio = Filtering.compute_word_repetition_ratio(
        document, sentencepiece_model_tok, strip_characters, word_repetition_length
    )
    return ratio <= word_repetition_max_cutoff
454512
455513 @staticmethod
@@ -670,9 +728,12 @@ def filtering(
670728 strip_characters ,
671729 number_words_min_cutoff ,
672730 number_words_max_cutoff ,
673- cond_check_repetitions_removal ,
674- repetitions_length ,
675- repetitions_max_cutoff ,
731+ cond_check_character_repetition_removal ,
732+ character_repetition_length ,
733+ character_repetition_max_cutoff ,
734+ cond_check_word_repetition_removal ,
735+ word_repetition_length ,
736+ word_repetition_max_cutoff ,
676737 cond_check_special_characters ,
677738 special_characters ,
678739 special_characters_max_cutoff ,
@@ -703,11 +764,20 @@ def filtering(
703764 number_words_max_cutoff ,
704765 ):
705766 return False
706- if cond_check_repetitions_removal :
707- if not Filtering .check_repetitions_removal (
767+ if cond_check_character_repetition_removal :
768+ if not Filtering .check_character_repetition_removal (
708769 document ,
709- repetitions_length ,
710- repetitions_max_cutoff ,
770+ character_repetition_length ,
771+ character_repetition_max_cutoff ,
772+ ):
773+ return False
774+ if cond_check_word_repetition_removal :
775+ if not Filtering .check_word_repetition_removal (
776+ document ,
777+ sentencepiece_model_tok ,
778+ strip_characters ,
779+ word_repetition_length ,
780+ word_repetition_max_cutoff ,
711781 ):
712782 return False
713783 if cond_check_special_characters :
@@ -797,9 +867,18 @@ def __call__(self, example):
797867 strip_characters = self .param ["strip_characters" ],
798868 number_words_min_cutoff = self .param ["number_words_min_cutoff" ],
799869 number_words_max_cutoff = self .param ["number_words_max_cutoff" ],
800- cond_check_repetitions_removal = self .param ["check_repetitions_removal" ],
801- repetitions_length = self .param ["repetitions_length" ],
802- repetitions_max_cutoff = self .param ["repetitions_max_cutoff" ],
870+ cond_check_character_repetition_removal = self .param [
871+ "cond_check_character_repetition_removal"
872+ ],
873+ character_repetition_length = self .param ["character_repetition_length" ],
874+ character_repetition_max_cutoff = self .param [
875+ "character_repetition_max_cutoff"
876+ ],
877+ cond_check_word_repetition_removal = self .param [
878+ "cond_check_word_repetition_removal"
879+ ],
880+ word_repetition_length = self .param ["word_repetition_length" ],
881+ word_repetition_max_cutoff = self .param ["word_repetition_max_cutoff" ],
803882 cond_check_special_characters = self .param ["cond_check_special_characters" ],
804883 special_characters = self .param ["special_characters" ],
805884 special_characters_max_cutoff = self .param ["special_characters_max_cutoff" ],
0 commit comments