Skip to content

Commit dc14a14

Browse files
committed
new filter for word repetition removal
1 parent 7251590 commit dc14a14

File tree

4 files changed

+273
-112
lines changed

4 files changed

+273
-112
lines changed

ac_dc/filtering.py

Lines changed: 110 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -423,33 +423,91 @@ def check_number_words(
423423
return cond
424424

425425
@staticmethod
426-
def compute_repetitions_ratio(document, repetitions_length):
427-
def get_freq_ngrams(document, n):
428-
ngrams = [document[i : i + n] for i in range(len(document) - n + 1)]
429-
freq_ngrams = {}
430-
for ngram in ngrams:
431-
freq_ngrams[ngram] = freq_ngrams.get(ngram, 0) + 1
432-
return freq_ngrams
433-
434-
freq_ngrams = get_freq_ngrams(document, repetitions_length)
435-
if len(freq_ngrams) == 0:
426+
def compute_character_repetition_ratio(document, character_repetition_length):
    """Return the share of character n-grams covered by the most repeated ones.

    The document is cut into overlapping character n-grams of length
    ``character_repetition_length``. The occurrence counts of the most
    frequent n-grams are summed and divided by the total number of n-grams.

    The number of n-grams taken into account is the smaller of
    ``int(sqrt(number of distinct n-grams))`` and the number of distinct
    n-grams that occur more than once.

    Args:
        document: Text to analyse (indexed and sliced like a ``str``).
        character_repetition_length: Length ``n`` of the character n-grams.

    Returns:
        ``0`` when the document yields no n-gram at all, otherwise a float
        in ``[0, 1]``; ``0.0`` when no n-gram occurs more than once.
    """
    ngram_length = character_repetition_length

    # Count every overlapping window of `ngram_length` characters.
    occurrences = {}
    for start in range(len(document) - ngram_length + 1):
        ngram = document[start : start + ngram_length]
        occurrences[ngram] = occurrences.get(ngram, 0) + 1

    if not occurrences:
        return 0

    counts = sorted(occurrences.values(), reverse=True)

    # Cap by the number of n-grams that are actually repeated. The previous
    # code capped by the number of n-grams seen exactly once, so a document
    # made only of repeated n-grams (e.g. "aaaaaaaa") got a cap of 0 and a
    # ratio of 0, while a document without any repetition got a non-zero one.
    num_repeated_ngrams = sum(1 for count in counts if count > 1)
    num_rep_character_ngrams = min(
        int(np.sqrt(len(counts))),
        num_repeated_ngrams,
    )

    return sum(counts[:num_rep_character_ngrams]) / sum(counts)
454+
455+
@staticmethod
456+
def check_character_repetition_removal(
    document,
    character_repetition_length,
    character_repetition_max_cutoff,
):
    """Tell whether the document passes the character repetition filter.

    Args:
        document: Text to check.
        character_repetition_length: Character n-gram length forwarded to
            ``Filtering.compute_character_repetition_ratio``.
        character_repetition_max_cutoff: Highest ratio still accepted.

    Returns:
        ``True`` when the computed ratio is less than or equal to the
        cutoff, ``False`` otherwise.
    """
    ratio = Filtering.compute_character_repetition_ratio(
        document, character_repetition_length
    )
    return ratio <= character_repetition_max_cutoff
466+
467+
@staticmethod
468+
def compute_word_repetition_ratio(
    document, sentencepiece_model_tok, strip_characters, word_repetition_length
):
    """Return the fraction of word n-grams that belong to a repeated n-gram.

    Words are obtained from ``ModifyingDocuments.get_words_from_document``
    with ``lower_case=True`` and the given ``strip_characters``. Consecutive
    runs of ``word_repetition_length`` words are joined with a single space
    and counted.

    Args:
        document: Text to analyse.
        sentencepiece_model_tok: Passed through to the word extraction helper.
        strip_characters: Passed through to the word extraction helper.
        word_repetition_length: Number of words per n-gram.

    Returns:
        ``0`` when no n-gram can be formed, otherwise the summed occurrence
        counts of n-grams seen more than once divided by the total number
        of n-grams.
    """
    tokens = ModifyingDocuments.get_words_from_document(
        document,
        sentencepiece_model_tok,
        lower_case=True,
        strip_characters=strip_characters,
    )

    # Tally each window of `word_repetition_length` consecutive words.
    occurrences = {}
    for start in range(len(tokens) - word_repetition_length + 1):
        key = " ".join(tokens[start : start + word_repetition_length])
        occurrences[key] = occurrences.get(key, 0) + 1

    if not occurrences:
        return 0

    total = 0
    repeated = 0
    for count in occurrences.values():
        total += count
        if count > 1:
            repeated += count
    return repeated / total
442498

443499
@staticmethod
444-
def check_repetitions_removal(
500+
def check_word_repetition_removal(
    document,
    sentencepiece_model_tok,
    strip_characters,
    word_repetition_length,
    word_repetition_max_cutoff,
):
    """Tell whether the document passes the word repetition filter.

    Args:
        document: Text to check.
        sentencepiece_model_tok: Forwarded to
            ``Filtering.compute_word_repetition_ratio``.
        strip_characters: Forwarded to
            ``Filtering.compute_word_repetition_ratio``.
        word_repetition_length: Number of words per n-gram.
        word_repetition_max_cutoff: Highest ratio still accepted.

    Returns:
        ``True`` when the computed ratio is less than or equal to the
        cutoff, ``False`` otherwise.
    """
    ratio = Filtering.compute_word_repetition_ratio(
        document, sentencepiece_model_tok, strip_characters, word_repetition_length
    )
    return ratio <= word_repetition_max_cutoff
454512

455513
@staticmethod
@@ -670,9 +728,12 @@ def filtering(
670728
strip_characters,
671729
number_words_min_cutoff,
672730
number_words_max_cutoff,
673-
cond_check_repetitions_removal,
674-
repetitions_length,
675-
repetitions_max_cutoff,
731+
cond_check_character_repetition_removal,
732+
character_repetition_length,
733+
character_repetition_max_cutoff,
734+
cond_check_word_repetition_removal,
735+
word_repetition_length,
736+
word_repetition_max_cutoff,
676737
cond_check_special_characters,
677738
special_characters,
678739
special_characters_max_cutoff,
@@ -703,11 +764,20 @@ def filtering(
703764
number_words_max_cutoff,
704765
):
705766
return False
706-
if cond_check_repetitions_removal:
707-
if not Filtering.check_repetitions_removal(
767+
if cond_check_character_repetition_removal:
768+
if not Filtering.check_character_repetition_removal(
708769
document,
709-
repetitions_length,
710-
repetitions_max_cutoff,
770+
character_repetition_length,
771+
character_repetition_max_cutoff,
772+
):
773+
return False
774+
if cond_check_word_repetition_removal:
775+
if not Filtering.check_word_repetition_removal(
776+
document,
777+
sentencepiece_model_tok,
778+
strip_characters,
779+
word_repetition_length,
780+
word_repetition_max_cutoff,
711781
):
712782
return False
713783
if cond_check_special_characters:
@@ -797,9 +867,18 @@ def __call__(self, example):
797867
strip_characters=self.param["strip_characters"],
798868
number_words_min_cutoff=self.param["number_words_min_cutoff"],
799869
number_words_max_cutoff=self.param["number_words_max_cutoff"],
800-
cond_check_repetitions_removal=self.param["check_repetitions_removal"],
801-
repetitions_length=self.param["repetitions_length"],
802-
repetitions_max_cutoff=self.param["repetitions_max_cutoff"],
870+
cond_check_character_repetition_removal=self.param[
871+
"cond_check_character_repetition_removal"
872+
],
873+
character_repetition_length=self.param["character_repetition_length"],
874+
character_repetition_max_cutoff=self.param[
875+
"character_repetition_max_cutoff"
876+
],
877+
cond_check_word_repetition_removal=self.param[
878+
"cond_check_word_repetition_removal"
879+
],
880+
word_repetition_length=self.param["word_repetition_length"],
881+
word_repetition_max_cutoff=self.param["word_repetition_max_cutoff"],
803882
cond_check_special_characters=self.param["cond_check_special_characters"],
804883
special_characters=self.param["special_characters"],
805884
special_characters_max_cutoff=self.param["special_characters_max_cutoff"],

0 commit comments

Comments
 (0)