Skip to content

Commit c8032af

Browse files
committed
visualization: integration of the filter on word repetition ratio
1 parent b089958 commit c8032af

File tree

2 files changed

+122
-34
lines changed

2 files changed

+122
-34
lines changed

ac_dc/visualization/get_data_for_visualization.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,6 @@ def __init__(
4343
lang_dataset_id, path_kenlm_model
4444
)
4545

46-
self.keys_stats = [
47-
"special_characters_ratio",
48-
"stopwords_ratio",
49-
"flagged_words_ratio",
50-
"lang_id_score",
51-
"perplexity_score",
52-
]
5346
self.path_save_stats = path_save_stats
5447

5548
def compute_stats(self):
@@ -88,13 +81,29 @@ def compute_stats(self):
8881
number_words = len(words)
8982
stats_document["number_words"] = number_words
9083

91-
repetitions_ratios = {
84+
character_repetition_ratios = {
9285
n: round(
9386
Filtering.compute_character_repetition_ratio(document, n), 4
9487
)
9588
for n in range(2, 16)
9689
}
97-
stats_document["repetitions_ratio"] = repetitions_ratios
90+
stats_document[
91+
"character_repetition_ratio"
92+
] = character_repetition_ratios
93+
94+
word_repetition_ratios = {
95+
n: round(
96+
Filtering.compute_word_repetition_ratio(
97+
document,
98+
self.sentencepiece_model_tok,
99+
self.param["strip_characters"],
100+
n,
101+
),
102+
4,
103+
)
104+
for n in range(3, 11)
105+
}
106+
stats_document["word_repetition_ratio"] = word_repetition_ratios
98107

99108
special_characters_ratio = Filtering.compute_special_characters_ratio(
100109
document, self.param["special_characters"]

ac_dc/visualization/visualization.py

Lines changed: 104 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
sys.path.insert(1, os.path.join(sys.path[0], ".."))
2222
# Append the path of the ac_dc directory to the python path
23-
# to find the file filtering.py in the parent directory
23+
# to find the files filtering.py and languages_id.py in the parent directory
2424
sys.path.append(str(Path(sys.path[0]).parent.absolute().parent.absolute()))
2525

2626
from filtering import LoadParameters, ModifyingDocuments, Filtering
@@ -164,17 +164,17 @@ def get_cond(key, cutoff, max_cutoff):
164164

165165
conds["number_words"] = [cond_1, cond_2]
166166

167-
if "repetitions_ratio" in columns:
168-
with st.sidebar.expander("Repetitions ratio"):
167+
if "character_repetition_ratio" in columns:
168+
with st.sidebar.expander("Character repetition ratio"):
169169
val_repetitions_lengths = list(
170-
self.docs["repetitions_ratio"].iloc[0].keys()
170+
self.docs["character_repetition_ratio"].iloc[0].keys()
171171
)
172172
default_index = (
173173
val_repetitions_lengths.index("10")
174174
if "10" in val_repetitions_lengths
175175
else 0
176176
)
177-
label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
177+
label_selectbox = "Length of repetitions in characters (that will influence the character repetition ratio)."
178178
repetitions_length = st.selectbox(
179179
label=label_selectbox,
180180
options=val_repetitions_lengths,
@@ -183,33 +183,83 @@ def get_cond(key, cutoff, max_cutoff):
183183
st.caption(
184184
"Choosing a higher or lower number does not mean that the filtering "
185185
"is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
186-
"tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
186+
"tends to associate a high character repetition ratio to very long documents (like book chapters), but with "
187187
"few or no repetitions, simply because their length gives them more diversity, and we do "
188-
"not want to discard such documents."
188+
"not want to discard such documents. It is generally better to increase this number, so that false "
189+
"positives are very short documents (which we want to delete anyway) rather than long ones. However, "
190+
"a low number can be useful for Chinese, where a character can designate a whole word."
189191
)
190-
self.docs["repetitions_ratio"] = self.docs_checkpoint[
191-
"repetitions_ratio"
192+
self.docs["character_repetition_ratio"] = self.docs_checkpoint[
193+
"character_repetition_ratio"
192194
]
193-
for i in range(len(self.docs["repetitions_ratio"])):
194-
self.docs["repetitions_ratio"].iloc[i] = self.docs[
195-
"repetitions_ratio"
195+
for i in range(len(self.docs["character_repetition_ratio"])):
196+
self.docs["character_repetition_ratio"].iloc[i] = self.docs[
197+
"character_repetition_ratio"
196198
].iloc[i][repetitions_length]
197199

198-
cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
199-
cutoff_repetitions_ratio = st.slider(
200+
cutoff_def = "If the character repetition ratio of a document is higher than this number, the document is removed."
201+
cutoff_character_repetition_ratio = st.slider(
200202
cutoff_def, 0.0, 1.0, 1.0, step=0.01
201203
)
202204
new_key = (
203-
"repetitions_ratio",
204-
cutoff_repetitions_ratio,
205+
"character_repetition_ratio",
206+
cutoff_character_repetition_ratio,
205207
True,
206208
repetitions_length,
207209
)
208210
keys.append(new_key)
209211
Visualization_for_lang.plot_hist(self.docs, new_key)
210212
cond = get_cond(new_key[0], new_key[1], new_key[2])
211213
Visualization_for_lang.print_discarded_by_cond(cond)
212-
conds["repetitions_ratio"] = [cond]
214+
conds["character_repetition_ratio"] = [cond]
215+
216+
if "word_repetition_ratio" in columns:
217+
with st.sidebar.expander("Word repetition ratio"):
218+
val_repetitions_lengths = list(
219+
self.docs["word_repetition_ratio"].iloc[0].keys()
220+
)
221+
default_index = (
222+
val_repetitions_lengths.index("5")
223+
if "5" in val_repetitions_lengths
224+
else 0
225+
)
226+
label_selectbox = "Length of repetitions in words (that will influence the word repetition ratio)."
227+
repetitions_length = st.selectbox(
228+
label=label_selectbox,
229+
options=val_repetitions_lengths,
230+
index=default_index,
231+
)
232+
st.caption(
233+
"Choosing a higher or lower number does not mean that the filtering "
234+
"is stronger or weaker. Be careful, choosing a low number (like 3) could "
235+
"tend to associate a high word repetition ratio to very long documents (like book chapters), but with "
236+
"few or no repetitions, simply because their length gives them more diversity, and we do "
237+
"not want to discard such documents. It is generally better to increase a bit this number, so that false "
238+
"positives are very short documents (which we want to delete anyway) rather than long ones."
239+
)
240+
self.docs["word_repetition_ratio"] = self.docs_checkpoint[
241+
"word_repetition_ratio"
242+
]
243+
for i in range(len(self.docs["word_repetition_ratio"])):
244+
self.docs["word_repetition_ratio"].iloc[i] = self.docs[
245+
"word_repetition_ratio"
246+
].iloc[i][repetitions_length]
247+
248+
cutoff_def = "If the word repetition ratio of a document is higher than this number, the document is removed."
249+
cutoff_word_repetition_ratio = st.slider(
250+
cutoff_def, 0.0, 1.0, 1.0, step=0.01
251+
)
252+
new_key = (
253+
"word_repetition_ratio",
254+
cutoff_word_repetition_ratio,
255+
True,
256+
repetitions_length,
257+
)
258+
keys.append(new_key)
259+
Visualization_for_lang.plot_hist(self.docs, new_key)
260+
cond = get_cond(new_key[0], new_key[1], new_key[2])
261+
Visualization_for_lang.print_discarded_by_cond(cond)
262+
conds["word_repetition_ratio"] = [cond]
213263

214264
if "special_characters_ratio" in columns:
215265
with st.sidebar.expander("Special characters ratio"):
@@ -369,12 +419,25 @@ def get_cond(key, cutoff, max_cutoff):
369419
"docs",
370420
)
371421

372-
if "repetitions_ratio" in columns:
373-
cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
422+
if "character_repetition_ratio" in columns:
423+
cond_filter = np.invert(
424+
np.all(conds["character_repetition_ratio"], axis=0)
425+
)
374426
Visualization_for_lang.display_dataset(
375427
self.docs,
376428
cond_filter,
377-
"Discarded documents for the filter on the repetitions ratio",
429+
"Discarded documents for the filter on the character repetition ratio",
430+
"docs",
431+
)
432+
433+
if "word_repetition_ratio" in columns:
434+
cond_filter = np.invert(
435+
np.all(conds["word_repetition_ratio"], axis=0)
436+
)
437+
Visualization_for_lang.display_dataset(
438+
self.docs,
439+
cond_filter,
440+
"Discarded documents for the filter on the word repetition ratio",
378441
"docs",
379442
)
380443

@@ -614,15 +677,31 @@ def is_doc_discarded(key, score):
614677
if is_doc_discarded(key, len(words)):
615678
is_discarded = True
616679

617-
elif key[0] == "repetitions_ratio":
618-
repetitions_ratio = (
680+
elif key[0] == "character_repetition_ratio":
681+
character_repetition_ratio = (
619682
Filtering.compute_character_repetition_ratio(
620683
personal_doc, int(key[3])
621684
)
622685
)
623-
repetitions_ratio = round(repetitions_ratio, 3)
624-
st.markdown(f"Repetitions ratio: {repetitions_ratio}")
625-
if is_doc_discarded(key, repetitions_ratio):
686+
character_repetition_ratio = round(
687+
character_repetition_ratio, 3
688+
)
689+
st.markdown(
690+
f"Character repetition ratio: {character_repetition_ratio}"
691+
)
692+
if is_doc_discarded(key, character_repetition_ratio):
693+
is_discarded = True
694+
695+
elif key[0] == "word_repetition_ratio":
696+
word_repetition_ratio = Filtering.compute_word_repetition_ratio(
697+
personal_doc,
698+
self.sentencepiece_model_tok,
699+
self.param["strip_characters"],
700+
int(key[3]),
701+
)
702+
word_repetition_ratio = round(word_repetition_ratio, 3)
703+
st.markdown(f"Word repetition ratio: {word_repetition_ratio}")
704+
if is_doc_discarded(key, word_repetition_ratio):
626705
is_discarded = True
627706

628707
elif key[0] == "special_characters_ratio":

0 commit comments

Comments
 (0)