2020
2121sys .path .insert (1 , os .path .join (sys .path [0 ], ".." ))
2222# Append the path of the ac_dc directory to the python path
23- # to find the file filtering.py in the parent directory
23+ # to find the files filtering.py and languages_id.py in the parent directory
2424sys .path .append (str (Path (sys .path [0 ]).parent .absolute ().parent .absolute ()))
2525
2626from filtering import LoadParameters , ModifyingDocuments , Filtering
@@ -164,17 +164,17 @@ def get_cond(key, cutoff, max_cutoff):
164164
165165 conds ["number_words" ] = [cond_1 , cond_2 ]
166166
167- if "repetitions_ratio " in columns :
168- with st .sidebar .expander ("Repetitions ratio" ):
167+ if "character_repetition_ratio " in columns :
168+ with st .sidebar .expander ("Character repetition ratio" ):
169169 val_repetitions_lengths = list (
170- self .docs ["repetitions_ratio " ].iloc [0 ].keys ()
170+ self .docs ["character_repetition_ratio " ].iloc [0 ].keys ()
171171 )
172172 default_index = (
173173 val_repetitions_lengths .index ("10" )
174174 if "10" in val_repetitions_lengths
175175 else 0
176176 )
177- label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
177+ label_selectbox = "Length of repetitions in characters (that will influence the character repetition ratio)."
178178 repetitions_length = st .selectbox (
179179 label = label_selectbox ,
180180 options = val_repetitions_lengths ,
@@ -183,33 +183,83 @@ def get_cond(key, cutoff, max_cutoff):
183183 st .caption (
184184 "Choosing a higher or lower number does not mean that the filtering "
185185 "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
186- "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
186+ "tends to associate a high character repetition ratio to very long documents (like book chapters), but with "
187187 "few or no repetitions, simply because their length gives them more diversity, and we do "
188- "not want to discard such documents."
188+ "not want to discard such documents. It is generally better to increase this number, so that false "
189+ "positives are very short documents (which we want to delete anyway) rather than long ones. However, "
190+ "a low number can be useful for Chinese, where a character can designate a whole word."
189191 )
190- self .docs ["repetitions_ratio " ] = self .docs_checkpoint [
191- "repetitions_ratio "
192+ self .docs ["character_repetition_ratio " ] = self .docs_checkpoint [
193+ "character_repetition_ratio "
192194 ]
193- for i in range (len (self .docs ["repetitions_ratio " ])):
194- self .docs ["repetitions_ratio " ].iloc [i ] = self .docs [
195- "repetitions_ratio "
195+ for i in range (len (self .docs ["character_repetition_ratio " ])):
196+ self .docs ["character_repetition_ratio " ].iloc [i ] = self .docs [
197+ "character_repetition_ratio "
196198 ].iloc [i ][repetitions_length ]
197199
198- cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
199- cutoff_repetitions_ratio = st .slider (
200+ cutoff_def = "If the character repetition ratio of a document is higher than this number, the document is removed."
201+ cutoff_character_repetition_ratio = st .slider (
200202 cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
201203 )
202204 new_key = (
203- "repetitions_ratio " ,
204- cutoff_repetitions_ratio ,
205+ "character_repetition_ratio " ,
206+ cutoff_character_repetition_ratio ,
205207 True ,
206208 repetitions_length ,
207209 )
208210 keys .append (new_key )
209211 Visualization_for_lang .plot_hist (self .docs , new_key )
210212 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
211213 Visualization_for_lang .print_discarded_by_cond (cond )
212- conds ["repetitions_ratio" ] = [cond ]
214+ conds ["character_repetition_ratio" ] = [cond ]
215+
216+ if "word_repetition_ratio" in columns :
217+ with st .sidebar .expander ("Word repetition ratio" ):
218+ val_repetitions_lengths = list (
219+ self .docs ["word_repetition_ratio" ].iloc [0 ].keys ()
220+ )
221+ default_index = (
222+ val_repetitions_lengths .index ("5" )
223+ if "5" in val_repetitions_lengths
224+ else 0
225+ )
226+ label_selectbox = "Length of repetitions in words (that will influence the word repetition ratio)."
227+ repetitions_length = st .selectbox (
228+ label = label_selectbox ,
229+ options = val_repetitions_lengths ,
230+ index = default_index ,
231+ )
232+ st .caption (
233+ "Choosing a higher or lower number does not mean that the filtering "
234+ "is stronger or weaker. Be careful, choosing a low number (like 3) could "
235+ "tend to associate a high word repetition ratio to very long documents (like book chapters), but with "
236+ "few or no repetitions, simply because their length gives them more diversity, and we do "
237+ "not want to discard such documents. It is generally better to increase a bit this number, so that false "
238+ "positives are very short documents (which we want to delete anyway) rather than long ones."
239+ )
240+ self .docs ["word_repetition_ratio" ] = self .docs_checkpoint [
241+ "word_repetition_ratio"
242+ ]
243+ for i in range (len (self .docs ["word_repetition_ratio" ])):
244+ self .docs ["word_repetition_ratio" ].iloc [i ] = self .docs [
245+ "word_repetition_ratio"
246+ ].iloc [i ][repetitions_length ]
247+
248+ cutoff_def = "If the word repetition ratio of a document is higher than this number, the document is removed."
249+ cutoff_word_repetition_ratio = st .slider (
250+ cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
251+ )
252+ new_key = (
253+ "word_repetition_ratio" ,
254+ cutoff_word_repetition_ratio ,
255+ True ,
256+ repetitions_length ,
257+ )
258+ keys .append (new_key )
259+ Visualization_for_lang .plot_hist (self .docs , new_key )
260+ cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
261+ Visualization_for_lang .print_discarded_by_cond (cond )
262+ conds ["word_repetition_ratio" ] = [cond ]
213263
214264 if "special_characters_ratio" in columns :
215265 with st .sidebar .expander ("Special characters ratio" ):
@@ -369,12 +419,25 @@ def get_cond(key, cutoff, max_cutoff):
369419 "docs" ,
370420 )
371421
372- if "repetitions_ratio" in columns :
373- cond_filter = np .invert (np .all (conds ["repetitions_ratio" ], axis = 0 ))
422+ if "character_repetition_ratio" in columns :
423+ cond_filter = np .invert (
424+ np .all (conds ["character_repetition_ratio" ], axis = 0 )
425+ )
374426 Visualization_for_lang .display_dataset (
375427 self .docs ,
376428 cond_filter ,
377- "Discarded documents for the filter on the repetitions ratio" ,
429+ "Discarded documents for the filter on the character repetition ratio" ,
430+ "docs" ,
431+ )
432+
433+ if "word_repetition_ratio" in columns :
434+ cond_filter = np .invert (
435+ np .all (conds ["word_repetition_ratio" ], axis = 0 )
436+ )
437+ Visualization_for_lang .display_dataset (
438+ self .docs ,
439+ cond_filter ,
440+ "Discarded documents for the filter on the word repetition ratio" ,
378441 "docs" ,
379442 )
380443
@@ -614,15 +677,31 @@ def is_doc_discarded(key, score):
614677 if is_doc_discarded (key , len (words )):
615678 is_discarded = True
616679
617- elif key [0 ] == "repetitions_ratio " :
618- repetitions_ratio = (
680+ elif key [0 ] == "character_repetition_ratio " :
681+ character_repetition_ratio = (
619682 Filtering .compute_character_repetition_ratio (
620683 personal_doc , int (key [3 ])
621684 )
622685 )
623- repetitions_ratio = round (repetitions_ratio , 3 )
624- st .markdown (f"Repetitions ratio: { repetitions_ratio } " )
625- if is_doc_discarded (key , repetitions_ratio ):
686+ character_repetition_ratio = round (
687+ character_repetition_ratio , 3
688+ )
689+ st .markdown (
690+ f"Character repetition ratio: { character_repetition_ratio } "
691+ )
692+ if is_doc_discarded (key , character_repetition_ratio ):
693+ is_discarded = True
694+
695+ elif key [0 ] == "word_repetition_ratio" :
696+ word_repetition_ratio = Filtering .compute_word_repetition_ratio (
697+ personal_doc ,
698+ self .sentencepiece_model_tok ,
699+ self .param ["strip_characters" ],
700+ int (key [3 ]),
701+ )
702+ word_repetition_ratio = round (word_repetition_ratio , 3 )
703+ st .markdown (f"Word repetition ratio: { word_repetition_ratio } " )
704+ if is_doc_discarded (key , word_repetition_ratio ):
626705 is_discarded = True
627706
628707 elif key [0 ] == "special_characters_ratio" :
0 commit comments