@@ -41,7 +41,7 @@ def test_from_yaml():
4141 analyses = [AnalysisType .DEEP , AnalysisType .MULTILINGUAL ],
4242 censor_char = '#' ,
4343 censor_whole_words = False ,
44- languages = ['ru ' , 'en ' ],
44+ languages = ['ru_core_news_sm ' , 'en_core_web_sm ' ],
4545 max_relative_distance = 0.2 ,
4646 )
4747 config_dict = config .dict ()
@@ -86,8 +86,8 @@ def test_censor_char(pf):
8686def test_custom_profane_word_dictionaries (pf , empty_profane_word_dictionaries ):
8787 assert pf .custom_profane_word_dictionaries == empty_profane_word_dictionaries
8888 profane_words = ['unicorn' , 'windows' ]
89- pf .custom_profane_word_dictionaries = {'en ' : profane_words }
90- assert (pf .custom_profane_word_dictionaries == create_profane_word_dictionaries (en = OrderedSet (profane_words )))
89+ pf .custom_profane_word_dictionaries = {'en_core_web_sm ' : profane_words }
90+ assert (pf .custom_profane_word_dictionaries == create_profane_word_dictionaries (en_core_web_sm = OrderedSet (profane_words )))
9191 assert pf .censor_word ('unicorn' ) == Word (uncensored = 'unicorn' , censored = '*******' , original_profane_word = 'unicorn' )
9292 assert pf .censor_word ('windows' ) == Word (uncensored = 'windows' , censored = '*******' , original_profane_word = 'windows' )
9393 assert pf .censor_word ('fuck' ) == Word (uncensored = 'fuck' , censored = 'fuck' , original_profane_word = None )
@@ -97,28 +97,28 @@ def test_custom_profane_word_dictionaries(pf, empty_profane_word_dictionaries):
9797def test_extra_profane_word_dictionaries (pf , empty_profane_word_dictionaries ):
9898 assert pf .extra_profane_word_dictionaries == empty_profane_word_dictionaries
9999 extra_profane_words = ['hey' , 'like' ]
100- pf .extra_profane_word_dictionaries = {'en ' : extra_profane_words }
101- assert (pf .extra_profane_word_dictionaries == create_profane_word_dictionaries (en = OrderedSet (extra_profane_words )))
100+ pf .extra_profane_word_dictionaries = {'en_core_web_sm ' : extra_profane_words }
101+ assert (pf .extra_profane_word_dictionaries == create_profane_word_dictionaries (en_core_web_sm = OrderedSet (extra_profane_words )))
102102 assert pf .censor_word ('hey' ) == Word (uncensored = 'hey' , censored = '***' , original_profane_word = 'hey' )
103103 assert pf .censor_word ('like' ) == Word (uncensored = 'like' , censored = '****' , original_profane_word = 'like' )
104104 assert pf .censor_word ('fuck' ) == Word (uncensored = 'fuck' , censored = '****' , original_profane_word = 'fuck' )
105105
106106
107107@with_config (TestConfig ())
108108def test_restore_words (pf , empty_profane_word_dictionaries ):
109- pf .custom_profane_word_dictionaries = {'en ' : ['cupcakes' ]}
110- pf .extra_profane_word_dictionaries = {'en ' : ['dibs' ]}
109+ pf .custom_profane_word_dictionaries = {'en_core_web_sm ' : ['cupcakes' ]}
110+ pf .extra_profane_word_dictionaries = {'en_core_web_sm ' : ['dibs' ]}
111111 pf .restore_profane_word_dictionaries ()
112112 assert pf .custom_profane_word_dictionaries == empty_profane_word_dictionaries
113113 assert pf .extra_profane_word_dictionaries == empty_profane_word_dictionaries
114114 profane_word_dictionaries = pf .profane_word_dictionaries
115- assert 'dibs' not in profane_word_dictionaries ['en ' ]
116- assert 'cupcakes' not in profane_word_dictionaries ['en ' ]
115+ assert 'dibs' not in profane_word_dictionaries ['en_core_web_sm ' ]
116+ assert 'cupcakes' not in profane_word_dictionaries ['en_core_web_sm ' ]
117117
118118
119119@with_config (TestConfig ())
120120def test_tokenization (pf ):
121- pf .custom_profane_word_dictionaries = {'en ' : ['chocolate' ]}
121+ pf .custom_profane_word_dictionaries = {'en_core_web_sm ' : ['chocolate' ]}
122122 assert pf .censor (TEST_STATEMENT ) == "Hey, I like unicorns, *********, oranges and man's blood, turd!"
123123
124124
@@ -182,22 +182,22 @@ def test_deep_analysis_with_censor_whole_words_false(pf):
182182 assert pf .censor_word ('h0r1h0r1' ) == Word (uncensored = 'h0r1h0r1' , censored = '***1***1' , original_profane_word = 'h0r' )
183183
184184
185- @with_config (TestConfig (languages = ('ru ' , 'en ' )))
185+ @with_config (TestConfig (languages = ('ru_core_news_sm ' , 'en_core_web_sm ' )))
186186def test_languages (pf ):
187- assert pf .languages == OrderedSet (['ru ' , 'en ' ])
188- assert pf .languages_str == 'ru, en '
187+ assert pf .languages == OrderedSet (['ru_core_news_sm ' , 'en_core_web_sm ' ])
188+ assert pf .languages_str == 'ru_core_news_sm, en_core_web_sm '
189189
190190
191- @with_config (TestConfig (languages = ('ru ' , 'en ' )))
191+ @with_config (TestConfig (languages = ('ru_core_news_sm ' , 'en_core_web_sm ' )))
192192def test_russian (pf ):
193193 assert pf .censor_word ('бля' ) == Word (uncensored = 'бля' , censored = '***' , original_profane_word = 'бля' )
194194
195195
196- @with_config (TestConfig (analyses = frozenset ([AnalysisType .DEEP ]), languages = ('ru ' , 'en ' )))
196+ @with_config (TestConfig (analyses = frozenset ([AnalysisType .DEEP ]), languages = ('ru_core_news_sm ' , 'en_core_web_sm ' )))
197197def test_russian_deep_analysis (pf ):
198198 assert pf .censor_word ('бл@ка' ) == Word (uncensored = 'бл@ка' , censored = '*****' , original_profane_word = 'бля' )
199199
200200
201- @with_config (TestConfig (analyses = frozenset ([AnalysisType .MULTILINGUAL ]), languages = ('ru ' , 'en ' )))
201+ @with_config (TestConfig (analyses = frozenset ([AnalysisType .MULTILINGUAL ]), languages = ('ru_core_news_sm ' , 'en_core_web_sm ' )))
202202def test_multilingual (pf ):
203203 assert pf .censor ("Да бля, это просто shit какой-то!" ) == "Да ***, это просто **** какой-то!"
0 commit comments