Skip to content

Commit 362177b

Browse files
author
Ruslan Gareev
committed
Update spaCy and NumPy versions. Fix usages of the changed API.
1 parent 1af0ecd commit 362177b

File tree

9 files changed

+697
-540
lines changed

9 files changed

+697
-540
lines changed

poetry.lock

Lines changed: 653 additions & 509 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

profanity_filter/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class Config(BaseModel):
1616
cache_redis_connection_url: Optional[str] = None
1717
censor_char: str = '*'
1818
censor_whole_words: bool = True
19-
languages: List[Language] = ['en']
19+
languages: List[Language] = ['en_core_web_sm']
2020
max_relative_distance: float = 0.34
2121

2222
@classmethod
File renamed without changes.
File renamed without changes.

profanity_filter/profanity_filter.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,14 @@ def __init__(self,
9797
morphs: Optional[Morphs] = None,
9898
nlps: Optional[Nlps] = None,
9999
spells: Optional[Spells] = None,
100+
data_dir: Optional[Path] = None
100101
):
101102
# Path to data dir
102-
self._BASE_DIR = Path(__file__).absolute().parent
103-
self._DATA_DIR: Path = self._BASE_DIR / 'data'
103+
if not data_dir:
104+
self._BASE_DIR = Path(__file__).absolute().parent
105+
self._DATA_DIR: Path = self._BASE_DIR / 'data'
106+
else:
107+
self._DATA_DIR: Path = data_dir
104108

105109
self._MAX_MAX_DISTANCE = 3
106110

@@ -212,11 +216,17 @@ def is_profane(self, text: str) -> bool:
212216
"""Returns True if input_text contains any profane words, False otherwise"""
213217
return self._censor(text=text, return_bool=True)
214218

215-
@cached_property
216-
def spacy_component(self, language: Language = None) -> SpacyProfanityFilterComponent:
219+
def spacy_component(self, language: Language = None) -> str:
220+
name = 'profanity_filter'
217221
nlp = self._get_nlp(language)
218222
[language] = [language for language, nlp_ in self.nlps.items() if nlp_ == nlp]
219-
return SpacyProfanityFilterComponent(profanity_filter=self, nlp=nlp, language=language)
223+
component = SpacyProfanityFilterComponent(profanity_filter=self, nlp=nlp, language=language)
224+
225+
@spacy.language.Language.factory(name)
226+
def _spacy_component(nlp, name):
227+
return component
228+
229+
return name
220230

221231
@property
222232
def analyses(self) -> AnalysesTypes:
@@ -344,7 +354,7 @@ def nlps(self, value: Optional[Nlps]) -> None:
344354
for language in self.languages:
345355
with suppress(OSError):
346356
self._nlps[language] = spacy.load(language, disable=['parser', 'ner'])
347-
self._nlps[language].add_pipe(self.spacy_component, last=True)
357+
self._nlps[language].add_pipe(self.spacy_component(language), last=True)
348358
if not self._nlps:
349359
raise ProfanityFilterError(f"Couldn't load Spacy model for any of languages: {self.languages_str}")
350360

profanity_filter/spacy_component.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ def __init__(self, profanity_filter: 'ProfanityFilter', nlp: spacy.language.Lang
2121
self._stop_on_first_profane_word = stop_on_first_profane_word
2222

2323
# noinspection PyProtectedMember
24+
# TODO: Change getting tokens to custom tokenizer:
25+
# https://spacy.io/usage/linguistic-features#tokenization
2426
def __call__(self, doc: Doc, language: Language = None, stop_on_first_profane_word: Optional[bool] = None) -> Doc:
2527
self.register_extensions(exist_ok=True)
2628
if language is None:

pyproject.toml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ classifiers = [
2020
]
2121

2222
[tool.poetry.dependencies]
23-
python = "^3.6"
24-
spacy = "^2.0"
23+
python = "^3.8"
24+
spacy = "^3"
2525
ordered-set = "^3.0"
2626
cached-property = "^1.5"
2727
ordered-set-stubs = "^0.1.3"
@@ -33,15 +33,16 @@ pydantic = "^1.3"
3333
"ruamel.yaml" = "^0.15.89"
3434
hunspell = {version = "^0.5.5", optional = true }
3535
python-Levenshtein = {version = "^0.12.0", optional = true }
36-
regex = {version = "^2019.12.20", optional = true }
36+
regex = {version = "^2021.9.24", optional = true }
3737
polyglot = {version = "^16.7", optional = true }
3838
pycld2 = {version = "=0.31", optional = true }
3939
PyICU = {version = "^2.4", optional = true }
4040
pymorphy2-dicts-ru = {version = "^2.4.404381", optional = true }
4141
pymorphy2-dicts-uk = {version = "^2.4.1", optional = true }
4242
appdirs = {version = "^1.4.3", optional = true }
4343
fastapi = {version = "^0.45.0", optional = true }
44-
uvicorn = {version = "^0.11.1", optional = true }
44+
uvicorn = {version = "^0.15.0", optional = true }
45+
numpy = "^1.20"
4546

4647
[tool.poetry.extras]
4748
deep-analysis = ["hunspell", "python-Levenshtein", "regex"]
@@ -52,7 +53,7 @@ web = ["appdirs", "fastapi", "uvicorn"]
5253

5354
[tool.poetry.dev-dependencies]
5455
pytest = "^3.8"
55-
dill = "^0.2.9"
56+
dill = "^0.3.4"
5657

5758
[tool.poetry.scripts]
5859
profanity_filter = 'profanity_filter.console:main'

tests/conftest.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class Config:
1919
censor_whole_words: bool = True
2020
deep_copy: bool = False
2121
dill: bool = False
22-
languages: Tuple[Language, ...] = ('en', )
22+
languages: Tuple[Language, ...] = ('en_core_web_sm', 'ru_core_news_sm')
2323

2424

2525
def create_profane_word_dictionaries(**kwargs) -> ProfaneWordDictionaries:
@@ -51,8 +51,8 @@ def pf(request) -> ProfanityFilter:
5151

5252
@pytest.fixture
5353
def nlp(pf) -> spacy.language.Language:
54-
nlp = spacy.load('en')
55-
nlp.add_pipe(pf.spacy_component, last=True)
54+
nlp = spacy.load('en_core_web_sm')
55+
nlp.add_pipe(pf.spacy_component('en_core_web_sm'), last=True)
5656
return nlp
5757

5858

tests/test_profanity_filter.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_from_yaml():
4141
analyses=[AnalysisType.DEEP, AnalysisType.MULTILINGUAL],
4242
censor_char='#',
4343
censor_whole_words=False,
44-
languages=['ru', 'en'],
44+
languages=['ru_core_news_sm', 'en_core_web_sm'],
4545
max_relative_distance=0.2,
4646
)
4747
config_dict = config.dict()
@@ -86,8 +86,8 @@ def test_censor_char(pf):
8686
def test_custom_profane_word_dictionaries(pf, empty_profane_word_dictionaries):
8787
assert pf.custom_profane_word_dictionaries == empty_profane_word_dictionaries
8888
profane_words = ['unicorn', 'windows']
89-
pf.custom_profane_word_dictionaries = {'en': profane_words}
90-
assert (pf.custom_profane_word_dictionaries == create_profane_word_dictionaries(en=OrderedSet(profane_words)))
89+
pf.custom_profane_word_dictionaries = {'en_core_web_sm': profane_words}
90+
assert (pf.custom_profane_word_dictionaries == create_profane_word_dictionaries(en_core_web_sm=OrderedSet(profane_words)))
9191
assert pf.censor_word('unicorn') == Word(uncensored='unicorn', censored='*******', original_profane_word='unicorn')
9292
assert pf.censor_word('windows') == Word(uncensored='windows', censored='*******', original_profane_word='windows')
9393
assert pf.censor_word('fuck') == Word(uncensored='fuck', censored='fuck', original_profane_word=None)
@@ -97,28 +97,28 @@ def test_custom_profane_word_dictionaries(pf, empty_profane_word_dictionaries):
9797
def test_extra_profane_word_dictionaries(pf, empty_profane_word_dictionaries):
9898
assert pf.extra_profane_word_dictionaries == empty_profane_word_dictionaries
9999
extra_profane_words = ['hey', 'like']
100-
pf.extra_profane_word_dictionaries = {'en': extra_profane_words}
101-
assert (pf.extra_profane_word_dictionaries == create_profane_word_dictionaries(en=OrderedSet(extra_profane_words)))
100+
pf.extra_profane_word_dictionaries = {'en_core_web_sm': extra_profane_words}
101+
assert (pf.extra_profane_word_dictionaries == create_profane_word_dictionaries(en_core_web_sm=OrderedSet(extra_profane_words)))
102102
assert pf.censor_word('hey') == Word(uncensored='hey', censored='***', original_profane_word='hey')
103103
assert pf.censor_word('like') == Word(uncensored='like', censored='****', original_profane_word='like')
104104
assert pf.censor_word('fuck') == Word(uncensored='fuck', censored='****', original_profane_word='fuck')
105105

106106

107107
@with_config(TestConfig())
108108
def test_restore_words(pf, empty_profane_word_dictionaries):
109-
pf.custom_profane_word_dictionaries = {'en': ['cupcakes']}
110-
pf.extra_profane_word_dictionaries = {'en': ['dibs']}
109+
pf.custom_profane_word_dictionaries = {'en_core_web_sm': ['cupcakes']}
110+
pf.extra_profane_word_dictionaries = {'en_core_web_sm': ['dibs']}
111111
pf.restore_profane_word_dictionaries()
112112
assert pf.custom_profane_word_dictionaries == empty_profane_word_dictionaries
113113
assert pf.extra_profane_word_dictionaries == empty_profane_word_dictionaries
114114
profane_word_dictionaries = pf.profane_word_dictionaries
115-
assert 'dibs' not in profane_word_dictionaries['en']
116-
assert 'cupcakes' not in profane_word_dictionaries['en']
115+
assert 'dibs' not in profane_word_dictionaries['en_core_web_sm']
116+
assert 'cupcakes' not in profane_word_dictionaries['en_core_web_sm']
117117

118118

119119
@with_config(TestConfig())
120120
def test_tokenization(pf):
121-
pf.custom_profane_word_dictionaries = {'en': ['chocolate']}
121+
pf.custom_profane_word_dictionaries = {'en_core_web_sm': ['chocolate']}
122122
assert pf.censor(TEST_STATEMENT) == "Hey, I like unicorns, *********, oranges and man's blood, turd!"
123123

124124

@@ -182,22 +182,22 @@ def test_deep_analysis_with_censor_whole_words_false(pf):
182182
assert pf.censor_word('h0r1h0r1') == Word(uncensored='h0r1h0r1', censored='***1***1', original_profane_word='h0r')
183183

184184

185-
@with_config(TestConfig(languages=('ru', 'en')))
185+
@with_config(TestConfig(languages=('ru_core_news_sm', 'en_core_web_sm')))
186186
def test_languages(pf):
187-
assert pf.languages == OrderedSet(['ru', 'en'])
188-
assert pf.languages_str == 'ru, en'
187+
assert pf.languages == OrderedSet(['ru_core_news_sm', 'en_core_web_sm'])
188+
assert pf.languages_str == 'ru_core_news_sm, en_core_web_sm'
189189

190190

191-
@with_config(TestConfig(languages=('ru', 'en')))
191+
@with_config(TestConfig(languages=('ru_core_news_sm', 'en_core_web_sm')))
192192
def test_russian(pf):
193193
assert pf.censor_word('бля') == Word(uncensored='бля', censored='***', original_profane_word='бля')
194194

195195

196-
@with_config(TestConfig(analyses=frozenset([AnalysisType.DEEP]), languages=('ru', 'en')))
196+
@with_config(TestConfig(analyses=frozenset([AnalysisType.DEEP]), languages=('ru_core_news_sm', 'en_core_web_sm')))
197197
def test_russian_deep_analysis(pf):
198198
assert pf.censor_word('бл@ка') == Word(uncensored='бл@ка', censored='*****', original_profane_word='бля')
199199

200200

201-
@with_config(TestConfig(analyses=frozenset([AnalysisType.MULTILINGUAL]), languages=('ru', 'en')))
201+
@with_config(TestConfig(analyses=frozenset([AnalysisType.MULTILINGUAL]), languages=('ru_core_news_sm', 'en_core_web_sm')))
202202
def test_multilingual(pf):
203203
assert pf.censor("Да бля, это просто shit какой-то!") == "Да ***, это просто **** какой-то!"

0 commit comments

Comments (0)