Skip to content

Commit 362177b

Browse files
author
Ruslan Gareev
committed
Update spaCy and NumPy versions. Fix usages of the changed API.
1 parent 1af0ecd commit 362177b

File tree

9 files changed

+697
-540
lines changed

9 files changed

+697
-540
lines changed

poetry.lock

Lines changed: 653 additions & 509 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

profanity_filter/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class Config(BaseModel):
1616
cache_redis_connection_url: Optional[str] = None
1717
censor_char: str = '*'
1818
censor_whole_words: bool = True
19-
languages: List[Language] = ['en']
19+
languages: List[Language] = ['en_core_web_sm']
2020
max_relative_distance: float = 0.34
2121

2222
@classmethod
File renamed without changes.
File renamed without changes.

profanity_filter/profanity_filter.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,14 @@ def __init__(self,
9797
morphs: Optional[Morphs] = None,
9898
nlps: Optional[Nlps] = None,
9999
spells: Optional[Spells] = None,
100+
data_dir: Optional[Path] = None
100101
):
101102
# Path to data dir
102-
self._BASE_DIR = Path(__file__).absolute().parent
103-
self._DATA_DIR: Path = self._BASE_DIR / 'data'
103+
if not data_dir:
104+
self._BASE_DIR = Path(__file__).absolute().parent
105+
self._DATA_DIR: Path = self._BASE_DIR / 'data'
106+
else:
107+
self._DATA_DIR: Path = data_dir
104108

105109
self._MAX_MAX_DISTANCE = 3
106110

@@ -212,11 +216,17 @@ def is_profane(self, text: str) -> bool:
212216
"""Returns True if input_text contains any profane words, False otherwise"""
213217
return self._censor(text=text, return_bool=True)
214218

215-
@cached_property
216-
def spacy_component(self, language: Language = None) -> SpacyProfanityFilterComponent:
219+
def spacy_component(self, language: Language = None) -> str:
220+
name = 'profanity_filter'
217221
nlp = self._get_nlp(language)
218222
[language] = [language for language, nlp_ in self.nlps.items() if nlp_ == nlp]
219-
return SpacyProfanityFilterComponent(profanity_filter=self, nlp=nlp, language=language)
223+
component = SpacyProfanityFilterComponent(profanity_filter=self, nlp=nlp, language=language)
224+
225+
@spacy.language.Language.factory(name)
226+
def _spacy_component(nlp, name):
227+
return component
228+
229+
return name
220230

221231
@property
222232
def analyses(self) -> AnalysesTypes:
@@ -344,7 +354,7 @@ def nlps(self, value: Optional[Nlps]) -> None:
344354
for language in self.languages:
345355
with suppress(OSError):
346356
self._nlps[language] = spacy.load(language, disable=['parser', 'ner'])
347-
self._nlps[language].add_pipe(self.spacy_component, last=True)
357+
self._nlps[language].add_pipe(self.spacy_component(language), last=True)
348358
if not self._nlps:
349359
raise ProfanityFilterError(f"Couldn't load Spacy model for any of languages: {self.languages_str}")
350360

profanity_filter/spacy_component.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ def __init__(self, profanity_filter: 'ProfanityFilter', nlp: spacy.language.Lang
2121
self._stop_on_first_profane_word = stop_on_first_profane_word
2222

2323
# noinspection PyProtectedMember
24+
# TODO: Change getting tokens to custom tokenizer:
25+
# https://spacy.io/usage/linguistic-features#tokenization
2426
def __call__(self, doc: Doc, language: Language = None, stop_on_first_profane_word: Optional[bool] = None) -> Doc:
2527
self.register_extensions(exist_ok=True)
2628
if language is None:

pyproject.toml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ classifiers = [
2020
]
2121

2222
[tool.poetry.dependencies]
23-
python = "^3.6"
24-
spacy = "^2.0"
23+
python = "^3.8"
24+
spacy = "^3"
2525
ordered-set = "^3.0"
2626
cached-property = "^1.5"
2727
ordered-set-stubs = "^0.1.3"
@@ -33,15 +33,16 @@ pydantic = "^1.3"
3333
"ruamel.yaml" = "^0.15.89"
3434
hunspell = {version = "^0.5.5", optional = true }
3535
python-Levenshtein = {version = "^0.12.0", optional = true }
36-
regex = {version = "^2019.12.20", optional = true }
36+
regex = {version = "^2021.9.24", optional = true }
3737
polyglot = {version = "^16.7", optional = true }
3838
pycld2 = {version = "=0.31", optional = true }
3939
PyICU = {version = "^2.4", optional = true }
4040
pymorphy2-dicts-ru = {version = "^2.4.404381", optional = true }
4141
pymorphy2-dicts-uk = {version = "^2.4.1", optional = true }
4242
appdirs = {version = "^1.4.3", optional = true }
4343
fastapi = {version = "^0.45.0", optional = true }
44-
uvicorn = {version = "^0.11.1", optional = true }
44+
uvicorn = {version = "^0.15.0", optional = true }
45+
numpy = "^1.20"
4546

4647
[tool.poetry.extras]
4748
deep-analysis = ["hunspell", "python-Levenshtein", "regex"]
@@ -52,7 +53,7 @@ web = ["appdirs", "fastapi", "uvicorn"]
5253

5354
[tool.poetry.dev-dependencies]
5455
pytest = "^3.8"
55-
dill = "^0.2.9"
56+
dill = "^0.3.4"
5657

5758
[tool.poetry.scripts]
5859
profanity_filter = 'profanity_filter.console:main'

tests/conftest.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class Config:
1919
censor_whole_words: bool = True
2020
deep_copy: bool = False
2121
dill: bool = False
22-
languages: Tuple[Language, ...] = ('en', )
22+
languages: Tuple[Language, ...] = ('en_core_web_sm', 'ru_core_news_sm')
2323

2424

2525
def create_profane_word_dictionaries(**kwargs) -> ProfaneWordDictionaries:
@@ -51,8 +51,8 @@ def pf(request) -> ProfanityFilter:
5151

5252
@pytest.fixture
5353
def nlp(pf) -> spacy.language.Language:
54-
nlp = spacy.load('en')
55-
nlp.add_pipe(pf.spacy_component, last=True)
54+
nlp = spacy.load('en_core_web_sm')
55+
nlp.add_pipe(pf.spacy_component('en_core_web_sm'), last=True)
5656
return nlp
5757

5858

tests/test_profanity_filter.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_from_yaml():
4141
analyses=[AnalysisType.DEEP, AnalysisType.MULTILINGUAL],
4242
censor_char='#',
4343
censor_whole_words=False,
44-
languages=['ru', 'en'],
44+
languages=['ru_core_news_sm', 'en_core_web_sm'],
4545
max_relative_distance=0.2,
4646
)
4747
config_dict = config.dict()
@@ -86,8 +86,8 @@ def test_censor_char(pf):
8686
def test_custom_profane_word_dictionaries(pf, empty_profane_word_dictionaries):
8787
assert pf.custom_profane_word_dictionaries == empty_profane_word_dictionaries
8888
profane_words = ['unicorn', 'windows']
89-
pf.custom_profane_word_dictionaries = {'en': profane_words}
90-
assert (pf.custom_profane_word_dictionaries == create_profane_word_dictionaries(en=OrderedSet(profane_words)))
89+
pf.custom_profane_word_dictionaries = {'en_core_web_sm': profane_words}
90+
assert (pf.custom_profane_word_dictionaries == create_profane_word_dictionaries(en_core_web_sm=OrderedSet(profane_words)))
9191
assert pf.censor_word('unicorn') == Word(uncensored='unicorn', censored='*******', original_profane_word='unicorn')
9292
assert pf.censor_word('windows') == Word(uncensored='windows', censored='*******', original_profane_word='windows')
9393
assert pf.censor_word('fuck') == Word(uncensored='fuck', censored='fuck', original_profane_word=None)
@@ -97,28 +97,28 @@ def test_custom_profane_word_dictionaries(pf, empty_profane_word_dictionaries):
9797
def test_extra_profane_word_dictionaries(pf, empty_profane_word_dictionaries):
9898
assert pf.extra_profane_word_dictionaries == empty_profane_word_dictionaries
9999
extra_profane_words = ['hey', 'like']
100-
pf.extra_profane_word_dictionaries = {'en': extra_profane_words}
101-
assert (pf.extra_profane_word_dictionaries == create_profane_word_dictionaries(en=OrderedSet(extra_profane_words)))
100+
pf.extra_profane_word_dictionaries = {'en_core_web_sm': extra_profane_words}
101+
assert (pf.extra_profane_word_dictionaries == create_profane_word_dictionaries(en_core_web_sm=OrderedSet(extra_profane_words)))
102102
assert pf.censor_word('hey') == Word(uncensored='hey', censored='***', original_profane_word='hey')
103103
assert pf.censor_word('like') == Word(uncensored='like', censored='****', original_profane_word='like')
104104
assert pf.censor_word('fuck') == Word(uncensored='fuck', censored='****', original_profane_word='fuck')
105105

106106

107107
@with_config(TestConfig())
108108
def test_restore_words(pf, empty_profane_word_dictionaries):
109-
pf.custom_profane_word_dictionaries = {'en': ['cupcakes']}
110-
pf.extra_profane_word_dictionaries = {'en': ['dibs']}
109+
pf.custom_profane_word_dictionaries = {'en_core_web_sm': ['cupcakes']}
110+
pf.extra_profane_word_dictionaries = {'en_core_web_sm': ['dibs']}
111111
pf.restore_profane_word_dictionaries()
112112
assert pf.custom_profane_word_dictionaries == empty_profane_word_dictionaries
113113
assert pf.extra_profane_word_dictionaries == empty_profane_word_dictionaries
114114
profane_word_dictionaries = pf.profane_word_dictionaries
115-
assert 'dibs' not in profane_word_dictionaries['en']
116-
assert 'cupcakes' not in profane_word_dictionaries['en']
115+
assert 'dibs' not in profane_word_dictionaries['en_core_web_sm']
116+
assert 'cupcakes' not in profane_word_dictionaries['en_core_web_sm']
117117

118118

119119
@with_config(TestConfig())
120120
def test_tokenization(pf):
121-
pf.custom_profane_word_dictionaries = {'en': ['chocolate']}
121+
pf.custom_profane_word_dictionaries = {'en_core_web_sm': ['chocolate']}
122122
assert pf.censor(TEST_STATEMENT) == "Hey, I like unicorns, *********, oranges and man's blood, turd!"
123123

124124

@@ -182,22 +182,22 @@ def test_deep_analysis_with_censor_whole_words_false(pf):
182182
assert pf.censor_word('h0r1h0r1') == Word(uncensored='h0r1h0r1', censored='***1***1', original_profane_word='h0r')
183183

184184

185-
@with_config(TestConfig(languages=('ru', 'en')))
185+
@with_config(TestConfig(languages=('ru_core_news_sm', 'en_core_web_sm')))
186186
def test_languages(pf):
187-
assert pf.languages == OrderedSet(['ru', 'en'])
188-
assert pf.languages_str == 'ru, en'
187+
assert pf.languages == OrderedSet(['ru_core_news_sm', 'en_core_web_sm'])
188+
assert pf.languages_str == 'ru_core_news_sm, en_core_web_sm'
189189

190190

191-
@with_config(TestConfig(languages=('ru', 'en')))
191+
@with_config(TestConfig(languages=('ru_core_news_sm', 'en_core_web_sm')))
192192
def test_russian(pf):
193193
assert pf.censor_word('бля') == Word(uncensored='бля', censored='***', original_profane_word='бля')
194194

195195

196-
@with_config(TestConfig(analyses=frozenset([AnalysisType.DEEP]), languages=('ru', 'en')))
196+
@with_config(TestConfig(analyses=frozenset([AnalysisType.DEEP]), languages=('ru_core_news_sm', 'en_core_web_sm')))
197197
def test_russian_deep_analysis(pf):
198198
assert pf.censor_word('бл@ка') == Word(uncensored='бл@ка', censored='*****', original_profane_word='бля')
199199

200200

201-
@with_config(TestConfig(analyses=frozenset([AnalysisType.MULTILINGUAL]), languages=('ru', 'en')))
201+
@with_config(TestConfig(analyses=frozenset([AnalysisType.MULTILINGUAL]), languages=('ru_core_news_sm', 'en_core_web_sm')))
202202
def test_multilingual(pf):
203203
assert pf.censor("Да бля, это просто shit какой-то!") == "Да ***, это просто **** какой-то!"

0 commit comments

Comments (0)