-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Any language classification #25875
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Any language classification #25875
Changes from 3 commits
443dd3f
0992955
f686b3d
955ba18
ff887a6
c0fe9db
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| from typing import List, Optional, Sequence, final | ||
| from itertools import groupby | ||
| from typing import List, Optional, Sequence, Union, final | ||
|
|
||
| from presidio_analyzer import ( | ||
| AnalyzerEngine, | ||
|
|
@@ -94,7 +95,10 @@ def get_recognizers_by(self, target: recognizer.Target) -> list[EntityRecognizer | |
|
|
||
| created = PresidioRecognizerFactory.create_recognizer(recognizer) | ||
| if created is not None: | ||
| if created.supported_language != self._language.value: | ||
| if ( | ||
| self._language is not ClassificationLanguage.any | ||
| and created.supported_language != self._language.value | ||
| ): | ||
| continue | ||
| recognizers.append(created) | ||
|
|
||
|
|
@@ -113,37 +117,70 @@ def _column_name(self) -> str: | |
| return self._column.name.root | ||
|
|
||
| def build_analyzer_with( | ||
| self, recognizers: list[EntityRecognizer] | ||
| self, | ||
| recognizers: list[EntityRecognizer], | ||
| nlp_engine: Optional[NlpEngine] = None, | ||
| ) -> AnalyzerEngine: | ||
| supported_languages = [rec.supported_language for rec in recognizers] | ||
| recognizer_registry = RecognizerRegistry( | ||
| recognizers=recognizers, supported_languages=supported_languages | ||
| ) | ||
| effective_nlp = nlp_engine if nlp_engine is not None else self._nlp_engine | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| return AnalyzerEngine( | ||
| registry=recognizer_registry, | ||
| nlp_engine=self._nlp_engine, | ||
| nlp_engine=effective_nlp, | ||
| supported_languages=supported_languages, | ||
| ) | ||
|
|
||
| def _analyze_with( | ||
| self, | ||
| text_or_values: Union[str, Sequence[str]], | ||
| recognizers: list[EntityRecognizer], | ||
| context: Optional[list[str]] = None, | ||
| ) -> list[RecognizerResult]: | ||
| values = ( | ||
| [text_or_values] | ||
| if isinstance(text_or_values, str) | ||
| else list(text_or_values) | ||
| ) | ||
| results: list[RecognizerResult] = [] | ||
|
|
||
| if self._language is not ClassificationLanguage.any: | ||
| analyzer = self.build_analyzer_with(recognizers) | ||
| for value in values: | ||
| results.extend( | ||
| analyzer.analyze( | ||
| value, | ||
| language=self._language.value, | ||
| context=context, | ||
| return_decision_process=True, | ||
| ) | ||
| ) | ||
| return results | ||
|
|
||
| sorted_recs = sorted(recognizers, key=lambda r: r.supported_language) | ||
| for lang, group in groupby(sorted_recs, key=lambda r: r.supported_language): | ||
| lang_recognizers = list(group) | ||
| analyzer = self.build_analyzer_with(lang_recognizers, nlp_engine=None) | ||
| for value in values: | ||
| results.extend( | ||
| analyzer.analyze( | ||
| value, | ||
| language=lang, | ||
| context=context, | ||
| return_decision_process=True, | ||
| ) | ||
| ) | ||
edg956 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return results | ||
|
|
||
| def analyze_content(self, values: Sequence[str]) -> TagAnalysis: | ||
| recognizers = self.content_recognizers | ||
|
|
||
| if not recognizers: | ||
| return self._build_tag_analysis([], 1, recognizer.Target.content) | ||
|
|
||
| context = split_column_name(self._column_name) | ||
| analyzer = self.build_analyzer_with(recognizers) | ||
|
|
||
| results: list[RecognizerResult] = [] | ||
| for value in values: | ||
| results.extend( | ||
| analyzer.analyze( | ||
| value, | ||
| language=self._language.value, | ||
| context=context, | ||
| return_decision_process=True, | ||
| ) | ||
| ) | ||
| results = self._analyze_with(values, recognizers, context=context) | ||
|
|
||
| return self._build_tag_analysis(results, len(values), recognizer.Target.content) | ||
|
|
||
|
|
@@ -153,12 +190,7 @@ def analyze_column(self) -> TagAnalysis: | |
| if not recognizers: | ||
| return self._build_tag_analysis([], 1, recognizer.Target.column_name) | ||
|
|
||
| analyzer = self.build_analyzer_with(recognizers) | ||
| results = analyzer.analyze( | ||
| self._column_name, | ||
| language=self._language.value, | ||
| return_decision_process=True, | ||
| ) | ||
| results = self._analyze_with(self._column_name, recognizers) | ||
|
|
||
| return self._build_tag_analysis(results, 1, recognizer.Target.column_name) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`any` to English model contradicts PR description

The PR description states: "The multilingual spaCy model (`xx_ent_wiki_sm`) is mapped to `ClassificationLanguage.any` in `LANGUAGE_MODEL_MAPPING`". However, the code maps `ClassificationLanguage.any` to `SPACY_EN_MODEL` (`en_core_web_md`), not to `SPACY_MULTILANG_MODEL` (`xx_ent_wiki_sm`).

When `load_nlp_engine(classification_language=ClassificationLanguage.any)` is called from `tag_processor.py`, it creates `SpacyNlpEngine(models=[{"lang_code": "any", "model_name": "en_core_web_md"}])`. This has two issues:

1. `lang_code: "any"` is not a valid ISO 639-1 code.
2. `load_nlp_engine` sets `supported_language = classification_language.value`, which is `"any"`. When spaCy/Presidio tries to use this NLP engine with a real language code like `"en"` or `"fr"`, the mismatch may cause errors.

Since "any" mode uses `nlp_engine=None` in the analyzer (or at least intends to — see related sentinel bug), the NLP engine loaded here is only used as a fallback. But given the sentinel bug, it IS actually used, making this mapping impactful.

Suggestion: If "any" mode truly shouldn't use an NLP engine, consider skipping NLP engine loading entirely for `any` in the caller. Otherwise, map to the multilingual model as the PR description states.

Suggested fix:
Was this helpful? React with 👍 / 👎