Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ingestion/src/metadata/pii/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
LANGUAGE_MODEL_MAPPING = defaultdict(
lambda: SPACY_MULTILANG_MODEL,
{
ClassificationLanguage.any: SPACY_EN_MODEL,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Bug: Mapping any to English model contradicts PR description

The PR description states: "The multilingual spaCy model (xx_ent_wiki_sm) is mapped to ClassificationLanguage.any in LANGUAGE_MODEL_MAPPING". However, the code maps ClassificationLanguage.any to SPACY_EN_MODEL (en_core_web_md), not to SPACY_MULTILANG_MODEL (xx_ent_wiki_sm).

When load_nlp_engine(classification_language=ClassificationLanguage.any) is called from tag_processor.py, it creates SpacyNlpEngine(models=[{"lang_code": "any", "model_name": "en_core_web_md"}]). This has two issues:

  1. Wrong model: The English model is loaded instead of the multilingual model, contradicting the stated design.
  2. Invalid lang_code: "any" is not a valid ISO 639-1 code. load_nlp_engine sets supported_language = classification_language.value which is "any". When spaCy/Presidio tries to use this NLP engine with a real language code like "en" or "fr", the mismatch may cause errors.

Since "any" mode is intended to pass nlp_engine=None to the analyzer (see the related sentinel bug), the NLP engine loaded here should only ever act as a fallback. However, because of that sentinel bug it IS actually used at runtime, so this incorrect mapping has real impact.

Suggestion: If "any" mode truly shouldn't use an NLP engine, consider skipping NLP engine loading entirely for ClassificationLanguage.any in tag_processor.py (the caller of load_nlp_engine). Otherwise, map it to the multilingual model as the PR description states.

Suggested fix:

        ClassificationLanguage.any: SPACY_MULTILANG_MODEL,

Was this helpful? React with 👍 / 👎

ClassificationLanguage.ca: "ca_core_news_md",
ClassificationLanguage.zh: "zh_core_web_md",
ClassificationLanguage.hr: "hr_core_news_md",
Expand Down
76 changes: 54 additions & 22 deletions ingestion/src/metadata/pii/tag_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Optional, Sequence, final
from itertools import groupby
from typing import List, Optional, Sequence, Union, final

from presidio_analyzer import (
AnalyzerEngine,
Expand Down Expand Up @@ -94,7 +95,10 @@ def get_recognizers_by(self, target: recognizer.Target) -> list[EntityRecognizer

created = PresidioRecognizerFactory.create_recognizer(recognizer)
if created is not None:
if created.supported_language != self._language.value:
if (
self._language is not ClassificationLanguage.any
and created.supported_language != self._language.value
):
continue
recognizers.append(created)

Expand All @@ -113,37 +117,70 @@ def _column_name(self) -> str:
return self._column.name.root

def build_analyzer_with(
self, recognizers: list[EntityRecognizer]
self,
recognizers: list[EntityRecognizer],
nlp_engine: Optional[NlpEngine] = None,
) -> AnalyzerEngine:
supported_languages = [rec.supported_language for rec in recognizers]
recognizer_registry = RecognizerRegistry(
recognizers=recognizers, supported_languages=supported_languages
)
effective_nlp = nlp_engine if nlp_engine is not None else self._nlp_engine
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Bug: nlp_engine=None sentinel can't disable NLP engine in "any" mode

The build_analyzer_with method uses None as the default for nlp_engine, intending it as a sentinel meaning "use self._nlp_engine":

effective_nlp = nlp_engine if nlp_engine is not None else self._nlp_engine

In the "any" mode path (_analyze_with, line 164), the code calls self.build_analyzer_with(lang_recognizers, nlp_engine=None) intending to pass no NLP engine to AnalyzerEngine. However, because None is the sentinel for "use default", effective_nlp falls back to self._nlp_engine.

This means that in "any" mode the AnalyzerEngine is still built with the NLP engine loaded for lang_code="any" (via load_nlp_engine(classification_language=ClassificationLanguage.any) in tag_processor.py). When analyzer.analyze(value, language="en") is then called, Presidio attempts to use an NLP engine whose supported language is "any" rather than "en", which is likely to raise a language-mismatch error at runtime.

The tests don't catch this because mock_nlp_engine = MagicMock() silently accepts any call. The tests only verify that build_analyzer_with is called with nlp_engine=None, not what AnalyzerEngine actually receives.

Fix: Use a sentinel object instead of None to distinguish "no argument given" from "explicitly no NLP engine":

_UNSET = object()

def build_analyzer_with(
    self,
    recognizers: list[EntityRecognizer],
    nlp_engine: Optional[NlpEngine] = _UNSET,
) -> AnalyzerEngine:
    ...
    effective_nlp = self._nlp_engine if nlp_engine is _UNSET else nlp_engine
    ...

This allows build_analyzer_with(recs, nlp_engine=None) to actually pass None to AnalyzerEngine.

Suggested fix:

_UNSET_NLP = object()

    def build_analyzer_with(
        self,
        recognizers: list[EntityRecognizer],
        nlp_engine: Optional[NlpEngine] = _UNSET_NLP,
    ) -> AnalyzerEngine:
        supported_languages = [rec.supported_language for rec in recognizers]
        recognizer_registry = RecognizerRegistry(
            recognizers=recognizers, supported_languages=supported_languages
        )
        effective_nlp = self._nlp_engine if nlp_engine is _UNSET_NLP else nlp_engine
        return AnalyzerEngine(
            registry=recognizer_registry,
            nlp_engine=effective_nlp,
            supported_languages=supported_languages,
        )

Was this helpful? React with 👍 / 👎

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has been changed so that we always have a nlp recognizer. The prev implementation would've broken if a single spacy model was found in the recognizer set

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the clarification! I understand the design decision now - maintaining an NLP recognizer prevents issues when spaCy models are present in the recognizer set.

Details

Your approach ensures that even in "any" mode, if there are recognizers that expect an NLP engine, they'll have one available. This prevents runtime errors that would occur if pattern-only recognizers were mixed with NLP-dependent recognizers and no NLP engine was provided.

The current implementation with nlp_engine if nlp_engine is not None else self._nlp_engine at tag_analyzer.py:128 achieves this by always providing an NLP engine (either the explicitly passed one or the instance default), which is the intended behavior based on your explanation.

return AnalyzerEngine(
registry=recognizer_registry,
nlp_engine=self._nlp_engine,
nlp_engine=effective_nlp,
supported_languages=supported_languages,
)

def _analyze_with(
self,
text_or_values: Union[str, Sequence[str]],
recognizers: list[EntityRecognizer],
context: Optional[list[str]] = None,
) -> list[RecognizerResult]:
values = (
[text_or_values]
if isinstance(text_or_values, str)
else list(text_or_values)
)
results: list[RecognizerResult] = []

if self._language is not ClassificationLanguage.any:
analyzer = self.build_analyzer_with(recognizers)
for value in values:
results.extend(
analyzer.analyze(
value,
language=self._language.value,
context=context,
return_decision_process=True,
)
)
return results

sorted_recs = sorted(recognizers, key=lambda r: r.supported_language)
for lang, group in groupby(sorted_recs, key=lambda r: r.supported_language):
lang_recognizers = list(group)
analyzer = self.build_analyzer_with(lang_recognizers, nlp_engine=None)
for value in values:
results.extend(
analyzer.analyze(
value,
language=lang,
context=context,
return_decision_process=True,
)
)
return results

def analyze_content(self, values: Sequence[str]) -> TagAnalysis:
recognizers = self.content_recognizers

if not recognizers:
return self._build_tag_analysis([], 1, recognizer.Target.content)

context = split_column_name(self._column_name)
analyzer = self.build_analyzer_with(recognizers)

results: list[RecognizerResult] = []
for value in values:
results.extend(
analyzer.analyze(
value,
language=self._language.value,
context=context,
return_decision_process=True,
)
)
results = self._analyze_with(values, recognizers, context=context)

return self._build_tag_analysis(results, len(values), recognizer.Target.content)

Expand All @@ -153,12 +190,7 @@ def analyze_column(self) -> TagAnalysis:
if not recognizers:
return self._build_tag_analysis([], 1, recognizer.Target.column_name)

analyzer = self.build_analyzer_with(recognizers)
results = analyzer.analyze(
self._column_name,
language=self._language.value,
return_decision_process=True,
)
results = self._analyze_with(self._column_name, recognizers)

return self._build_tag_analysis(results, 1, recognizer.Target.column_name)

Expand Down
Loading