Skip to content

Commit 6dccf34

Browse files
committed
wip
1 parent a8d7c01 commit 6dccf34

File tree

15 files changed

+85
-97
lines changed

15 files changed

+85
-97
lines changed

ingestion/src/_openmetadata_testutils/factories/metadata/generated/schema/type/recognizer.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from metadata.generated.schema.type.customRecognizer import CustomRecognizer
2020
from metadata.generated.schema.type.exactTermsRecognizer import ExactTermsRecognizer
2121
from metadata.generated.schema.type.patternRecognizer import PatternRecognizer
22-
from metadata.generated.schema.type.piiEntity import PIIEntity
2322
from metadata.generated.schema.type.predefinedRecognizer import Name as PredefinedName
2423
from metadata.generated.schema.type.predefinedRecognizer import PredefinedRecognizer
2524
from metadata.generated.schema.type.recognizer import (
@@ -54,7 +53,6 @@ class PatternRecognizerFactory(factory.Factory):
5453
patterns = factory.List([factory.SubFactory(PatternFactory)])
5554
regexFlags = factory.SubFactory(RegexFlagsFactory)
5655
context = factory.LazyFunction(lambda: ["email", "contact"])
57-
supportedEntity = PIIEntity.EMAIL_ADDRESS
5856
supportedLanguage = ClassificationLanguage.en
5957

6058
class Meta:
@@ -64,7 +62,6 @@ class Meta:
6462
class ExactTermsRecognizerFactory(factory.Factory):
6563
type = "exact_terms"
6664
exactTerms = factory.LazyFunction(lambda: ["sensitive", "confidential"])
67-
supportedEntity = PIIEntity.EMAIL_ADDRESS
6865
supportedLanguage = ClassificationLanguage.en
6966
regexFlags = factory.SubFactory(RegexFlagsFactory)
7067

@@ -75,7 +72,6 @@ class Meta:
7572
class ContextRecognizerFactory(factory.Factory):
7673
type = "context"
7774
contextWords = factory.LazyFunction(lambda: ["ssn", "social security"])
78-
supportedEntity = PIIEntity.US_SSN
7975
supportedLanguage = ClassificationLanguage.en
8076
minScore = 0.4
8177
maxScore = 0.8
@@ -99,7 +95,6 @@ class Meta:
9995
class CustomRecognizerFactory(factory.Factory):
10096
type = "custom"
10197
validatorFunction = factory.fuzzy.FuzzyText()
102-
supportedEntity = PIIEntity.PERSON
10398
supportedLanguage = ClassificationLanguage.en
10499

105100
class Meta:

ingestion/src/metadata/pii/algorithms/presidio_recognizer_factory.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,15 @@ class PresidioRecognizerFactory:
4040
"""Factory for creating Presidio recognizers from OpenMetadata configurations."""
4141

4242
@staticmethod
43-
def create_recognizer(recognizer_config: Recognizer) -> Optional[EntityRecognizer]:
43+
def create_recognizer(
44+
recognizer_config: Recognizer, tag_fqn: str
45+
) -> Optional[EntityRecognizer]:
4446
"""
4547
Create a Presidio recognizer from an OpenMetadata recognizer configuration.
4648
4749
Args:
4850
recognizer_config: The recognizer configuration from OpenMetadata
49-
tag_name: The name of the tag this recognizer belongs to
51+
tag_fqn: The fully qualified name of the tag this recognizer belongs to
5052
5153
Returns:
5254
A Presidio EntityRecognizer or None if creation fails
@@ -58,15 +60,15 @@ def create_recognizer(recognizer_config: Recognizer) -> Optional[EntityRecognize
5860

5961
if isinstance(config, PatternRecognizer):
6062
recognizer = PresidioRecognizerFactory._create_pattern_recognizer(
61-
config, recognizer_config
63+
config, recognizer_config, tag_fqn
6264
)
6365
elif isinstance(config, ExactTermsRecognizer):
6466
recognizer = PresidioRecognizerFactory._create_exact_terms_recognizer(
65-
config, recognizer_config
67+
config, recognizer_config, tag_fqn
6668
)
6769
elif isinstance(config, ContextRecognizer):
6870
recognizer = PresidioRecognizerFactory._create_context_recognizer(
69-
config, recognizer_config
71+
config, recognizer_config, tag_fqn
7072
)
7173
elif isinstance(config, CustomRecognizer):
7274
recognizer = PresidioRecognizerFactory._create_custom_recognizer(
@@ -107,6 +109,7 @@ def _get_regex_flags(flags: Optional[RegexFlags]) -> Optional[int]:
107109
def _create_pattern_recognizer(
108110
config: PatternRecognizer,
109111
recognizer_config: Recognizer,
112+
tag_fqn: str,
110113
) -> PresidioPatternRecognizer:
111114
"""Create a pattern-based recognizer."""
112115
patterns: List[PresidioPattern] = []
@@ -120,7 +123,7 @@ def _create_pattern_recognizer(
120123
)
121124

122125
return PresidioPatternRecognizer(
123-
supported_entity=config.supportedEntity.value,
126+
supported_entity=tag_fqn,
124127
patterns=patterns,
125128
name=recognizer_config.name.root,
126129
supported_language=config.supportedLanguage.value,
@@ -131,24 +134,23 @@ def _create_pattern_recognizer(
131134

132135
@staticmethod
133136
def _create_exact_terms_recognizer(
134-
config: ExactTermsRecognizer, recognizer_config: Recognizer
137+
config: ExactTermsRecognizer, recognizer_config: Recognizer, tag_fqn: str
135138
) -> PresidioPatternRecognizer:
136139
"""Create an exact terms recognizer using patterns."""
137140
patterns: List[PresidioPattern] = []
138141
for value in config.exactTerms:
139-
# Escape special regex characters in the value
140142
escaped_value = re.escape(value)
141143

142144
patterns.append(
143145
PresidioPattern(
144146
name=f"exact_term_{value}",
145147
regex=escaped_value,
146-
score=0.9, # High confidence for exact matches
148+
score=0.9,
147149
)
148150
)
149151

150152
return PresidioPatternRecognizer(
151-
supported_entity=config.supportedEntity.value,
153+
supported_entity=tag_fqn,
152154
patterns=patterns,
153155
name=recognizer_config.name.root,
154156
supported_language=config.supportedLanguage.value,
@@ -159,16 +161,12 @@ def _create_exact_terms_recognizer(
159161

160162
@staticmethod
161163
def _create_context_recognizer(
162-
config: ContextRecognizer, recognizer_config: Recognizer
164+
config: ContextRecognizer, recognizer_config: Recognizer, tag_fqn: str
163165
) -> PresidioPatternRecognizer:
164166
"""Create a context-aware recognizer."""
165-
# For context recognizers, we can use a pattern recognizer with context words
166-
# or implement a custom recognizer that uses NLP
167167
context_patterns: List[PresidioPattern] = []
168168

169-
# Create patterns that look for context words near potential entities
170169
for context_word in config.contextWords:
171-
# Pattern to match words near context words
172170
pattern = f"(?i)(?:{context_word})\\s+\\w+|\\w+\\s+(?:{context_word})"
173171
context_patterns.append(
174172
PresidioPattern(
@@ -181,7 +179,7 @@ def _create_context_recognizer(
181179
)
182180

183181
return PresidioPatternRecognizer(
184-
supported_entity=config.supportedEntity.value,
182+
supported_entity=tag_fqn,
185183
patterns=context_patterns,
186184
name=recognizer_config.name.root,
187185
supported_language=config.supportedLanguage.value,
@@ -249,7 +247,9 @@ def create_recognizers_for_tag(tag: Tag) -> List[EntityRecognizer]:
249247
return recognizers
250248

251249
for recognizer_config in tag.recognizers:
252-
recognizer = PresidioRecognizerFactory.create_recognizer(recognizer_config)
250+
recognizer = PresidioRecognizerFactory.create_recognizer(
251+
recognizer_config, cast(str, tag.fullyQualifiedName)
252+
)
253253
if recognizer:
254254
recognizers.append(recognizer)
255255
logger.info(

ingestion/src/metadata/pii/scanners/custom_ner_scanner.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def _find_matching_tags(
198198
Find tags whose fully qualified name equals the given entity type.
199199
200200
Args:
201-
entity_type: The entity type detected by Presidio
201+
entity_type: The entity type detected by Presidio; compared against each tag's fully qualified name, which recognizers use as their supported entity
202202
classification_fqn: Optional classification to filter tags
203203
204204
Returns:
@@ -216,13 +216,8 @@ def _find_matching_tags(
216216
if not tag.autoClassificationEnabled or not tag.recognizers:
217217
continue
218218

219-
for recognizer in tag.recognizers:
220-
if recognizer.recognizerConfig and hasattr(
221-
recognizer.recognizerConfig, "supportedEntity"
222-
):
223-
if recognizer.recognizerConfig.supportedEntity == entity_type:
224-
matching_tags.append(tag)
225-
break
219+
if tag.fullyQualifiedName == entity_type:
220+
matching_tags.append(tag)
226221

227222
return matching_tags
228223

ingestion/tests/unit/metadata/pii/conftest.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ def email_tag_pii(pii_classification: Classification) -> Tag:
9393
email_pattern_recognizer = PatternRecognizerFactory.create(
9494
patterns=[email_pattern],
9595
context=[],
96-
supportedEntity=PIIEntity.EMAIL_ADDRESS,
9796
supportedLanguage="en",
9897
)
9998

@@ -128,7 +127,6 @@ def phone_tag_pii(pii_classification: Classification) -> Tag:
128127
phone_pattern_recognizer = PatternRecognizerFactory.create(
129128
patterns=[phone_pattern_1, phone_pattern_2],
130129
context=[],
131-
supportedEntity=PIIEntity.PHONE_NUMBER,
132130
supportedLanguage="en",
133131
)
134132

ingestion/tests/unit/metadata/pii/test_tag_processor.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,6 @@ def phone_tag(self, pii_classification: Classification) -> Tag:
135135
)
136136
phone_pattern_recognizer = PatternRecognizerFactory.create(
137137
patterns=[phone_pattern],
138-
supportedEntity=PIIEntity.PHONE_NUMBER,
139138
context=[],
140139
supportedLanguage=ClassificationLanguage.en,
141140
)
@@ -336,7 +335,6 @@ def test_mixed_pii_data_chooses_highest_confidence(
336335
)
337336
mixed_pattern_recognizer = PatternRecognizerFactory.create(
338337
patterns=[email_pattern, name_pattern],
339-
supportedEntity=PIIEntity.EMAIL_ADDRESS,
340338
context=[],
341339
supportedLanguage=ClassificationLanguage.en,
342340
)
@@ -431,7 +429,6 @@ def test_ssn_classification_with_custom_analyzer(
431429
)
432430
ssn_pattern_recognizer = PatternRecognizerFactory.create(
433431
patterns=[ssn_pattern],
434-
supportedEntity=PIIEntity.US_SSN,
435432
context=[],
436433
supportedLanguage=ClassificationLanguage.en,
437434
)

ingestion/tests/unit/metadata/pii/test_tag_processor_integration.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,6 @@ def general_password_tag(
186186
password_pattern_recognizer = PatternRecognizerFactory.create(
187187
patterns=[pwd_pattern],
188188
context=[],
189-
supportedEntity=PIIEntity.PERSON,
190189
supportedLanguage="en",
191190
)
192191
recognizer = RecognizerFactory.create(
@@ -210,7 +209,6 @@ def techdetail_secret_tag(self, techdetail_classification: Classification):
210209
secret_pattern_recognizer = PatternRecognizerFactory.create(
211210
patterns=[secret_pattern],
212211
context=[],
213-
supportedEntity=PIIEntity.PERSON,
214212
supportedLanguage="en",
215213
)
216214
recognizer = RecognizerFactory.create(

ingestion/tests/unit/metadata/pii/test_tag_scoring.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ def email_tag(
7979
)
8080
email_pattern_recognizer = PatternRecognizerFactory.create(
8181
patterns=[email_pattern],
82-
supportedEntity=PIIEntity.EMAIL_ADDRESS,
8382
context=[],
8483
supportedLanguage=ClassificationLanguage.en,
8584
)
@@ -104,7 +103,6 @@ def email_tag(
104103
)
105104
column_name_pattern_recognizer = PatternRecognizerFactory.create(
106105
patterns=[column_name_pattern],
107-
supportedEntity=PIIEntity.EMAIL_ADDRESS,
108106
context=[],
109107
supportedLanguage=ClassificationLanguage.en,
110108
)
@@ -135,7 +133,6 @@ def phone_tag(self, pii_classification: Classification) -> Tag:
135133
)
136134
phone_pattern_recognizer = PatternRecognizerFactory.create(
137135
patterns=[phone_pattern],
138-
supportedEntity=PIIEntity.PHONE_NUMBER,
139136
context=[],
140137
supportedLanguage=ClassificationLanguage.en,
141138
)
@@ -304,7 +301,6 @@ def email_tag(self) -> Tag:
304301
)
305302
email_pattern_recognizer = PatternRecognizerFactory.create(
306303
patterns=[email_pattern],
307-
supportedEntity=PIIEntity.EMAIL_ADDRESS,
308304
context=[],
309305
supportedLanguage=ClassificationLanguage.en,
310306
)
@@ -371,7 +367,6 @@ def test_analyze_column_name(self, email_tag, nlp_engine):
371367
)
372368
column_name_pattern_recognizer = PatternRecognizerFactory.create(
373369
patterns=[column_name_pattern],
374-
supportedEntity=PIIEntity.EMAIL_ADDRESS,
375370
regexFlags__ignoreCase=True,
376371
context=[],
377372
supportedLanguage=ClassificationLanguage.en,
@@ -413,7 +408,6 @@ def test_disabled_auto_classification(self, column, nlp_engine):
413408
test_pattern = PatternFactory.create(name="test", regex=".*", score=1.0)
414409
test_pattern_recognizer = PatternRecognizerFactory.create(
415410
patterns=[test_pattern],
416-
supportedEntity=PIIEntity.EMAIL_ADDRESS,
417411
context=[],
418412
supportedLanguage=ClassificationLanguage.en,
419413
)

0 commit comments

Comments
 (0)