Skip to content

Commit 63a9a6d

Browse files
committed
feature: 增加高级匿名化算子
1 parent 701926a commit 63a9a6d

File tree

9 files changed

+178
-43
lines changed

9 files changed

+178
-43
lines changed

runtime/ops/mapper/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ def _import_operators():
4646
from . import img_resize
4747
from . import remove_duplicate_sentences
4848
from . import knowledge_relation_slice
49+
from . import pii_ner_detection
4950

5051

5152
_import_operators()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from datamate.core.base_op import OPERATORS
2+
3+
# Register the operator under the name 'PiiDetector', pointing the registry
# at the module path that contains its implementation.
OPERATORS.register_module(
    module_name='PiiDetector',
    module_path='ops.mapper.pii_ner_detection.process',
)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import presidio_analyzer as analyzer
2+
3+
# Recognizer for Chinese ID-card numbers (18-character and 15-digit forms).
#
# The pattern is delimited with ASCII-only lookarounds rather than ``\b``.
# In a Unicode pattern, CJK characters count as ``\w``, so ``\b`` does not
# match between a Chinese character and a digit; with ``\b`` an ID written
# directly after Chinese text (no space) was never detected. The lookarounds
# accept everything the ``\b`` version accepted, plus those cases.
id_recognizer = analyzer.PatternRecognizer(
    supported_entity="ID_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_id_pattern",
            regex=(
                # 18 characters: 6 digits, year 19xx/20xx, month, day,
                # 3 digits, then a final digit or X/x.
                r"(?<![A-Za-z0-9_])[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dXx](?![A-Za-z0-9_])"
                # 15 digits: 8 digits, month, day, 3 digits.
                r"|(?<![A-Za-z0-9_])[1-9]\d{7}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}(?![A-Za-z0-9_])"
            ),
            score=0.9,
        )
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["身份证", "身份证明", "身份证号", "证件号码"],
)
16+
17+
# Recognizer for Chinese phone numbers (mobile and landline patterns).
#
# Both patterns use ASCII-only lookarounds instead of ``\b``. In a Unicode
# pattern, CJK characters count as ``\w``, so ``\b`` does not match between a
# Chinese character and a digit, and numbers written directly after Chinese
# text (e.g. "手机13812345678") were missed. The lookarounds accept everything
# the ``\b`` version accepted, plus those cases.
phone_recognizer = analyzer.PatternRecognizer(
    supported_entity="Phone_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_mobile_pattern",
            # 11 digits: "1", a digit in 3-9, then nine more digits.
            regex=r"(?<![A-Za-z0-9_])(1[3-9]\d{9})(?![A-Za-z0-9_])",
            score=0.85,
        ),
        analyzer.Pattern(
            name="china_landline_pattern",
            # "0" plus 2-3 digits, an optional hyphen, then 7-8 digits.
            regex=r"(?<![A-Za-z0-9_])(0\d{2,3}-?\d{7,8})(?![A-Za-z0-9_])",
            score=0.8,
        ),
    ],
    # Context words passed to the recognizer alongside the patterns.
    context=["电话", "手机", "联系方式", "联系电话"],
)
35+
36+
# Recognizer for Chinese postal codes (six digits, first digit non-zero).
#
# Delimited with ASCII-only lookarounds instead of ``\b``. In a Unicode
# pattern, CJK characters count as ``\w``, so ``\b`` does not match between a
# Chinese character and a digit, and codes written directly after Chinese
# text (e.g. "邮编100080") were missed. The lookarounds accept everything the
# ``\b`` version accepted, plus those cases.
zipcode_recognizer = analyzer.PatternRecognizer(
    supported_entity="ZIPCODE_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_zipcode_pattern",
            regex=r"(?<![A-Za-z0-9_])[1-9]\d{5}(?![A-Za-z0-9_])",
            score=0.7,
        )
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["邮编", "邮政编码", "邮编号码"],
)
49+
50+
# URL recognizer that also accepts Chinese-character domain names.
#
# The leading delimiter is an ASCII-only lookbehind instead of ``\b``. In a
# Unicode pattern, CJK characters count as ``\w``, so a leading ``\b`` does
# not match between a Chinese character and "http"/"www", and URLs written
# directly after Chinese text (e.g. "网址http://example.com") were missed.
# The trailing ``\b`` is kept unchanged because it controls where the greedy
# ``\S*`` tail stops.
# NOTE(review): ``\S*`` also consumes Chinese characters that directly follow
# a URL with no whitespace in between — confirm whether that is acceptable.
url_recognizer = analyzer.PatternRecognizer(
    supported_entity="URL",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="url_pattern",
            regex=r"(?<![A-Za-z0-9_])((?:https?://|www\.)[\w-]+\.[\w-]+\S*|(?:https?://|www\.)[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\S*)\b",
            score=0.9,
        )
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["网址", "链接", "网站", "网页"],
)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
name: '高级匿名化'
2+
language: 'Python'
3+
vendor: 'others'
4+
raw_id: 'PiiDetector'
5+
version: '1.0.0'
6+
description: '高级匿名化算子,检测命名实体并匿名化。'
7+
modal: 'text'
8+
inputs: 'text'
9+
outputs: 'text'
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import presidio_analyzer as analyzer
2+
import presidio_anonymizer as anonymizer
3+
import spacy
4+
5+
from datamate.core.base_op import Mapper
6+
7+
from .custom_entities import id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer
8+
9+
10+
class PiiDetector(Mapper):
    """Mapper that detects PII in a sample's text and anonymizes it.

    Detection uses a presidio ``AnalyzerEngine`` backed by the spaCy model
    ``zh_core_web_sm``, extended with the custom ID, phone, zip-code and URL
    recognizers from ``custom_entities``. Replacement is done by a presidio
    ``AnonymizerEngine`` with its default settings.
    """

    custom_ops = True

    def __init__(self, *args, **kwargs):
        """Store the analysis language and build the engines.

        Args:
            *args: Forwarded to ``Mapper.__init__`` and ``init_model``.
            **kwargs: Forwarded likewise. ``support_language`` (default
                ``"zh"``) is the language code passed to the analyzer in
                ``execute``.
        """
        super().__init__(*args, **kwargs)
        # NOTE(review): the engines below are built for "zh" only, so any
        # other value here is presumably rejected at analysis time — confirm.
        self.support_language = kwargs.get("support_language", "zh")

        self.nlp_engine = None
        self.text_analyzer = None
        self.anom = None

        self.init_model(*args, **kwargs)

    def init_model(self, *args, **kwargs):
        """Create the NLP engine, the analyzer and the anonymizer."""
        # The loaded pipeline is discarded.
        # NOTE(review): presumably a fail-fast check that the model is
        # installed; the provider below loads the model again — confirm
        # whether this extra load is needed.
        spacy.load("zh_core_web_sm")
        provider = analyzer.nlp_engine.NlpEngineProvider(
            nlp_configuration={
                "nlp_engine_name": "spacy",
                "models": [
                    {"lang_code": "zh", "model_name": "zh_core_web_sm"}
                ]
            }
        )

        # Create the NLP engine.
        self.nlp_engine = provider.create_engine()

        # Initialise the AnalyzerEngine and register the recognizers.
        self.text_analyzer = analyzer.AnalyzerEngine(nlp_engine=self.nlp_engine, supported_languages=["zh"])
        # NOTE(review): called without language arguments; check against the
        # presidio docs whether the AnalyzerEngine constructor already loads
        # the predefined recognizers, which would make this call redundant.
        self.text_analyzer.registry.load_predefined_recognizers()
        for recognizer in [id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer]:
            self.text_analyzer.registry.add_recognizer(recognizer)

        # Initialise the AnonymizerEngine.
        self.anom = anonymizer.AnonymizerEngine()

    def execute(self, sample):
        """Anonymize the PII found in ``sample['text']``.

        Args:
            sample: Mapping for the sample being processed; its ``'text'``
                entry is read and overwritten with the anonymized text.

        Returns:
            The same ``sample`` object. It is returned unchanged when it has
            no text or the text is empty.
        """
        self.read_file_first(sample)
        text = sample.get('text')
        if not text:
            # Nothing to analyze; avoid handing None or "" to the NLP engine.
            return sample
        analyzer_results = self.text_analyzer.analyze(text=text, language=self.support_language)
        res = self.anom.anonymize(text=text, analyzer_results=analyzer_results)
        sample['text'] = res.text
        return sample

runtime/ops/pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name = "ops"
33
version = "0.0.1"
44
description = "Add your description here"
55
readme = "README.md"
6-
requires-python = ">=3.12"
6+
requires-python = ">=3.11"
77
dependencies = [
88
"beautifulsoup4>=4.14.3",
99
"datasketch>=1.8.0",
@@ -18,11 +18,14 @@ dependencies = [
1818
"paddleocr==2.8.1",
1919
"paddlepaddle==2.6.2",
2020
"pandas==1.5.3",
21+
"presidio-analyzer==2.2.25",
22+
"presidio-anonymizer==2.2.25",
2123
"pycryptodome>=3.23.0",
2224
"pymysql>=1.1.2",
2325
"python-docx>=1.2.0",
2426
"pytz>=2025.2",
2527
"six>=1.17.0",
28+
"spacy==3.7.0",
2629
"sqlalchemy>=2.0.44",
2730
"xmltodict>=1.0.2",
2831
"zhconv>=1.4.3",

scripts/db/data-cleaning-init.sql

Lines changed: 40 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -59,44 +59,45 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清
5959
('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板');
6060

6161
INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
62-
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
63-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null),
64-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null),
65-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null),
66-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 6, null),
67-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 7, null),
68-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 8, null),
69-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 9, null),
70-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 10, null),
71-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 11, null),
72-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 12, null),
73-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 13, null),
74-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 14, null),
75-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 15, null),
76-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 16, null),
77-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 17, null),
78-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 18, null),
79-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 19, null),
80-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 20, null),
81-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 21, null),
82-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 22, null),
83-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 23, null),
84-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null),
85-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null),
86-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null),
87-
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null);
62+
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 1, null),
63+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 2, null),
64+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 3, null),
65+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 4, null),
66+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 5, null),
67+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 6, null),
68+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 7, null),
69+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 8, null),
70+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 9, null),
71+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 10, null),
72+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 11, null),
73+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 12, null),
74+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 13, null),
75+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 14, null),
76+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 15, null),
77+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 16, null),
78+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 17, null),
79+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 18, null),
80+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 19, null),
81+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 20, null),
82+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 21, null),
83+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 22, null),
84+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 23, null),
85+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 24, null),
86+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 25, null),
87+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 26, null),
88+
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PiiDetector', 27, null);
8889

8990
INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
90-
VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
91-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 3, null),
92-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 4, null),
93-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 5, null),
94-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 6, null),
95-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 7, null),
96-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 8, null),
97-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 9, null),
98-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 10, null),
99-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 11, null),
100-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDirectionCorrect', 12, null),
101-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 13, null),
102-
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 14, null);
91+
VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 1, null),
92+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 2, null),
93+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 3, null),
94+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 4, null),
95+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 5, null),
96+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 6, null),
97+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 7, null),
98+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 8, null),
99+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 9, null),
100+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 10, null),
101+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDirectionCorrect', 11, null),
102+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, null),
103+
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null);

scripts/db/data-operator-init.sql

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,8 @@ VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取P
106106
('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 'false'),
107107
('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', null, '{"similarThreshold": {"name": "相似度", "description": "相似度取值越大,图片相似度越高。", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', 'false'),
108108
('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false'),
109-
('ImgDirectionCorrect', '图片方向校正', '将含有文字的图片校正到文字水平方向,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false');
109+
('ImgDirectionCorrect', '图片方向校正', '将含有文字的图片校正到文字水平方向,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'),
110+
('PiiDetector', '高级匿名化', '高级匿名化算子,检测命名实体并匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false');
110111

111112
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
112113
SELECT c.id, o.id
@@ -119,7 +120,8 @@ AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFil
119120
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
120121
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
121122
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
122-
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter');
123+
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter',
124+
'PiiDetector');
123125

124126
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
125127
SELECT c.id, o.id

scripts/images/runtime/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ WORKDIR /opt/runtime
2121

2222
RUN --mount=type=cache,target=/root/.cache/uv \
2323
uv pip install -e . --system \
24-
&& uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system
24+
&& uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
25+
&& python -m spacy download zh_core_web_sm
2526

2627
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
2728
&& chmod +x /opt/runtime/start.sh \

0 commit comments

Comments
 (0)