Skip to content

Commit 19a04df

Browse files
authored
feature: 增加水印去除/高级匿名化算子 (#151)
* feature: 增加水印去除算子 * feature: clean code * feature: clean code * feature: 增加高级匿名化算子
1 parent cbb146d commit 19a04df

File tree

15 files changed

+197
-274
lines changed

15 files changed

+197
-274
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,15 @@ make install-mineru
7373
```
7474

7575
### Deploy the DeerFlow service
76-
1. Modify `runtime/deer-flow/.env.example` and add configurations for SEARCH_API_KEY and the EMBEDDING model.
77-
2. Modify `runtime/deer-flow/.conf.yaml.example` and add basic model service configurations.
78-
3. Execute `make install-deer-flow`
76+
```bash
77+
make install-deer-flow
78+
```
7979

8080
### Local Development and Deployment
8181
After modifying the local code, please execute the following commands to build the image and deploy using the local image.
8282
```bash
8383
make build
84-
make install REGISTRY=""
84+
make install dev=true
8585
```
8686

8787
## 🤝 Contribution Guidelines

runtime/ops/mapper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ def _import_operators():
2323
from . import garble_characters_cleaner
2424
from . import html_tag_cleaner
2525
from . import id_number_cleaner
26-
from . import img_watermark_remove
2726
from . import invisible_characters_cleaner
2827
from . import ip_address_cleaner
2928
from . import legend_cleaner
@@ -47,6 +46,7 @@ def _import_operators():
4746
from . import img_resize
4847
from . import remove_duplicate_sentences
4948
from . import knowledge_relation_slice
49+
from . import pii_ner_detection
5050

5151

5252
_import_operators()

runtime/ops/mapper/img_direction_correct/base_model.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ class BaseModel:
1111

1212
def __init__(self, model_type='vertical'):
1313
models_path = os.getenv("MODELS_PATH", "/home/models")
14-
self.resources_path = str(Path(models_path, 'img_direction_correct', 'resources'))
1514
args = Namespace()
1615
args.cls_image_shape = '3, 224, 224'
1716
args.cls_batch_num = 6
@@ -20,13 +19,14 @@ def __init__(self, model_type='vertical'):
2019
args.use_gpu = False
2120
args.use_npu = False
2221
args.use_xpu = False
22+
args.use_mlu = False
2323
args.enable_mkldnn = False
2424
if model_type == 'vertical':
25-
args.cls_model_dir = str(Path(self.resources_path, 'vertical_model'))
25+
args.cls_model_dir = str(Path(models_path, 'ch_ppocr_mobile_v2.0_cls_infer'))
2626
self.model_name = 'standard model to detect image 0 or 90 rotated'
2727
args.label_list = ['0', '90']
2828
else:
29-
args.cls_model_dir = str(Path(self.resources_path, 'standard_model'))
29+
args.cls_model_dir = str(Path(models_path, 'ch_ppocr_mobile_v2.0_cls_infer'))
3030
self.model_name = 'standard model to detect image 0 or 180 rotated'
3131
args.label_list = ['0', '180']
3232

runtime/ops/mapper/img_watermark_remove/__init__.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

runtime/ops/mapper/img_watermark_remove/metadata.yml

Lines changed: 0 additions & 26 deletions
This file was deleted.

runtime/ops/mapper/img_watermark_remove/process.py

Lines changed: 0 additions & 161 deletions
This file was deleted.

runtime/ops/mapper/img_watermark_remove/watermark_ocr_model.py

Lines changed: 0 additions & 25 deletions
This file was deleted.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from datamate.core.base_op import OPERATORS
2+
3+
# Register the 'PiiDetector' operator name and point it at the module path
# given below.
OPERATORS.register_module(
    module_name='PiiDetector',
    module_path='ops.mapper.pii_ner_detection.process',
)
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import presidio_analyzer as analyzer
2+
3+
# Recognizer for Chinese ID-card numbers (18-character and 15-digit forms).
#
# The number is delimited with explicit ASCII look-arounds rather than `\b`.
# For str patterns `\b` is defined through Unicode `\w`, which also matches
# CJK ideographs, so a number written directly after Chinese text
# (e.g. "身份证号1101...") has no word boundary in front of it and would never
# be detected. The look-arounds keep the original behaviour next to ASCII
# letters, digits and underscores while allowing adjacent Chinese characters.
id_recognizer = analyzer.PatternRecognizer(
    supported_entity="ID_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_id_pattern",
            regex=(
                r"(?<![0-9A-Za-z_])"
                r"(?:"
                # 18 characters: 6 digits, year 19xx/20xx, month, day,
                # 3 digits, then a final digit or X/x.
                r"[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dXx]"
                r"|"
                # 15 digits: 8 digits, month, day, 3 digits.
                r"[1-9]\d{7}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}"
                r")"
                r"(?![0-9A-Za-z_])"
            ),
            score=0.9,
        )
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["身份证", "身份证明", "身份证号", "证件号码"],
)
16+
17+
# Recognizer for Chinese phone numbers (mobile and landline patterns).
#
# Both patterns use explicit ASCII look-arounds instead of `\b`. For str
# patterns `\b` is defined through Unicode `\w`, which also matches CJK
# ideographs, so a number glued to Chinese text (e.g. "电话13812345678") has no
# word boundary around it and would be missed. Behaviour next to ASCII
# letters, digits and underscores is unchanged.
phone_recognizer = analyzer.PatternRecognizer(
    # NOTE(review): the mixed-case entity name differs from "ID_CHINA" /
    # "ZIPCODE_CHINA"; it is kept as-is because consumers may key on it.
    supported_entity="Phone_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_mobile_pattern",
            # 11 digits: "1", a digit 3-9, then 9 more digits.
            regex=r"(?<![0-9A-Za-z_])(1[3-9]\d{9})(?![0-9A-Za-z_])",
            score=0.85,
        ),
        analyzer.Pattern(
            name="china_landline_pattern",
            # "0" plus 2-3 digits, an optional hyphen, then 7-8 digits.
            regex=r"(?<![0-9A-Za-z_])(0\d{2,3}-?\d{7,8})(?![0-9A-Za-z_])",
            score=0.8,
        ),
    ],
    # Context words passed to the recognizer alongside the patterns.
    context=["电话", "手机", "联系方式", "联系电话"],
)
35+
36+
# Recognizer for Chinese postal codes: six digits, the first one non-zero.
#
# The code is delimited with explicit ASCII look-arounds instead of `\b`.
# For str patterns `\b` is defined through Unicode `\w`, which also matches
# CJK ideographs, so a code written directly after Chinese text
# (e.g. "邮编100000") has no word boundary in front of it and would be missed.
# Behaviour next to ASCII letters, digits and underscores is unchanged.
zipcode_recognizer = analyzer.PatternRecognizer(
    supported_entity="ZIPCODE_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_zipcode_pattern",
            regex=r"(?<![0-9A-Za-z_])[1-9]\d{5}(?![0-9A-Za-z_])",
            score=0.7,
        )
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["邮编", "邮政编码", "邮编号码"],
)
49+
50+
# URL recognizer that also accepts Chinese-character domain names.
#
# The leading delimiter is an explicit ASCII look-behind instead of `\b`.
# For str patterns `\b` is defined through Unicode `\w`, which also matches
# CJK ideographs, so a URL written directly after Chinese text
# (e.g. "访问www.example.com") has no word boundary in front of it and would be
# missed. The trailing `\S*` ... `\b` is left unchanged so the end of the match
# is trimmed exactly as before.
url_recognizer = analyzer.PatternRecognizer(
    supported_entity="URL",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="url_pattern",
            regex=(
                r"(?<![0-9A-Za-z_])"
                r"("
                # Scheme or "www." prefix followed by word-character labels.
                r"(?:https?://|www\.)[\w-]+\.[\w-]+\S*"
                r"|"
                # Same prefix followed by labels in the U+4E00-U+9FA5 range.
                r"(?:https?://|www\.)[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\S*"
                r")"
                r"\b"
            ),
            score=0.9,
        )
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["网址", "链接", "网站", "网页"],
)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
name: '高级匿名化'
2+
language: 'Python'
3+
vendor: 'others'
4+
raw_id: 'PiiDetector'
5+
version: '1.0.0'
6+
description: '高级匿名化算子,检测命名实体并匿名化。'
7+
modal: 'text'
8+
inputs: 'text'
9+
outputs: 'text'

0 commit comments

Comments
 (0)