Skip to content

Commit 563bea1

Browse files
authored
Add custom Presidio docker images with Flair NLP (#2)
- Removed .Values.imageVersion in favor of analyzer.imageTag and anonymizer.imageTag
1 parent 7d6c2e7 commit 563bea1

File tree

6 files changed

+489
-4
lines changed

6 files changed

+489
-4
lines changed

chart/templates/deployment.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ spec:
1313
app: presidio-analyzer
1414
spec:
1515
containers:
16-
- image: '{{ .Values.analyzer.imageRepository | default "mcr.microsoft.com/presidio-analyzer" }}:{{ .Values.imageVersion | default "latest" }}'
16+
- image: '{{ .Values.analyzer.imageRepository | default "mcr.microsoft.com/presidio-analyzer" }}:{{ .Values.analyzer.imageTag | default "latest" }}'
1717
name: presidio-analyzer
1818
imagePullPolicy: {{ .Values.analyzer.imagePullPolicy | default "Always" }}
1919
resources:
@@ -37,7 +37,7 @@ spec:
3737
app: presidio-anonymizer
3838
spec:
3939
containers:
40-
- image: '{{ .Values.anonymizer.imageRepository | default "mcr.microsoft.com/presidio-anonymizer" }}:{{ .Values.imageVersion | default "latest" }}'
40+
- image: '{{ .Values.anonymizer.imageRepository | default "mcr.microsoft.com/presidio-anonymizer" }}:{{ .Values.anonymizer.imageTag | default "latest" }}'
4141
name: presidio-anonymizer
4242
imagePullPolicy: {{ .Values.anonymizer.imagePullPolicy | default "Always" }}
4343
resources:
@@ -70,4 +70,4 @@ spec:
7070
- port: 3000
7171
name: api
7272
protocol: TCP
73-
targetPort: 3000
73+
targetPort: 3000

chart/values.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11

22
# Presidio version information: https://github.com/microsoft/presidio/releases
3-
imageVersion: latest
43

54
# analyzer service configuration
65
analyzer:
76
replicas: 1
87
imageRepository: mcr.microsoft.com/presidio-analyzer
98
imagePullPolicy: Always
9+
# Presidio version information: https://github.com/microsoft/presidio/releases
10+
imageTag: latest
1011
resources:
1112
limits:
1213
cpu: 512m
@@ -20,6 +21,8 @@ anonymizer:
2021
replicas: 1
2122
imageRepository: mcr.microsoft.com/presidio-anonymizer
2223
imagePullPolicy: Always
24+
# Presidio version information: https://github.com/microsoft/presidio/releases
25+
imageTag: latest
2326
resources:
2427
limits:
2528
cpu: 512m

docker-images/flair/Dockerfile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Presidio analyzer image extended with Flair NER support.
FROM mcr.microsoft.com/presidio-analyzer

# Install Flair and its dependencies.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN poetry run pip install --no-cache-dir flair torch

# Install Flair models
# NOTE(review): the relative script path below assumes the base image's
# WORKDIR is /usr/bin/presidio-analyzer - confirm against the base image.
COPY ./install_flair_models.py /usr/bin/presidio-analyzer/
RUN poetry run python install_flair_models.py --models flair/ner-english-large

# default app path from presidio analyzer base image
# (trailing slash makes it explicit that the destination is a directory)
COPY ./flair_recognizer.py /usr/bin/presidio-analyzer/

# override the main application that creates the flask api
COPY ./app.py /usr/bin/presidio-analyzer/app.py

docker-images/flair/app.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
"""REST API server for analyzer."""
2+
3+
import json
4+
import logging
5+
import os
6+
from logging.config import fileConfig
7+
from pathlib import Path
8+
from typing import Tuple
9+
10+
from flask import Flask, Response, jsonify, request
11+
from presidio_analyzer import AnalyzerEngine, AnalyzerEngineProvider, AnalyzerRequest
12+
from werkzeug.exceptions import HTTPException
13+
14+
# Additional imports for Flair support
15+
import spacy
16+
import spacy.cli
17+
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
18+
from presidio_analyzer.recognizer_registry import RecognizerRegistry
19+
20+
DEFAULT_PORT = "3000"
21+
22+
LOGGING_CONF_FILE = "logging.ini"
23+
24+
WELCOME_MESSAGE = r"""
25+
_______ _______ _______ _______ _________ ______ _________ _______
26+
( ____ )( ____ )( ____ \( ____ \\__ __/( __ \ \__ __/( ___ )
27+
| ( )|| ( )|| ( \/| ( \/ ) ( | ( \ ) ) ( | ( ) |
28+
| (____)|| (____)|| (__ | (_____ | | | | ) | | | | | | |
29+
| _____)| __)| __) (_____ ) | | | | | | | | | | | |
30+
| ( | (\ ( | ( ) | | | | | ) | | | | | | |
31+
| ) | ) \ \__| (____/\/\____) |___) (___| (__/ )___) (___| (___) |
32+
|/ |/ \__/(_______/\_______)\_______/(______/ \_______/(_______)
33+
"""
34+
35+
36+
def create_nlp_engine_with_flair() -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy-backed NlpEngine and a registry that uses FlairRecognizer.

    The registry starts from the predefined recognizers, gains a
    ``FlairRecognizer`` and loses the recognizer named ``"SpacyRecognizer"``.
    The NLP engine is created through ``NlpEngineProvider`` for the small
    English spaCy model, which is downloaded first when it is not already
    installed as a package.

    :return: Tuple of (nlp engine, recognizer registry).
    """
    # Imported lazily, as in the original layout of this module.
    from flair_recognizer import FlairRecognizer

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers()

    # There is no official Flair NlpEngine, so Flair is plugged in as an
    # additional recognizer on top of a small spaCy model.
    spacy_model_name = "en_core_web_sm"
    if not spacy.util.is_package(spacy_model_name):
        spacy.cli.download(spacy_model_name)

    recognizer_registry.add_recognizer(FlairRecognizer())
    recognizer_registry.remove_recognizer("SpacyRecognizer")

    engine_provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": spacy_model_name}],
        }
    )
    return engine_provider.create_engine(), recognizer_registry
64+
65+
66+
class Server:
    """HTTP Server for calling Presidio Analyzer.

    Wraps a Flask application whose analyzer engine is built from the NLP
    engine and recognizer registry returned by
    ``create_nlp_engine_with_flair``. Registered routes: ``/health``,
    ``/analyze`` (POST), ``/recognizers`` and ``/supportedentities``.
    """

    # These variables used to feed an AnalyzerEngineProvider whose engine was
    # immediately replaced by the Flair-based one, so they never influenced
    # the served results. They are only inspected to warn the operator.
    _IGNORED_CONF_ENV_VARS = (
        "ANALYZER_CONF_FILE",
        "NLP_CONF_FILE",
        "RECOGNIZER_REGISTRY_CONF_FILE",
    )

    @staticmethod
    def _error_message(exc: BaseException) -> str:
        """Return a JSON-safe, human-readable message for *exc*.

        Uses the first positional argument when it is a string, otherwise
        ``str(exc)``, and falls back to the exception class name when the
        exception carries no message at all. Never raises for empty ``args``.
        """
        if exc.args and isinstance(exc.args[0], str):
            return exc.args[0]
        return str(exc) or type(exc).__name__

    def __init__(self):
        """Configure logging, build the Flair-based engine, register routes."""
        fileConfig(Path(Path(__file__).parent, LOGGING_CONF_FILE))
        self.logger = logging.getLogger("presidio-analyzer")
        self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
        self.app = Flask(__name__)

        for env_var in self._IGNORED_CONF_ENV_VARS:
            if os.environ.get(env_var):
                self.logger.warning(
                    "%s is set but ignored: this server always uses the "
                    "Flair-based analyzer engine",
                    env_var,
                )

        self.logger.info("Starting analyzer engine")
        self.logger.info("Loading Flair-based NLP engine")
        # Build the engine exactly once; a provider-configured engine would be
        # discarded right away and only cost startup time and memory.
        nlp_engine, registry = create_nlp_engine_with_flair()
        self.engine: AnalyzerEngine = AnalyzerEngine(
            nlp_engine=nlp_engine,
            registry=registry,
        )
        self.logger.info("Flair-based analyzer engine loaded successfully")
        self.logger.info(WELCOME_MESSAGE)

        @self.app.route("/health")
        def health() -> str:
            """Return basic health probe result."""
            return "Presidio Analyzer service is up"

        @self.app.route("/analyze", methods=["POST"])
        def analyze() -> Tuple[str, int]:
            """Execute the analyzer function."""
            # Parse the request params
            try:
                req_data = AnalyzerRequest(request.get_json())
                if not req_data.text:
                    raise Exception("No text provided")

                if not req_data.language:
                    raise Exception("No language provided")

                recognizer_result_list = self.engine.analyze(
                    text=req_data.text,
                    language=req_data.language,
                    correlation_id=req_data.correlation_id,
                    score_threshold=req_data.score_threshold,
                    entities=req_data.entities,
                    return_decision_process=req_data.return_decision_process,
                    ad_hoc_recognizers=req_data.ad_hoc_recognizers,
                    context=req_data.context,
                    allow_list=req_data.allow_list,
                    allow_list_match=req_data.allow_list_match,
                    regex_flags=req_data.regex_flags,
                )
                _exclude_attributes_from_dto(recognizer_result_list)

                return Response(
                    json.dumps(
                        recognizer_result_list,
                        default=lambda o: o.to_dict(),
                        sort_keys=True,
                    ),
                    content_type="application/json",
                )
            except HTTPException:
                # Let the registered HTTPException handler answer with the
                # proper status code instead of masking it as a 500.
                raise
            except TypeError as te:
                error_msg = (
                    f"Failed to parse /analyze request "
                    f"for AnalyzerEngine.analyze(). {self._error_message(te)}"
                )
                self.logger.error(error_msg)
                return jsonify(error=error_msg), 400

            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.analyze(). {e}"
                )
                return jsonify(error=self._error_message(e)), 500

        @self.app.route("/recognizers", methods=["GET"])
        def recognizers() -> Tuple[str, int]:
            """Return a list of supported recognizers."""
            language = request.args.get("language")
            try:
                recognizers_list = self.engine.get_recognizers(language)
                names = [o.name for o in recognizers_list]
                return jsonify(names), 200
            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.get_recognizers(). {e}"
                )
                return jsonify(error=self._error_message(e)), 500

        @self.app.route("/supportedentities", methods=["GET"])
        def supported_entities() -> Tuple[str, int]:
            """Return a list of supported entities."""
            language = request.args.get("language")
            try:
                entities_list = self.engine.get_supported_entities(language)
                return jsonify(entities_list), 200
            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.supported_entities(). {e}"
                )
                return jsonify(error=self._error_message(e)), 500

        @self.app.errorhandler(HTTPException)
        def http_exception(e):
            """Render any werkzeug HTTPException as a JSON error body."""
            return jsonify(error=e.description), e.code
182+
183+
184+
def _exclude_attributes_from_dto(recognizer_result_list):
185+
excluded_attributes = [
186+
"recognition_metadata",
187+
]
188+
for result in recognizer_result_list:
189+
for attr in excluded_attributes:
190+
if hasattr(result, attr):
191+
delattr(result, attr)
192+
193+
194+
def create_app():  # noqa
    """Instantiate a ``Server`` and return its Flask application object."""
    return Server().app
197+
198+
199+
if __name__ == "__main__":
    # Direct execution: build the app first, then listen on all interfaces at
    # the port taken from PORT (falling back to DEFAULT_PORT).
    app = create_app()
    port = int(os.getenv("PORT", DEFAULT_PORT))
    print(f"Starting Presidio Analyzer API on port {port}")
    app.run(port=port, host="0.0.0.0")

0 commit comments

Comments
 (0)