|
| 1 | +"""REST API server for analyzer.""" |
| 2 | + |
| 3 | +import json |
| 4 | +import logging |
| 5 | +import os |
| 6 | +from logging.config import fileConfig |
| 7 | +from pathlib import Path |
| 8 | +from typing import Tuple |
| 9 | + |
| 10 | +from flask import Flask, Response, jsonify, request |
| 11 | +from presidio_analyzer import AnalyzerEngine, AnalyzerEngineProvider, AnalyzerRequest |
| 12 | +from werkzeug.exceptions import HTTPException |
| 13 | + |
| 14 | +# Additional imports for Flair support |
| 15 | +import spacy |
| 16 | +import spacy.cli |
| 17 | +from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider |
| 18 | +from presidio_analyzer.recognizer_registry import RecognizerRegistry |
| 19 | + |
# Default TCP port for the API server, used when the PORT env var is not set.
DEFAULT_PORT = "3000"

# Logging configuration file, expected to sit next to this module (see Server.__init__).
LOGGING_CONF_FILE = "logging.ini"

# ASCII-art banner logged once at startup (raw string so backslashes are literal).
WELCOME_MESSAGE = r"""
 _______ _______ _______ _______ _________ ______ _________ _______ 
( ____ )( ____ )( ____ \( ____ \\__ __/( __ \ \__ __/( ___ ) 
| ( )|| ( )|| ( \/| ( \/ ) ( | ( \ ) ) ( | ( ) | 
| (____)|| (____)|| (__ | (_____ | | | | ) | | | | | | | 
| _____)| __)| __) (_____ ) | | | | | | | | | | | | 
| ( | (\ ( | ( ) | | | | | ) | | | | | | | 
| ) | ) \ \__| (____/\/\____) |___) (___| (__/ )___) (___| (___) | 
|/ |/ \__/(_______/\_______)\_______/(______/ \_______/(_______) 
"""
| 34 | + |
| 35 | + |
def create_nlp_engine_with_flair(
    spacy_model_name: str = "en_core_web_sm",
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.

    The FlairRecognizer returns entity results from Flair models; the spaCy
    model supplies NlpArtifacts such as POS tags and lemmas.

    :param spacy_model_name: Name of the spaCy model that provides
        NlpArtifacts; downloaded on demand if not installed.
        Defaults to ``en_core_web_sm`` (the original hard-coded model).
    :return: Tuple of the configured NlpEngine and RecognizerRegistry.
    """
    # Imported lazily so this module can be imported even where flair_recognizer
    # (and its flair dependency) is not installed.
    from flair_recognizer import FlairRecognizer

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers()

    # There is no official Flair NlpEngine, hence Flair is wired in as an
    # additional recognizer on top of a spaCy-based NLP engine.
    registry.add_recognizer(FlairRecognizer())
    # Drop the spaCy NER recognizer so Flair is the NER source
    # (the spaCy pipeline is kept only for NlpArtifacts).
    registry.remove_recognizer("SpacyRecognizer")

    # Download the spaCy model on first run only.
    if not spacy.util.is_package(spacy_model_name):
        spacy.cli.download(spacy_model_name)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": spacy_model_name}],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    return nlp_engine, registry
| 64 | + |
| 65 | + |
class Server:
    """HTTP Server for calling Presidio Analyzer.

    Routes:
        GET  /health            -> liveness probe
        POST /analyze           -> run PII analysis on posted JSON
        GET  /recognizers       -> recognizer names for a given language
        GET  /supportedentities -> supported entity types for a given language
    """

    def __init__(self):
        # Logging is configured from logging.ini located next to this module.
        fileConfig(Path(Path(__file__).parent, LOGGING_CONF_FILE))
        self.logger = logging.getLogger("presidio-analyzer")
        self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
        self.app = Flask(__name__)

        # NOTE(review): an engine previously built here via AnalyzerEngineProvider
        # (from ANALYZER_CONF_FILE / NLP_CONF_FILE / RECOGNIZER_REGISTRY_CONF_FILE)
        # was immediately overwritten by the Flair-based engine below, so its
        # construction — including NLP model loading — was dead startup cost and
        # has been removed. Those env vars are now ignored.
        self.logger.info("Loading Flair-based NLP engine")
        nlp_engine, registry = create_nlp_engine_with_flair()
        self.engine: AnalyzerEngine = AnalyzerEngine(
            nlp_engine=nlp_engine,
            registry=registry,
        )
        self.logger.info("Flair-based analyzer engine loaded successfully")
        self.logger.info(WELCOME_MESSAGE)

        @self.app.route("/health")
        def health() -> str:
            """Return basic health probe result."""
            return "Presidio Analyzer service is up"

        @self.app.route("/analyze", methods=["POST"])
        def analyze() -> Tuple[str, int]:
            """Execute the analyzer function on the posted JSON request."""
            # Parse the request params
            try:
                req_data = AnalyzerRequest(request.get_json())
                if not req_data.text:
                    raise Exception("No text provided")

                if not req_data.language:
                    raise Exception("No language provided")

                recognizer_result_list = self.engine.analyze(
                    text=req_data.text,
                    language=req_data.language,
                    correlation_id=req_data.correlation_id,
                    score_threshold=req_data.score_threshold,
                    entities=req_data.entities,
                    return_decision_process=req_data.return_decision_process,
                    ad_hoc_recognizers=req_data.ad_hoc_recognizers,
                    context=req_data.context,
                    allow_list=req_data.allow_list,
                    allow_list_match=req_data.allow_list_match,
                    regex_flags=req_data.regex_flags,
                )
                # Strip internal-only attributes before serializing the response.
                _exclude_attributes_from_dto(recognizer_result_list)

                return Response(
                    json.dumps(
                        recognizer_result_list,
                        default=lambda o: o.to_dict(),
                        sort_keys=True,
                    ),
                    content_type="application/json",
                )
            except TypeError as te:
                # Bad/unknown request parameters -> client error.
                error_msg = (
                    f"Failed to parse /analyze request "
                    f"for AnalyzerEngine.analyze(). {te}"
                )
                self.logger.error(error_msg)
                return jsonify(error=error_msg), 400

            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.analyze(). {e}"
                )
                # str(e) instead of e.args[0]: safe for exceptions raised
                # without arguments (e.args[0] would raise IndexError).
                return jsonify(error=str(e)), 500

        @self.app.route("/recognizers", methods=["GET"])
        def recognizers() -> Tuple[str, int]:
            """Return a list of supported recognizers."""
            language = request.args.get("language")
            try:
                recognizers_list = self.engine.get_recognizers(language)
                names = [o.name for o in recognizers_list]
                return jsonify(names), 200
            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.get_recognizers(). {e}"
                )
                return jsonify(error=str(e)), 500

        @self.app.route("/supportedentities", methods=["GET"])
        def supported_entities() -> Tuple[str, int]:
            """Return a list of supported entities."""
            language = request.args.get("language")
            try:
                entities_list = self.engine.get_supported_entities(language)
                return jsonify(entities_list), 200
            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.supported_entities(). {e}"
                )
                return jsonify(error=str(e)), 500

        @self.app.errorhandler(HTTPException)
        def http_exception(e):
            # Map framework-level HTTP errors to the same JSON error shape.
            return jsonify(error=e.description), e.code
| 182 | + |
| 183 | + |
| 184 | +def _exclude_attributes_from_dto(recognizer_result_list): |
| 185 | + excluded_attributes = [ |
| 186 | + "recognition_metadata", |
| 187 | + ] |
| 188 | + for result in recognizer_result_list: |
| 189 | + for attr in excluded_attributes: |
| 190 | + if hasattr(result, attr): |
| 191 | + delattr(result, attr) |
| 192 | + |
| 193 | + |
def create_app():  # noqa
    """Flask application factory: build a Server and expose its WSGI app."""
    return Server().app
| 197 | + |
| 198 | + |
if __name__ == "__main__":
    # Build the app first (loads models), then bind to PORT or the default.
    flask_app = create_app()
    serve_port = int(os.environ.get("PORT", DEFAULT_PORT))
    print(f"Starting Presidio Analyzer API on port {serve_port}")
    flask_app.run(host="0.0.0.0", port=serve_port)
0 commit comments