|
22 | 22 | providing visual highlighting of replacements through span tags. |
23 | 23 | """ |
24 | 24 |
|
25 | | - |
26 | 25 | from dataclasses import dataclass, fields |
27 | | -from enum import Enum |
28 | 26 | from io import StringIO |
29 | 27 | from operator import itemgetter |
30 | 28 | from typing import Any, BinaryIO, Optional, Union |
31 | 29 | from .deidentification_constants import bcolors, GENDER_PRONOUNS, HTML_BEGIN, HTML_END |
32 | | -from .deidentification_constants import pgmName, pgmUrl, pgmVersion |
| 30 | +from .deidentification_constants import pgmName, pgmUrl, pgmVersion, DeidentificationOutputStyle, DeidentificationLanguages |
33 | 31 | from .normalize_punctuation import normalize_punctuation |
34 | 32 | import spacy |
35 | 33 | from spacy.tokens import Doc |
36 | 34 | import sys |
37 | 35 |
|
38 | | -class DeidentificationOutputStyle(Enum): |
39 | | - TEXT = "text" |
40 | | - HTML = "html" |
41 | | - |
@dataclass
class DeidentificationConfig:
    """Runtime configuration for the de-identification pipeline.

    Controls which spaCy model is loaded, the output format (plain text or
    HTML with highlighting spans), the working language, and the token used
    to replace detected person names.
    """

    # Whether to load the spaCy model at construction time (set False in
    # tests or when a model is injected externally).
    spacy_load: bool = True
    # Name of the spaCy pipeline to load; the transformer model
    # "en_core_web_trf" is the default used by this package.
    spacy_model: str = "en_core_web_trf"
    # TEXT emits plain text; HTML wraps replacements in <span> tags.
    output_style: DeidentificationOutputStyle = DeidentificationOutputStyle.TEXT
    # Working language; also the key into GENDER_PRONOUNS elsewhere in
    # this module.
    language: DeidentificationLanguages = DeidentificationLanguages.ENGLISH
    # Replacement token for detected person names.
    # NOTE(review): this default is pinned to ENGLISH.value rather than
    # derived from the `language` field above — callers that set only
    # `language` will still get the English replacement token. Confirm
    # whether `replacement` should follow `language` by default.
    replacement: str = DeidentificationLanguages.ENGLISH.value
    # Emit diagnostic output when True.
    debug: bool = False
    # Keep the intermediate token records produced during processing.
    save_tokens: bool = False
    # Optional input filename, if processing came from a file.
    filename: Optional[str] = None
@@ -126,7 +121,7 @@ def model_not_found_error(self, err: str): |
126 | 121 | print(str(err), file=sys.stderr) |
127 | 122 | if "Can't find model" in str(err): |
128 | 123 | print(file=sys.stderr) |
129 | | - print("Please manually run the following command one time to download the required model:", file=sys.stderr) |
| 124 | + print("Please manually run the following command one time to download the required 500 MB model:", file=sys.stderr) |
130 | 125 | print(file=sys.stderr) |
131 | 126 | print(f"python -m spacy download {self.config.spacy_model}", file=sys.stderr) |
132 | 127 | print(file=sys.stderr) |
@@ -239,7 +234,8 @@ def _find_all_pronouns(self) -> int: |
239 | 234 | # Clear out any previous pronouns |
240 | 235 | self.all_pronouns = [] |
241 | 236 |
|
242 | | - gender_keys = GENDER_PRONOUNS.keys() |
| 237 | + # self.config.language equals something like: DeidentificationLanguages.ENGLISH |
| 238 | + gender_keys = GENDER_PRONOUNS[self.config.language].keys() |
243 | 239 | for token in self.doc: |
244 | 240 | if (token.pos_ == "PRON" or token.pos_ == "PROPN") and token.text.lower() in gender_keys: |
245 | 241 | record = {"text": token.text, "start_char": token.idx, "end_char": token.idx + len(token.text) - 1, "label": token.pos_, "shapes": [token.shape_]} |
@@ -345,7 +341,7 @@ def _replace_merged(self, replaced_text: str, merged: list[dict]) -> str: |
345 | 341 | if obj["type"] == "pronoun": |
346 | 342 | start = obj["item"]["start_char"] |
347 | 343 | end = start + len(obj["item"]["text"]) |
348 | | - anon = GENDER_PRONOUNS[obj["item"]["text"].lower()] |
| 344 | + anon = GENDER_PRONOUNS[self.config.language][obj["item"]["text"].lower()] |
349 | 345 | if want_html and len(anon): |
350 | 346 | anon = f'<span id="span1">{anon}</span>' |
351 | 347 | replaced_text = replaced_text[:start] + anon + replaced_text[end:] |
|
0 commit comments