
Commit 1a3fd96

Merge pull request #9 from jftuga/multi-language
allow for multiple languages
2 parents c7ad98a + 0c6cb0f commit 1a3fd96
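
Taken together, the changes below introduce a DeidentificationLanguages enum and key the pronoun tables by language. A minimal sketch of the resulting configuration surface, assuming the package is installed; only the imported names come from this diff, the rest is illustrative:

from deidentification import (
    DeidentificationConfig,
    DeidentificationLanguages,
    DeidentificationOutputStyle,
)

# The new `language` field selects which pronoun table is used; the enum
# value doubles as the default replacement word for that language.
config = DeidentificationConfig(
    language=DeidentificationLanguages.ENGLISH,
    output_style=DeidentificationOutputStyle.TEXT,
)
print(config.language)      # DeidentificationLanguages.ENGLISH
print(config.replacement)   # PERSON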

5 files changed: +41 additions, -34 deletions

Makefile

Lines changed: 0 additions & 1 deletion
@@ -43,7 +43,6 @@ clean:
 	rm -rf dist/
 	rm -rf *.egg-info/
 	rm -rf $(VENV_NAME)/
-	rm -rf __pycache__/
 	rm -rf test-install-venv/
 	rm -f *.whl.metadata .??*~
 	find . -type d -name "__pycache__" -exec rm -r "{}" +

Pipfile.lock

Lines changed: 20 additions & 20 deletions
Some generated files are not rendered by default.

deidentification/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 """A Python module for de-identifying personally identifiable information in text."""
 
-from .deidentification import Deidentification, DeidentificationConfig, DeidentificationOutputStyle
+from .deidentification import Deidentification, DeidentificationConfig, DeidentificationOutputStyle, DeidentificationLanguages
 from .deidentification_constants import pgmName, pgmVersion, pgmUrl
 
 __version__ = pgmVersion
@@ -9,6 +9,7 @@
     "Deidentification",
     "DeidentificationConfig",
     "DeidentificationOutputStyle",
+    "DeidentificationLanguages",
     "pgmName",
     "pgmVersion",
     "pgmUrl",

deidentification/deidentification.py

Lines changed: 7 additions & 11 deletions
@@ -22,29 +22,24 @@
 providing visual highlighting of replacements through span tags.
 """
 
-
 from dataclasses import dataclass, fields
-from enum import Enum
 from io import StringIO
 from operator import itemgetter
 from typing import Any, BinaryIO, Optional, Union
 from .deidentification_constants import bcolors, GENDER_PRONOUNS, HTML_BEGIN, HTML_END
-from .deidentification_constants import pgmName, pgmUrl, pgmVersion
+from .deidentification_constants import pgmName, pgmUrl, pgmVersion, DeidentificationOutputStyle, DeidentificationLanguages
 from .normalize_punctuation import normalize_punctuation
 import spacy
 from spacy.tokens import Doc
 import sys
 
-class DeidentificationOutputStyle(Enum):
-    TEXT = "text"
-    HTML = "html"
-
 @dataclass
 class DeidentificationConfig:
     spacy_load: bool = True
     spacy_model: str = "en_core_web_trf"
     output_style: DeidentificationOutputStyle = DeidentificationOutputStyle.TEXT
-    replacement: str = "PERSON"
+    language: DeidentificationLanguages = DeidentificationLanguages.ENGLISH
+    replacement: str = DeidentificationLanguages.ENGLISH.value
     debug: bool = False
     save_tokens: bool = False
     filename: Optional[str] = None
@@ -126,7 +121,7 @@ def model_not_found_error(self, err: str):
         print(str(err), file=sys.stderr)
         if "Can't find model" in str(err):
             print(file=sys.stderr)
-            print("Please manually run the following command one time to download the required model:", file=sys.stderr)
+            print("Please manually run the following command one time to download the required 500 MB model:", file=sys.stderr)
             print(file=sys.stderr)
             print(f"python -m spacy download {self.config.spacy_model}", file=sys.stderr)
             print(file=sys.stderr)
@@ -239,7 +234,8 @@ def _find_all_pronouns(self) -> int:
         # Clear out any previous pronouns
         self.all_pronouns = []
 
-        gender_keys = GENDER_PRONOUNS.keys()
+        # self.config.language equals something like: DeidentificationLanguages.ENGLISH
+        gender_keys = GENDER_PRONOUNS[self.config.language].keys()
         for token in self.doc:
             if (token.pos_ == "PRON" or token.pos_ == "PROPN") and token.text.lower() in gender_keys:
                 record = {"text": token.text, "start_char": token.idx, "end_char": token.idx + len(token.text) - 1, "label": token.pos_, "shapes": [token.shape_]}
@@ -345,7 +341,7 @@ def _replace_merged(self, replaced_text: str, merged: list[dict]) -> str:
             if obj["type"] == "pronoun":
                 start = obj["item"]["start_char"]
                 end = start + len(obj["item"]["text"])
-                anon = GENDER_PRONOUNS[obj["item"]["text"].lower()]
+                anon = GENDER_PRONOUNS[self.config.language][obj["item"]["text"].lower()]
                 if want_html and len(anon):
                     anon = f'<span id="span1">{anon}</span>'
                 replaced_text = replaced_text[:start] + anon + replaced_text[end:]
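
The two later hunks switch from a flat GENDER_PRONOUNS dict to a per-language lookup. A small standalone illustration of that lookup, assuming the constants module as changed in this commit:

from deidentification.deidentification_constants import (
    GENDER_PRONOUNS,
    DeidentificationLanguages,
)

# Mirrors _find_all_pronouns/_replace_merged: select the language's table
# first, then look up the lower-cased token text.
language = DeidentificationLanguages.ENGLISH
gender_keys = GENDER_PRONOUNS[language].keys()
print("him" in gender_keys)              # True
print(GENDER_PRONOUNS[language]["him"])  # HIM/HER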

deidentification/deidentification_constants.py

Lines changed: 12 additions & 1 deletion
@@ -1,8 +1,19 @@
+from enum import Enum
+
 pgmName = "deidentification"
 pgmUrl = "https://github.com/jftuga/deidentification"
 pgmVersion = "1.2.1"
 
-GENDER_PRONOUNS = {
+# this maps the default replacement word for each language
+class DeidentificationLanguages(Enum):
+    ENGLISH = "PERSON"
+
+class DeidentificationOutputStyle(Enum):
+    TEXT = "text"
+    HTML = "html"
+
+GENDER_PRONOUNS = {}
+GENDER_PRONOUNS[DeidentificationLanguages.ENGLISH] = {
     "he": "HE/SHE",
     "him": "HIM/HER",
     "his": "HIS/HER",
