
Commit 1a3fd96

Merge pull request #9 from jftuga/multi-language
allow for multiple languages
2 parents c7ad98a + 0c6cb0f commit 1a3fd96
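
Taken together, the changes below introduce a DeidentificationLanguages enum and key the pronoun tables by language. A minimal sketch of the resulting configuration surface, assuming the package is installed; only the imported names come from this diff, the rest is illustrative:

from deidentification import (
    DeidentificationConfig,
    DeidentificationLanguages,
    DeidentificationOutputStyle,
)

# The new `language` field selects which pronoun table is used; the enum
# value doubles as the default replacement word for that language.
config = DeidentificationConfig(
    language=DeidentificationLanguages.ENGLISH,
    output_style=DeidentificationOutputStyle.TEXT,
)
print(config.language)      # DeidentificationLanguages.ENGLISH
print(config.replacement)   # PERSON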

5 files changed: +41 additions, -34 deletions

Makefile

Lines changed: 0 additions & 1 deletion
@@ -43,7 +43,6 @@ clean:
 	rm -rf dist/
 	rm -rf *.egg-info/
 	rm -rf $(VENV_NAME)/
-	rm -rf __pycache__/
 	rm -rf test-install-venv/
 	rm -f *.whl.metadata .??*~
 	find . -type d -name "__pycache__" -exec rm -r "{}" +

Pipfile.lock

Lines changed: 20 additions & 20 deletions
Some generated files are not rendered by default.

deidentification/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 """A Python module for de-identifying personally identifiable information in text."""
 
-from .deidentification import Deidentification, DeidentificationConfig, DeidentificationOutputStyle
+from .deidentification import Deidentification, DeidentificationConfig, DeidentificationOutputStyle, DeidentificationLanguages
 from .deidentification_constants import pgmName, pgmVersion, pgmUrl
 
 __version__ = pgmVersion
@@ -9,6 +9,7 @@
     "Deidentification",
     "DeidentificationConfig",
     "DeidentificationOutputStyle",
+    "DeidentificationLanguages",
     "pgmName",
     "pgmVersion",
     "pgmUrl",

deidentification/deidentification.py

Lines changed: 7 additions & 11 deletions
@@ -22,29 +22,24 @@
 providing visual highlighting of replacements through span tags.
 """
 
-
 from dataclasses import dataclass, fields
-from enum import Enum
 from io import StringIO
 from operator import itemgetter
 from typing import Any, BinaryIO, Optional, Union
 from .deidentification_constants import bcolors, GENDER_PRONOUNS, HTML_BEGIN, HTML_END
-from .deidentification_constants import pgmName, pgmUrl, pgmVersion
+from .deidentification_constants import pgmName, pgmUrl, pgmVersion, DeidentificationOutputStyle, DeidentificationLanguages
 from .normalize_punctuation import normalize_punctuation
 import spacy
 from spacy.tokens import Doc
 import sys
 
-class DeidentificationOutputStyle(Enum):
-    TEXT = "text"
-    HTML = "html"
-
 @dataclass
 class DeidentificationConfig:
     spacy_load: bool = True
     spacy_model: str = "en_core_web_trf"
     output_style: DeidentificationOutputStyle = DeidentificationOutputStyle.TEXT
-    replacement: str = "PERSON"
+    language: DeidentificationLanguages = DeidentificationLanguages.ENGLISH
+    replacement: str = DeidentificationLanguages.ENGLISH.value
     debug: bool = False
     save_tokens: bool = False
     filename: Optional[str] = None
@@ -126,7 +121,7 @@ def model_not_found_error(self, err: str):
         print(str(err), file=sys.stderr)
         if "Can't find model" in str(err):
             print(file=sys.stderr)
-            print("Please manually run the following command one time to download the required model:", file=sys.stderr)
+            print("Please manually run the following command one time to download the required 500 MB model:", file=sys.stderr)
             print(file=sys.stderr)
             print(f"python -m spacy download {self.config.spacy_model}", file=sys.stderr)
             print(file=sys.stderr)
@@ -239,7 +234,8 @@ def _find_all_pronouns(self) -> int:
         # Clear out any previous pronouns
         self.all_pronouns = []
 
-        gender_keys = GENDER_PRONOUNS.keys()
+        # self.config.language equals something like: DeidentificationLanguages.ENGLISH
+        gender_keys = GENDER_PRONOUNS[self.config.language].keys()
         for token in self.doc:
             if (token.pos_ == "PRON" or token.pos_ == "PROPN") and token.text.lower() in gender_keys:
                 record = {"text": token.text, "start_char": token.idx, "end_char": token.idx + len(token.text) - 1, "label": token.pos_, "shapes": [token.shape_]}
@@ -345,7 +341,7 @@ def _replace_merged(self, replaced_text: str, merged: list[dict]) -> str:
             if obj["type"] == "pronoun":
                 start = obj["item"]["start_char"]
                 end = start + len(obj["item"]["text"])
-                anon = GENDER_PRONOUNS[obj["item"]["text"].lower()]
+                anon = GENDER_PRONOUNS[self.config.language][obj["item"]["text"].lower()]
                 if want_html and len(anon):
                     anon = f'<span id="span1">{anon}</span>'
                 replaced_text = replaced_text[:start] + anon + replaced_text[end:]
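
The two later hunks switch from a flat GENDER_PRONOUNS dict to a per-language lookup. A small standalone illustration of that lookup, assuming the constants module as changed in this commit:

from deidentification.deidentification_constants import (
    GENDER_PRONOUNS,
    DeidentificationLanguages,
)

# Mirrors _find_all_pronouns/_replace_merged: select the language's table
# first, then look up the lower-cased token text.
language = DeidentificationLanguages.ENGLISH
gender_keys = GENDER_PRONOUNS[language].keys()
print("him" in gender_keys)              # True
print(GENDER_PRONOUNS[language]["him"])  # HIM/HER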

deidentification/deidentification_constants.py

Lines changed: 12 additions & 1 deletion
@@ -1,8 +1,19 @@
+from enum import Enum
+
 pgmName = "deidentification"
 pgmUrl = "https://github.com/jftuga/deidentification"
 pgmVersion = "1.2.1"
 
-GENDER_PRONOUNS = {
+# this maps the default replacement word for each language
+class DeidentificationLanguages(Enum):
+    ENGLISH = "PERSON"
+
+class DeidentificationOutputStyle(Enum):
+    TEXT = "text"
+    HTML = "html"
+
+GENDER_PRONOUNS = {}
+GENDER_PRONOUNS[DeidentificationLanguages.ENGLISH] = {
     "he": "HE/SHE",
     "him": "HIM/HER",
     "his": "HIS/HER",
