Exclude a pseudonym if the name exists in the actual data (#97)

iulusoy · web-flow · commit adb09dc4f6cb · 2025-09-05T13:32:35.000+02:00
* feat: exclude a pseudonym if existing name

* fix&test: forgot about python 3.9 compatibility, add test for check_pseudos

* fix: remove while loop since ner is not reevaluated, sneak in codecov targets
diff --git a/codecov.yml b/codecov.yml
@@ -0,0 +1,14 @@
+# Codecov status checks: "project" gates the overall coverage of the repository,
+# "patch" gates only the lines changed by a pull request.
+coverage:
+  status:
+    project:
+      default:
+        target: 80%
+        threshold: 1%
+    # "patch" has to be a sibling of "project"; nested below "project" it would
+    # only define a second project-level check that happens to be named "patch".
+    patch:
+      default:
+        target: 90%
+        threshold: 1%
diff --git a/mailcom/main.py b/mailcom/main.py
@@ -270,8 +270,8 @@ def process_data(email_list: Iterator[list[dict]], workflow_settings: dict):
             email["detected_datetime"] = [
                 item[0] for item in detected_time
             ]  # only keep the strings
-
-        pseudo_content = pseudonymizer.pseudonymize(
+        exclude_pseudonym = False
+        pseudo_content, exclude_pseudonym = pseudonymizer.pseudonymize(
             email_content,
             lang,
             model=spacy_model,
@@ -281,6 +281,20 @@ def process_data(email_list: Iterator[list[dict]], workflow_settings: dict):
             pseudo_ne=pseudo_ne,
             pseudo_numbers=pseudo_numbers,
         )
+        # make sure ne pseudonymization is restarted in case of
+        # matching pseudonym
+        # note that the matching pseudonym is subsequently excluded
+        # from all further processing but will be present in the initial
+        # data entries
+        pseudo_content, _ = pseudonymizer.pseudonymize_with_updated_ne(
+            copy.deepcopy(pseudonymizer.sentences),
+            None,
+            language=lang,
+            detected_dates=email.get("detected_datetime", None),
+            pseudo_emailaddresses=pseudo_emailaddresses,
+            pseudo_ne=pseudo_ne,
+            pseudo_numbers=pseudo_numbers,
+        )
         # use deepcopy to avoid issue with mutable objects
         email["pseudo_content"] = pseudo_content
         email["ne_list"] = copy.deepcopy(pseudonymizer.ne_list)
diff --git a/mailcom/parse.py b/mailcom/parse.py
@@ -1,5 +1,6 @@
 from mailcom import utils
 import re
+from typing import Optional
 
 
 class Pseudonymize:
@@ -48,6 +49,18 @@ def reset(self):
         self.ne_sent.clear()
         self.sentences.clear()
 
+    def _get_ne_sent_dict(self) -> dict:
+        """Group the entries of self.ne_list by their sentence index from
+        self.ne_sent (keys are str(index), values are lists of entity dicts)."""
+        ne_sent_dict = {}
+        for sent_idx, ne in zip(self.ne_sent, self.ne_list):
+            # drop a previously assigned pseudonym; mutates the dict held in ne_list
+            ne.pop("pseudonym", None)
+            if str(sent_idx) not in ne_sent_dict:
+                ne_sent_dict[str(sent_idx)] = []
+            ne_sent_dict[str(sent_idx)].append(ne)
+        return ne_sent_dict
+
     def get_sentences(self, input_text, language, model="default"):
         """Splits a text into sentences using spacy.
 
@@ -90,6 +103,49 @@ def get_ner(self, sentence, pipeline_info: dict = None):
         ner = self.ner_recognizer(sentence)
         return ner
 
+    def _check_pseudonyms_in_content(self, lang: str = "fr"):
+        """Drop pseudonyms for ``lang`` that match the first name of a detected person.
+
+        Args: lang (str): key into self.pseudo_first_names, defaults to "fr".
+        Returns: bool, True if at least one pseudonym was removed from the list.
+        Raises: ValueError if no pseudonym is left for ``lang`` afterwards."""
+        names = []
+        exclude_pseudonym = False
+        for entity in self.ne_list:
+            if entity["entity_group"] == "PER":
+                name = entity["word"]
+                # compare first names only: keep the part before the first space
+                # and ignore whatever follows it (e.g. a family name)
+                name = name.split(" ")[0] if " " in name else name
+                (  # adds name plus lower()/title() variants unless already collected
+                    names.extend([name, name.lower(), name.title()])
+                    if name not in names
+                    else None
+                )
+        # now we have collected all possible names, let's check for a match
+        if any(pseudo in names for pseudo in self.pseudo_first_names.get(lang, [])):
+            print("Found matching name(s) from pseudonyms to actual person names.")
+            print(f"Names found: {names}")
+            print(f"Pseudonyms provided: {self.pseudo_first_names.get(lang, [])}")
+            exclude_pseudonym = True
+            # drop the pseudonym from all further processing
+            self.pseudo_first_names[lang] = [
+                pseudo
+                for pseudo in self.pseudo_first_names[lang]
+                if pseudo not in names
+            ]
+            print(f"Updated pseudonyms: {self.pseudo_first_names.get(lang, [])}")
+        # ask the user to restart with other pseudonyms if none are left in the list
+        # NOTE(review): plain indexing raises KeyError when lang has no entry - intended?
+        if not self.pseudo_first_names[lang]:
+            raise ValueError(
+                """Please provide a different list of pseudonyms via the
+                             workflow settings file. The current list of pseudonyms
+                             is too short and contains only names that already
+                             exist in the actual data."""
+            )
+        return exclude_pseudonym
+
     def choose_per_pseudonym(self, name, lang="fr"):
         """Chooses a pseudonym for a PER named entity based on previously used pseudonyms.
         If the name has previously been replaced, the same pseudonym is used again.
@@ -328,12 +384,18 @@ def pseudonymize(
             if pseudo_numbers:
                 sent = self.pseudonymize_numbers(sent, detected_dates)
             pseudonymized_sentences.append(sent)
-        return self.concatenate(pseudonymized_sentences)
+        # check that pseudonyms are not the same as actual
+        # names in the current content
+        # if they are, the pseudonym is dropped for the present and all future content
+        exclude_pseudonym = (
+            self._check_pseudonyms_in_content(lang=language) if self.ne_list else False
+        )
+        return self.concatenate(pseudonymized_sentences), exclude_pseudonym
 
     def pseudonymize_with_updated_ne(
         self,
         sentences,
-        ne_sent_dict: dict[list[dict]],
+        ne_sent_dict: Optional[dict[list[dict]]],
         language="de",
         detected_dates: list[str] = None,
         pseudo_emailaddresses=True,
@@ -346,7 +408,9 @@ def pseudonymize_with_updated_ne(
 
         Args:
             sentences (list[str]): List of sentences to pseudonymize.
-            ne_sent_dict (dict[list[dict]]): Dictionary containing named entities
+            ne_sent_dict (dict[list[dict]]|None): Dictionary containing named entities.
+                If set to none, the previous named entities will be used. This is the case
+                for reprocessing emails with new pseudonyms.
             language (str, optional): Language of the email. Defaults to "de".
             detected_dates (list[str], optional): Detected dates in the email.
                 Defaults to None.
@@ -360,7 +424,12 @@ def pseudonymize_with_updated_ne(
         Returns:
             str: Pseudonymized text
         """
+        if not ne_sent_dict:
+            # the ne was ok last time, but we need to rerun with new pseudonyms
+            ne_sent_dict = self._get_ne_sent_dict()
+
         self.reset()
+        self.sentences = sentences
         pseudonymized_sentences = []
         for sent_idx, sent in enumerate(sentences):
             if pseudo_emailaddresses:
@@ -378,4 +447,10 @@ def pseudonymize_with_updated_ne(
             if pseudo_numbers:
                 sent = self.pseudonymize_numbers(sent, detected_dates)
             pseudonymized_sentences.append(sent)
-        return self.concatenate(pseudonymized_sentences)
+        # check that pseudonyms are not the same as actual
+        # names in the current content
+        # if they are, the pseudonym is dropped for the present and all future content
+        exclude_pseudonym = (
+            self._check_pseudonyms_in_content(lang=language) if self.ne_list else False
+        )
+        return self.concatenate(pseudonymized_sentences), exclude_pseudonym
diff --git a/mailcom/test/test_main.py b/mailcom/test/test_main.py
@@ -307,6 +307,18 @@ def get_data():
     ]
 
 
+@pytest.fixture()
+def get_data_small():  # a single Spanish email that names two persons
+    return [
+        {
+            "content": "Esta foto fue tomada por Alice e Angel el 28.03.2025 a las 10:30. "  # noqa
+            "Compruébelo en el archivo adjunto",
+            "attachment": 1,
+            "attachement type": ["jpg"],
+        },
+    ]
+
+
 @pytest.fixture()
 def get_settings():
     pkg = resources.files("mailcom")
@@ -440,6 +452,115 @@ def test_process_data_no_ne(get_data, get_settings, get_inout_hl):
     )
 
 
+def test_process_data_matching_pseudonym(get_data, get_settings, get_inout_hl):
+    get_inout_hl.email_list = get_data
+    new_settings = {
+        "pseudo_first_names": {
+            "es": ["Alice", "Angel", "Alex"],  # "Alice" also occurs in the data
+            "fr": ["Claude", "Dominique", "Alice"],  # "Alice" also occurs in the data
+        }
+    }
+    main._update_new_settings(get_settings, new_settings=new_settings)
+    # check that pseudonyms have been updated
+    assert get_settings["pseudo_first_names"] == new_settings["pseudo_first_names"]
+    main.process_data(get_inout_hl.get_email_list(), get_settings)
+
+    emails = get_inout_hl.get_email_list()
+    email_1 = next(emails)
+    email_2 = next(emails)
+
+    assert email_1.get("cleaned_content") == email_1.get("content")
+    assert email_1.get("lang") == "fr"
+    assert email_1.get("detected_datetime") == []
+    assert (
+        email_1.get("pseudo_content")
+        == "Claude [email] viendra au bâtiment à [number]h[number]. "
+        "Nous nous rendrons ensuite au [location]"
+    )
+    assert email_1.get("sentences") == [
+        "Alice (alice@gmail.com) viendra au bâtiment à 10h00.",
+        "Nous nous rendrons ensuite au MeetingPoint",
+    ]
+    assert email_1.get("sentences_after_email") == [
+        "Alice [email] viendra au bâtiment à 10h00.",
+        "Nous nous rendrons ensuite au MeetingPoint",
+    ]
+
+    assert email_2.get("cleaned_content") == email_2.get("content")
+    assert email_2.get("lang") == "es"
+    assert email_2.get("detected_datetime") == ["28.03.2025 a las 10:30"]
+    assert (
+        email_2.get("pseudo_content")
+        == "Esta foto fue tomada por Angel el 28.03.2025 a las 10:30. "
+        "Compruébelo en el archivo adjunto"
+    )
+    assert email_2.get("sentences") == [
+        "Esta foto fue tomada por Alice el 28.03.2025 a las 10:30.",
+        "Compruébelo en el archivo adjunto",
+    ]
+    assert email_2.get("sentences_after_email") == [
+        "Esta foto fue tomada por Alice el 28.03.2025 a las 10:30.",
+        "Compruébelo en el archivo adjunto",
+    ]
+
+
+def test_process_data_multiple_same_pseudonyms(  # two pseudonyms are real names
+    get_data_small, get_settings, get_inout_hl
+):
+    get_inout_hl.email_list = get_data_small
+    new_settings = {
+        "pseudo_first_names": {
+            "es": [
+                "Alice",  # also a person named in the email
+                "Alaya",  # does not occur in the email
+                "Angel",  # also a person named in the email
+            ],
+        }
+    }
+    main._update_new_settings(get_settings, new_settings=new_settings)
+    # check that pseudonyms have been updated
+    assert get_settings["pseudo_first_names"] == new_settings["pseudo_first_names"]
+    main.process_data(get_inout_hl.get_email_list(), get_settings)
+
+    emails = get_inout_hl.get_email_list()
+    email_2 = next(emails)  # the fixture holds a single email
+
+    assert email_2.get("cleaned_content") == email_2.get("content")
+    assert email_2.get("lang") == "es"
+    assert email_2.get("detected_datetime") == ["28.03.2025 a las 10:30"]
+    assert (
+        email_2.get("pseudo_content")
+        == "Esta foto fue tomada por Alaya e Alaya el 28.03.2025 a las 10:30. "
+        "Compruébelo en el archivo adjunto"
+    )
+    assert email_2.get("sentences") == [
+        "Esta foto fue tomada por Alice e Angel el 28.03.2025 a las 10:30.",
+        "Compruébelo en el archivo adjunto",
+    ]
+    assert email_2.get("sentences_after_email") == [
+        "Esta foto fue tomada por Alice e Angel el 28.03.2025 a las 10:30.",
+        "Compruébelo en el archivo adjunto",
+    ]
+
+
+def test_process_data_same_pseudonym_not_enough_pseudos(  # expects ValueError
+    get_data_small, get_settings, get_inout_hl
+):
+    get_inout_hl.email_list = get_data_small
+    new_settings = {
+        "pseudo_first_names": {
+            "es": [
+                "Alice",  # the only pseudonym, and also a person named in the email
+            ],
+        }
+    }
+    main._update_new_settings(get_settings, new_settings=new_settings)
+    # check that pseudonyms have been updated
+    assert get_settings["pseudo_first_names"] == new_settings["pseudo_first_names"]
+    with pytest.raises(ValueError):
+        main.process_data(get_inout_hl.get_email_list(), get_settings)
+
+
 def test_process_data_no_numbers(get_data, get_settings, get_inout_hl):
     get_settings["pseudo_numbers"] = False
     get_inout_hl.email_list = get_data
diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py