Skip to content

Commit adb09dc

Browse files
authored
Exclude a pseudonym if the name exists in the actual data (#97)
* feat: exclude a pseudonym if existing name * fix&test: forgot about python 3.9 compatibility, add test for check_pseudos * fix: remove while loop since ner is not reevaluated, sneak in codecov targets
1 parent 570d40d commit adb09dc

File tree

5 files changed

+279
-10
lines changed

5 files changed

+279
-10
lines changed

codecov.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
coverage:
2+
status:
3+
project:
4+
default:
5+
target: 80%
6+
threshold: 1%
7+
patch:
8+
target: 90%
9+
threshold: 1%

mailcom/main.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,8 +270,8 @@ def process_data(email_list: Iterator[list[dict]], workflow_settings: dict):
270270
email["detected_datetime"] = [
271271
item[0] for item in detected_time
272272
] # only keep the strings
273-
274-
pseudo_content = pseudonymizer.pseudonymize(
273+
exclude_pseudonym = False
274+
pseudo_content, exclude_pseudonym = pseudonymizer.pseudonymize(
275275
email_content,
276276
lang,
277277
model=spacy_model,
@@ -281,6 +281,20 @@ def process_data(email_list: Iterator[list[dict]], workflow_settings: dict):
281281
pseudo_ne=pseudo_ne,
282282
pseudo_numbers=pseudo_numbers,
283283
)
284+
# make sure ne pseudonymization is restarted in case of
285+
# matching pseudonym
286+
# note that the matching pseudonym is subsequently excluded
287+
# from all further processing but will be present in the initial
288+
# data entries
289+
pseudo_content, _ = pseudonymizer.pseudonymize_with_updated_ne(
290+
copy.deepcopy(pseudonymizer.sentences),
291+
None,
292+
language=lang,
293+
detected_dates=email.get("detected_datetime", None),
294+
pseudo_emailaddresses=pseudo_emailaddresses,
295+
pseudo_ne=pseudo_ne,
296+
pseudo_numbers=pseudo_numbers,
297+
)
284298
# use deepcopy to avoid issue with mutable objects
285299
email["pseudo_content"] = pseudo_content
286300
email["ne_list"] = copy.deepcopy(pseudonymizer.ne_list)

mailcom/parse.py

Lines changed: 79 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from mailcom import utils
22
import re
3+
from typing import Optional
34

45

56
class Pseudonymize:
@@ -48,6 +49,18 @@ def reset(self):
4849
self.ne_sent.clear()
4950
self.sentences.clear()
5051

52+
def _get_ne_sent_dict(self) -> dict:
53+
"""Convert the list of named entities and their sentence
54+
indices into a dictionary."""
55+
ne_sent_dict = {}
56+
for sent_idx, ne in zip(self.ne_sent, self.ne_list):
57+
# drop any existing pseudonyms in ne_list
58+
ne.pop("pseudonym", None)
59+
if str(sent_idx) not in ne_sent_dict:
60+
ne_sent_dict[str(sent_idx)] = []
61+
ne_sent_dict[str(sent_idx)].append(ne)
62+
return ne_sent_dict
63+
5164
def get_sentences(self, input_text, language, model="default"):
5265
"""Splits a text into sentences using spacy.
5366
@@ -90,6 +103,49 @@ def get_ner(self, sentence, pipeline_info: dict = None):
90103
ner = self.ner_recognizer(sentence)
91104
return ner
92105

106+
def _check_pseudonyms_in_content(self, lang: str = "fr"):
107+
"""Checks if any of the pseudonyms are present in the current content.
108+
109+
Args:
110+
lang (str): Language context of the data, defaults to "fr".
111+
"""
112+
names = []
113+
exclude_pseudonym = False
114+
for entity in self.ne_list:
115+
if entity["entity_group"] == "PER":
116+
name = entity["word"]
117+
# here we should consider first names only, without
118+
# the family name after the space
119+
name = name.split(" ")[0] if " " in name else name
120+
(
121+
names.extend([name, name.lower(), name.title()])
122+
if name not in names
123+
else None
124+
)
125+
# now we have collected all possible names, let's check for a match
126+
if any(pseudo in names for pseudo in self.pseudo_first_names.get(lang, [])):
127+
print("Found matching name(s) from pseudonyms to actual person names.")
128+
print(f"Names found: {names}")
129+
print(f"Pseudonyms provided: {self.pseudo_first_names.get(lang, [])}")
130+
exclude_pseudonym = True
131+
# drop the pseudonym from all further processing
132+
self.pseudo_first_names[lang] = [
133+
pseudo
134+
for pseudo in self.pseudo_first_names[lang]
135+
if pseudo not in names
136+
]
137+
print(f"Updated pseudonyms: {self.pseudo_first_names.get(lang, [])}")
138+
# raise an exception for the user to restart with other pseudonyms if there are
139+
# no pseudonyms left in the list
140+
if not self.pseudo_first_names[lang]:
141+
raise ValueError(
142+
"""Please provide a different list of pseudonyms via the
143+
workflow settings file. The current list of pseudonyms
144+
is too short and contains only names that already
145+
exist in the actual data."""
146+
)
147+
return exclude_pseudonym
148+
93149
def choose_per_pseudonym(self, name, lang="fr"):
94150
"""Chooses a pseudonym for a PER named entity based on previously used pseudonyms.
95151
If the name has previously been replaced, the same pseudonym is used again.
@@ -328,12 +384,18 @@ def pseudonymize(
328384
if pseudo_numbers:
329385
sent = self.pseudonymize_numbers(sent, detected_dates)
330386
pseudonymized_sentences.append(sent)
331-
return self.concatenate(pseudonymized_sentences)
387+
# check that pseudonyms are not the same as actual
388+
# names in the current content
389+
# if they are, the pseudonym is dropped for the present and all future content
390+
exclude_pseudonym = (
391+
self._check_pseudonyms_in_content(lang=language) if self.ne_list else False
392+
)
393+
return self.concatenate(pseudonymized_sentences), exclude_pseudonym
332394

333395
def pseudonymize_with_updated_ne(
334396
self,
335397
sentences,
336-
ne_sent_dict: dict[list[dict]],
398+
ne_sent_dict: Optional[dict[list[dict]]],
337399
language="de",
338400
detected_dates: list[str] = None,
339401
pseudo_emailaddresses=True,
@@ -346,7 +408,9 @@ def pseudonymize_with_updated_ne(
346408
347409
Args:
348410
sentences (list[str]): List of sentences to pseudonymize.
349-
ne_sent_dict (dict[list[dict]]): Dictionary containing named entities
411+
ne_sent_dict (dict[list[dict]]|None): Dictionary containing named entities.
412+
If None or empty, the previous named entities will be used. This is the case
413+
for reprocessing emails with new pseudonyms.
350414
language (str, optional): Language of the email. Defaults to "de".
351415
detected_dates (list[str], optional): Detected dates in the email.
352416
Defaults to None.
@@ -360,7 +424,12 @@ def pseudonymize_with_updated_ne(
360424
Returns:
361425
tuple[str, bool]: Pseudonymized text, and whether a pseudonym was excluded because it matches a person name in the content
362426
"""
427+
if not ne_sent_dict:
428+
# the ne was ok last time, but we need to rerun with new pseudonyms
429+
ne_sent_dict = self._get_ne_sent_dict()
430+
363431
self.reset()
432+
self.sentences = sentences
364433
pseudonymized_sentences = []
365434
for sent_idx, sent in enumerate(sentences):
366435
if pseudo_emailaddresses:
@@ -378,4 +447,10 @@ def pseudonymize_with_updated_ne(
378447
if pseudo_numbers:
379448
sent = self.pseudonymize_numbers(sent, detected_dates)
380449
pseudonymized_sentences.append(sent)
381-
return self.concatenate(pseudonymized_sentences)
450+
# check that pseudonyms are not the same as actual
451+
# names in the current content
452+
# if they are, the pseudonym is dropped for the present and all future content
453+
exclude_pseudonym = (
454+
self._check_pseudonyms_in_content(lang=language) if self.ne_list else False
455+
)
456+
return self.concatenate(pseudonymized_sentences), exclude_pseudonym

mailcom/test/test_main.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,18 @@ def get_data():
307307
]
308308

309309

310+
@pytest.fixture()
311+
def get_data_small():
312+
return [
313+
{
314+
"content": "Esta foto fue tomada por Alice e Angel el 28.03.2025 a las 10:30. " # noqa
315+
"Compruébelo en el archivo adjunto",
316+
"attachment": 1,
317+
"attachement type": ["jpg"],
318+
},
319+
]
320+
321+
310322
@pytest.fixture()
311323
def get_settings():
312324
pkg = resources.files("mailcom")
@@ -440,6 +452,115 @@ def test_process_data_no_ne(get_data, get_settings, get_inout_hl):
440452
)
441453

442454

455+
def test_process_data_matching_pseudonym(get_data, get_settings, get_inout_hl):
456+
get_inout_hl.email_list = get_data
457+
new_settings = {
458+
"pseudo_first_names": {
459+
"es": ["Alice", "Angel", "Alex"],
460+
"fr": ["Claude", "Dominique", "Alice"],
461+
}
462+
}
463+
main._update_new_settings(get_settings, new_settings=new_settings)
464+
# check that pseudonyms have been updated
465+
assert get_settings["pseudo_first_names"] == new_settings["pseudo_first_names"]
466+
main.process_data(get_inout_hl.get_email_list(), get_settings)
467+
468+
emails = get_inout_hl.get_email_list()
469+
email_1 = next(emails)
470+
email_2 = next(emails)
471+
472+
assert email_1.get("cleaned_content") == email_1.get("content")
473+
assert email_1.get("lang") == "fr"
474+
assert email_1.get("detected_datetime") == []
475+
assert (
476+
email_1.get("pseudo_content")
477+
== "Claude [email] viendra au bâtiment à [number]h[number]. "
478+
"Nous nous rendrons ensuite au [location]"
479+
)
480+
assert email_1.get("sentences") == [
481+
"Alice (alice@gmail.com) viendra au bâtiment à 10h00.",
482+
"Nous nous rendrons ensuite au MeetingPoint",
483+
]
484+
assert email_1.get("sentences_after_email") == [
485+
"Alice [email] viendra au bâtiment à 10h00.",
486+
"Nous nous rendrons ensuite au MeetingPoint",
487+
]
488+
489+
assert email_2.get("cleaned_content") == email_2.get("content")
490+
assert email_2.get("lang") == "es"
491+
assert email_2.get("detected_datetime") == ["28.03.2025 a las 10:30"]
492+
assert (
493+
email_2.get("pseudo_content")
494+
== "Esta foto fue tomada por Angel el 28.03.2025 a las 10:30. "
495+
"Compruébelo en el archivo adjunto"
496+
)
497+
assert email_2.get("sentences") == [
498+
"Esta foto fue tomada por Alice el 28.03.2025 a las 10:30.",
499+
"Compruébelo en el archivo adjunto",
500+
]
501+
assert email_2.get("sentences_after_email") == [
502+
"Esta foto fue tomada por Alice el 28.03.2025 a las 10:30.",
503+
"Compruébelo en el archivo adjunto",
504+
]
505+
506+
507+
def test_process_data_multiple_same_pseudonyms(
508+
get_data_small, get_settings, get_inout_hl
509+
):
510+
get_inout_hl.email_list = get_data_small
511+
new_settings = {
512+
"pseudo_first_names": {
513+
"es": [
514+
"Alice",
515+
"Alaya",
516+
"Angel",
517+
],
518+
}
519+
}
520+
main._update_new_settings(get_settings, new_settings=new_settings)
521+
# check that pseudonyms have been updated
522+
assert get_settings["pseudo_first_names"] == new_settings["pseudo_first_names"]
523+
main.process_data(get_inout_hl.get_email_list(), get_settings)
524+
525+
emails = get_inout_hl.get_email_list()
526+
email_2 = next(emails)
527+
528+
assert email_2.get("cleaned_content") == email_2.get("content")
529+
assert email_2.get("lang") == "es"
530+
assert email_2.get("detected_datetime") == ["28.03.2025 a las 10:30"]
531+
assert (
532+
email_2.get("pseudo_content")
533+
== "Esta foto fue tomada por Alaya e Alaya el 28.03.2025 a las 10:30. "
534+
"Compruébelo en el archivo adjunto"
535+
)
536+
assert email_2.get("sentences") == [
537+
"Esta foto fue tomada por Alice e Angel el 28.03.2025 a las 10:30.",
538+
"Compruébelo en el archivo adjunto",
539+
]
540+
assert email_2.get("sentences_after_email") == [
541+
"Esta foto fue tomada por Alice e Angel el 28.03.2025 a las 10:30.",
542+
"Compruébelo en el archivo adjunto",
543+
]
544+
545+
546+
def test_process_data_same_pseudonym_not_enough_pseudos(
547+
get_data_small, get_settings, get_inout_hl
548+
):
549+
get_inout_hl.email_list = get_data_small
550+
new_settings = {
551+
"pseudo_first_names": {
552+
"es": [
553+
"Alice",
554+
],
555+
}
556+
}
557+
main._update_new_settings(get_settings, new_settings=new_settings)
558+
# check that pseudonyms have been updated
559+
assert get_settings["pseudo_first_names"] == new_settings["pseudo_first_names"]
560+
with pytest.raises(ValueError):
561+
main.process_data(get_inout_hl.get_email_list(), get_settings)
562+
563+
443564
def test_process_data_no_numbers(get_data, get_settings, get_inout_hl):
444565
get_settings["pseudo_numbers"] = False
445566
get_inout_hl.email_list = get_data

0 commit comments

Comments
 (0)