Skip to content

Commit 29d6b0b

Browse files
authored
Merge pull request #124 from vmenger/detect-multi-token-names
Detect multi token names
2 parents 0a12310 + 3aa0961 commit 29d6b0b

File tree

10 files changed

+131
-53
lines changed

10 files changed

+131
-53
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## 2.4.4 (2023-11-22)
9+
10+
### Changed
11+
- multi-token lookup for first and last names, so multi-token names are now detected
12+
- some small lookup list additions
13+
814
## 2.4.3 (2023-11-22)
915

1016
### Changed

config.json

Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -128,49 +128,19 @@
128128
}
129129
},
130130
"first_name_lookup": {
131-
"annotator_type": "token_pattern",
131+
"annotator_type": "multi_token",
132132
"group": "names",
133133
"args": {
134134
"tag": "voornaam",
135-
"skip": [],
136-
"pattern": [
137-
{
138-
"and": [
139-
{
140-
"lookup": "first_names"
141-
},
142-
{
143-
"neg_lookup": "first_name_exceptions"
144-
},
145-
{
146-
"neg_lookup": "whitelist"
147-
}
148-
]
149-
}
150-
]
135+
"lookup_values": "first_names"
151136
}
152137
},
153138
"surname_lookup": {
154-
"annotator_type": "token_pattern",
139+
"annotator_type": "multi_token",
155140
"group": "names",
156141
"args": {
157142
"tag": "achternaam",
158-
"skip": [],
159-
"pattern": [
160-
{
161-
"and": [
162-
{
163-
"lookup": "surnames"
164-
},
165-
{
166-
"neg_lookup": "surname_exceptions"
167-
},
168-
{
169-
"neg_lookup": "whitelist"
170-
}
171-
]
172-
}
173-
]
143+
"lookup_values": "surnames"
174144
}
175145
},
176146
"person_first_name": {

deduce-data/lookup_lists/medical_terms.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3386,6 +3386,7 @@ lange
33863386
langs
33873387
langzame
33883388
langzamer
3389+
lap
33893390
laquo
33903391
largactil
33913392
laryngeus
@@ -4872,6 +4873,7 @@ population
48724873
porfyrie
48734874
porphyria
48744875
portae
4876+
pos
48754877
positief
48764878
positive
48774879
postbus
@@ -5421,6 +5423,7 @@ scheikunde
54215423
scheikundig
54225424
scheikundige
54235425
schele
5426+
schep
54245427
scherp
54255428
scherpe
54265429
scherpstelling
@@ -5500,6 +5503,7 @@ sensitization
55005503
sensorieel
55015504
sensorisch
55025505
sensory
5506+
sep
55035507
sepsis
55045508
septi
55055509
septic
@@ -5681,6 +5685,7 @@ spondylitis
56815685
spondylolyse
56825686
spongiforme
56835687
spontaan
5688+
spoor
56845689
spoorelement
56855690
sporadic
56865691
sporadisch

deduce-data/lookup_lists/names/first_names.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,6 @@ Anne-Roos
854854
Anne-Ruth
855855
Anne-Sophie
856856
Anne-Wil
857-
Anne-marie
858857
Annebel
859858
Annebelle
860859
Annebet
@@ -7134,8 +7133,6 @@ Jo-An
71347133
Jo-Ann
71357134
Jo-Anna
71367135
Jo-Anne
7137-
Jo-ann
7138-
Jo-anne
71397136
Joa
71407137
Joab
71417138
Joachem
@@ -8009,7 +8006,6 @@ Kwint
80098006
Kwinten
80108007
Kwok
80118008
Ky-Mani
8012-
Ky-mani
80138009
Kyan
80148010
Kyana
80158011
Kyandro

deduce-data/lookup_lists/names/interfixes.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ in het
1818
l'
1919
la
2020
le
21+
lo
2122
op 't
2223
op de
2324
op den

deduce-data/lookup_lists/top_1000_terms.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ bui
128128
buiten
129129
bureau
130130
buren
131+
burger
131132
bus
132133
buurman
133134
buurvrouw
@@ -184,8 +185,10 @@ drinken
184185
drogen
185186
dromen
186187
droog
188+
droog
187189
druk
188190
dubbel
191+
duits
189192
dun
190193
dus
191194
duur
@@ -207,6 +210,7 @@ elektrisch
207210
elk
208211
elke
209212
en
213+
engels
210214
enkele
211215
enthousiast
212216
er
@@ -267,6 +271,7 @@ gevaar
267271
gevaarlijk
268272
gevangenis
269273
geven
274+
geven
270275
gevolg
271276
gewicht
272277
gewoon
@@ -550,6 +555,7 @@ naam
550555
naar
551556
naast
552557
nacht
558+
nader
553559
nat
554560
natuur
555561
natuurlijk
@@ -621,6 +627,7 @@ opnemen
621627
oranje
622628
orde
623629
oud
630+
oud
624631
ouder
625632
over
626633
overeenkomen
@@ -637,6 +644,7 @@ park
637644
partner
638645
pas
639646
passeren
647+
pauw
640648
pen
641649
peper
642650
per

deduce/lookup_sets.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,17 @@ def _get_first_names() -> dd.ds.LookupSet:
3434
cleaning_pipeline=[dd.str.FilterByLength(min_len=2)],
3535
)
3636

37+
first_name_exceptions = _get_first_name_exceptions()
38+
39+
first_names.remove_items_from_iterable(first_name_exceptions)
40+
41+
first_names.add_items_from_self(
42+
cleaning_pipeline=[
43+
FilterBasedOnLookupSet(filter_set=_get_whitelist(), case_sensitive=False),
44+
],
45+
replace=True,
46+
)
47+
3748
return first_names
3849

3950

@@ -92,6 +103,17 @@ def _get_surnames() -> dd.ds.LookupSet:
92103
cleaning_pipeline=[dd.str.FilterByLength(min_len=2)],
93104
)
94105

106+
surname_exceptions = _get_surname_exceptions()
107+
108+
surnames.remove_items_from_iterable(surname_exceptions)
109+
110+
surnames.add_items_from_self(
111+
cleaning_pipeline=[
112+
FilterBasedOnLookupSet(filter_set=_get_whitelist(), case_sensitive=False),
113+
],
114+
replace=True,
115+
)
116+
95117
return surnames
96118

97119

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "deduce"
3-
version = "2.4.3"
3+
version = "2.4.4"
44
description = "Deduce: de-identification method for Dutch medical text"
55
authors = ["Vincent Menger <vmenger@protonmail.com>"]
66
maintainers = ["Vincent Menger <vmenger@protonmail.com>"]

tests/regression/data/names.json

Lines changed: 84 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -843,19 +843,6 @@
843843
}
844844
]
845845
},
846-
{
847-
"id": 94,
848-
"text": "Voornaam-Voornaam Achternaam",
849-
"annotations": [
850-
{
851-
"text": "Voornaam-Voornaam Achternaam",
852-
"start_char": 0,
853-
"end_char": 28,
854-
"tag": "persoon"
855-
}
856-
]
857-
},
858-
859846
{
860847
"id": 95,
861848
"text": "de Heer",
@@ -1043,6 +1030,90 @@
10431030
"tag": "persoon"
10441031
}
10451032
]
1033+
},
1034+
{
1035+
"id": 117,
1036+
"text": "Jan-Willem",
1037+
"annotations": [
1038+
{
1039+
"text": "Jan-Willem",
1040+
"start_char": 0,
1041+
"end_char": 10,
1042+
"tag": "persoon"
1043+
}
1044+
]
1045+
},
1046+
{
1047+
"id": 118,
1048+
"text": "Jan-Onbekendenaam",
1049+
"annotations": [
1050+
{
1051+
"text": "Jan-Onbekendenaam",
1052+
"start_char": 0,
1053+
"end_char": 17,
1054+
"tag": "persoon"
1055+
}
1056+
]
1057+
},
1058+
{
1059+
"id": 119,
1060+
"text": "Onbekendenaam-Willem",
1061+
"annotations": [
1062+
{
1063+
"text": "Onbekendenaam-Willem",
1064+
"start_char": 0,
1065+
"end_char": 20,
1066+
"tag": "persoon"
1067+
}
1068+
]
1069+
},
1070+
{
1071+
"id": 120,
1072+
"text": "El Ahmadi",
1073+
"annotations": [
1074+
{
1075+
"text": "Ahmadi",
1076+
"start_char": 3,
1077+
"end_char": 9,
1078+
"tag": "persoon"
1079+
}
1080+
]
1081+
},
1082+
{
1083+
"id": 121,
1084+
"text": "Bruins Slot",
1085+
"annotations": [
1086+
{
1087+
"text": "Bruins Slot",
1088+
"start_char": 0,
1089+
"end_char": 11,
1090+
"tag": "persoon"
1091+
}
1092+
]
1093+
},
1094+
{
1095+
"id": 122,
1096+
"text": "Groot Wassink",
1097+
"annotations": [
1098+
{
1099+
"text": "Groot Wassink",
1100+
"start_char": 0,
1101+
"end_char": 13,
1102+
"tag": "persoon"
1103+
}
1104+
]
1105+
},
1106+
{
1107+
"id": 123,
1108+
"text": "Pieter Oude Nijhuis",
1109+
"annotations": [
1110+
{
1111+
"text": "Pieter Oude Nijhuis",
1112+
"start_char": 0,
1113+
"end_char": 19,
1114+
"tag": "persoon"
1115+
}
1116+
]
10461117
}
10471118
]
10481119
}

tests/regression/test_regression.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ def test_regression_name(self, model):
5353
"name_context",
5454
"person_annotation_converter",
5555
},
56-
known_failures={94},
5756
)
5857

5958
def test_regression_location(self, model):

0 commit comments

Comments
 (0)