File tree Expand file tree Collapse file tree 10 files changed +131
-53
lines changed
Expand file tree Collapse file tree 10 files changed +131
-53
lines changed Original file line number Diff line number Diff line change @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55The format is based on [ Keep a Changelog] ( https://keepachangelog.com/en/1.0.0/ ) ,
66and this project adheres to [ Semantic Versioning] ( https://semver.org/spec/v2.0.0.html ) .
77
8+ ## 2.4.4 (2023-11-22)
9+
10+ ### Changed
11+ - multi-token lookup for first and last names, so multi-token names are now detected
12+ - some small lookup list additions
13+
814## 2.4.3 (2023-11-22)
915
1016### Changed
Original file line number Diff line number Diff line change 128128 }
129129 },
130130 "first_name_lookup" : {
131- "annotator_type" : " token_pattern " ,
131+ "annotator_type" : " multi_token " ,
132132 "group" : " names" ,
133133 "args" : {
134134 "tag" : " voornaam" ,
135- "skip" : [],
136- "pattern" : [
137- {
138- "and" : [
139- {
140- "lookup" : " first_names"
141- },
142- {
143- "neg_lookup" : " first_name_exceptions"
144- },
145- {
146- "neg_lookup" : " whitelist"
147- }
148- ]
149- }
150- ]
135+ "lookup_values" : " first_names"
151136 }
152137 },
153138 "surname_lookup" : {
154- "annotator_type" : " token_pattern " ,
139+ "annotator_type" : " multi_token " ,
155140 "group" : " names" ,
156141 "args" : {
157142 "tag" : " achternaam" ,
158- "skip" : [],
159- "pattern" : [
160- {
161- "and" : [
162- {
163- "lookup" : " surnames"
164- },
165- {
166- "neg_lookup" : " surname_exceptions"
167- },
168- {
169- "neg_lookup" : " whitelist"
170- }
171- ]
172- }
173- ]
143+ "lookup_values" : " surnames"
174144 }
175145 },
176146 "person_first_name" : {
Original file line number Diff line number Diff line change @@ -3386,6 +3386,7 @@ lange
33863386langs
33873387langzame
33883388langzamer
3389+ lap
33893390laquo
33903391largactil
33913392laryngeus
@@ -4872,6 +4873,7 @@ population
48724873porfyrie
48734874porphyria
48744875portae
4876+ pos
48754877positief
48764878positive
48774879postbus
@@ -5421,6 +5423,7 @@ scheikunde
54215423scheikundig
54225424scheikundige
54235425schele
5426+ schep
54245427scherp
54255428scherpe
54265429scherpstelling
@@ -5500,6 +5503,7 @@ sensitization
55005503sensorieel
55015504sensorisch
55025505sensory
5506+ sep
55035507sepsis
55045508septi
55055509septic
@@ -5681,6 +5685,7 @@ spondylitis
56815685spondylolyse
56825686spongiforme
56835687spontaan
5688+ spoor
56845689spoorelement
56855690sporadic
56865691sporadisch
Original file line number Diff line number Diff line change @@ -854,7 +854,6 @@ Anne-Roos
854854Anne-Ruth
855855Anne-Sophie
856856Anne-Wil
857- Anne-marie
858857Annebel
859858Annebelle
860859Annebet
@@ -7134,8 +7133,6 @@ Jo-An
71347133Jo-Ann
71357134Jo-Anna
71367135Jo-Anne
7137- Jo-ann
7138- Jo-anne
71397136Joa
71407137Joab
71417138Joachem
@@ -8009,7 +8006,6 @@ Kwint
80098006Kwinten
80108007Kwok
80118008Ky-Mani
8012- Ky-mani
80138009Kyan
80148010Kyana
80158011Kyandro
Original file line number Diff line number Diff line change 1818l'
1919la
2020le
21+ lo
2122op 't
2223op de
2324op den
Original file line number Diff line number Diff line change 128128buiten
129129bureau
130130buren
131+ burger
131132bus
132133buurman
133134buurvrouw
@@ -184,8 +185,10 @@ drinken
184185drogen
185186dromen
186187droog
188+ droog
187189druk
188190dubbel
191+ duits
189192dun
190193dus
191194duur
@@ -207,6 +210,7 @@ elektrisch
207210elk
208211elke
209212en
213+ engels
210214enkele
211215enthousiast
212216er
@@ -267,6 +271,7 @@ gevaar
267271gevaarlijk
268272gevangenis
269273geven
274+ geven
270275gevolg
271276gewicht
272277gewoon
550555naar
551556naast
552557nacht
558+ nader
553559nat
554560natuur
555561natuurlijk
@@ -621,6 +627,7 @@ opnemen
621627oranje
622628orde
623629oud
630+ oud
624631ouder
625632over
626633overeenkomen
637644partner
638645pas
639646passeren
647+ pauw
640648pen
641649peper
642650per
Original file line number Diff line number Diff line change @@ -34,6 +34,17 @@ def _get_first_names() -> dd.ds.LookupSet:
3434 cleaning_pipeline = [dd .str .FilterByLength (min_len = 2 )],
3535 )
3636
37+ first_name_exceptions = _get_first_name_exceptions ()
38+
39+ first_names .remove_items_from_iterable (first_name_exceptions )
40+
41+ first_names .add_items_from_self (
42+ cleaning_pipeline = [
43+ FilterBasedOnLookupSet (filter_set = _get_whitelist (), case_sensitive = False ),
44+ ],
45+ replace = True ,
46+ )
47+
3748 return first_names
3849
3950
@@ -92,6 +103,17 @@ def _get_surnames() -> dd.ds.LookupSet:
92103 cleaning_pipeline = [dd .str .FilterByLength (min_len = 2 )],
93104 )
94105
106+ surname_exceptions = _get_surname_exceptions ()
107+
108+ surnames .remove_items_from_iterable (surname_exceptions )
109+
110+ surnames .add_items_from_self (
111+ cleaning_pipeline = [
112+ FilterBasedOnLookupSet (filter_set = _get_whitelist (), case_sensitive = False ),
113+ ],
114+ replace = True ,
115+ )
116+
95117 return surnames
96118
97119
Original file line number Diff line number Diff line change 11[tool .poetry ]
22name = " deduce"
3- version = " 2.4.3 "
3+ version = " 2.4.4 "
44description = " Deduce: de-identification method for Dutch medical text"
55authors = [" Vincent Menger <vmenger@protonmail.com>" ]
66maintainers = [" Vincent Menger <vmenger@protonmail.com>" ]
Original file line number Diff line number Diff line change 843843 }
844844 ]
845845 },
846- {
847- "id" : 94 ,
848- "text" : " Voornaam-Voornaam Achternaam" ,
849- "annotations" : [
850- {
851- "text" : " Voornaam-Voornaam Achternaam" ,
852- "start_char" : 0 ,
853- "end_char" : 28 ,
854- "tag" : " persoon"
855- }
856- ]
857- },
858-
859846 {
860847 "id" : 95 ,
861848 "text" : " de Heer" ,
10431030 "tag" : " persoon"
10441031 }
10451032 ]
1033+ },
1034+ {
1035+ "id" : 117 ,
1036+ "text" : " Jan-Willem" ,
1037+ "annotations" : [
1038+ {
1039+ "text" : " Jan-Willem" ,
1040+ "start_char" : 0 ,
1041+ "end_char" : 10 ,
1042+ "tag" : " persoon"
1043+ }
1044+ ]
1045+ },
1046+ {
1047+ "id" : 118 ,
1048+ "text" : " Jan-Onbekendenaam" ,
1049+ "annotations" : [
1050+ {
1051+ "text" : " Jan-Onbekendenaam" ,
1052+ "start_char" : 0 ,
1053+ "end_char" : 17 ,
1054+ "tag" : " persoon"
1055+ }
1056+ ]
1057+ },
1058+ {
1059+ "id" : 119 ,
1060+ "text" : " Onbekendenaam-Willem" ,
1061+ "annotations" : [
1062+ {
1063+ "text" : " Onbekendenaam-Willem" ,
1064+ "start_char" : 0 ,
1065+ "end_char" : 20 ,
1066+ "tag" : " persoon"
1067+ }
1068+ ]
1069+ },
1070+ {
1071+ "id" : 120 ,
1072+ "text" : " El Ahmadi" ,
1073+ "annotations" : [
1074+ {
1075+ "text" : " Ahmadi" ,
1076+ "start_char" : 3 ,
1077+ "end_char" : 9 ,
1078+ "tag" : " persoon"
1079+ }
1080+ ]
1081+ },
1082+ {
1083+ "id" : 121 ,
1084+ "text" : " Bruins Slot" ,
1085+ "annotations" : [
1086+ {
1087+ "text" : " Bruins Slot" ,
1088+ "start_char" : 0 ,
1089+ "end_char" : 11 ,
1090+ "tag" : " persoon"
1091+ }
1092+ ]
1093+ },
1094+ {
1095+ "id" : 122 ,
1096+ "text" : " Groot Wassink" ,
1097+ "annotations" : [
1098+ {
1099+ "text" : " Groot Wassink" ,
1100+ "start_char" : 0 ,
1101+ "end_char" : 13 ,
1102+ "tag" : " persoon"
1103+ }
1104+ ]
1105+ },
1106+ {
1107+ "id" : 123 ,
1108+ "text" : " Pieter Oude Nijhuis" ,
1109+ "annotations" : [
1110+ {
1111+ "text" : " Pieter Oude Nijhuis" ,
1112+ "start_char" : 0 ,
1113+ "end_char" : 19 ,
1114+ "tag" : " persoon"
1115+ }
1116+ ]
10461117 }
10471118 ]
10481119}
Original file line number Diff line number Diff line change @@ -53,7 +53,6 @@ def test_regression_name(self, model):
5353 "name_context" ,
5454 "person_annotation_converter" ,
5555 },
56- known_failures = {94 },
5756 )
5857
5958 def test_regression_location (self , model ):
You can’t perform that action at this time.
0 commit comments