Skip to content

Commit 2b28022

Browse files
authored
Merge pull request #117 from vmenger/fix-double-initials
Fix double initials
2 parents 4f74303 + 380ee2b commit 2b28022

File tree

5 files changed: 106 additions (+106) and 3 deletions (-3)

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## 2.4.1 (2023-11-15)
9+
10+
### Added
11+
- detection of initials `Ch.`, `Chr.`, `Ph.` and `Th.`
12+
813
## 2.4.0 (2023-11-15)
914

1015
### Added

deduce/annotator.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,11 @@ def match(cls, pattern_position: dict, **kwargs) -> bool: # pylint: disable=R09
5050
return re.match(value, kwargs.get("token").text) is not None
5151
if func == "is_initial":
5252
return (
53-
len(kwargs.get("token").text) == 1
54-
and kwargs.get("token").text[0].isupper()
53+
(
54+
len(kwargs.get("token").text) == 1
55+
and kwargs.get("token").text[0].isupper()
56+
)
57+
or kwargs.get("token").text in {"Ch", "Chr", "Ph", "Th"}
5558
) == value
5659
if func == "is_initials":
5760
return (

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "deduce"
3-
version = "2.4.0"
3+
version = "2.4.1"
44
description = "Deduce: de-identification method for Dutch medical text"
55
authors = ["Vincent Menger <vmenger@protonmail.com>"]
66
maintainers = ["Vincent Menger <vmenger@protonmail.com>"]

tests/regression/data/names.json

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -901,6 +901,96 @@
901901
"tag": "persoon"
902902
}
903903
]
904+
},
905+
{
906+
"id": 101,
907+
"text": "Ph. Van der Laan",
908+
"annotations": [
909+
{
910+
"text": "Ph. Van der Laan",
911+
"start_char": 0,
912+
"end_char": 16,
913+
"tag": "persoon"
914+
}
915+
]
916+
},
917+
{
918+
"id": 102,
919+
"text": "A.Th.Chr. Van der Laan",
920+
"annotations": [
921+
{
922+
"text": "A.Th.Chr. Van der Laan",
923+
"start_char": 0,
924+
"end_char": 22,
925+
"tag": "persoon"
926+
}
927+
]
928+
},
929+
{
930+
"id": 103,
931+
"text": "Ah. Van der Laan",
932+
"annotations": [
933+
{
934+
"text": "Van der Laan",
935+
"start_char": 4,
936+
"end_char": 16,
937+
"tag": "persoon"
938+
}
939+
]
940+
},
941+
{
942+
"id": 104,
943+
"text": "J. Th. Bakker",
944+
"annotations": [
945+
{
946+
"text": "J. Th. Bakker",
947+
"start_char": 0,
948+
"end_char": 13,
949+
"tag": "persoon"
950+
}
951+
]
952+
},
953+
{
954+
"id": 105,
955+
"text": "J. Th. A. Bakker",
956+
"annotations": [
957+
{
958+
"text": "J. Th. A. Bakker",
959+
"start_char": 0,
960+
"end_char": 16,
961+
"tag": "persoon"
962+
}
963+
]
964+
},
965+
{
966+
"id": 106,
967+
"text": "Prof. Dr. Th. Bakker",
968+
"annotations": [
969+
{
970+
"text": "Prof. Dr. Th. Bakker",
971+
"start_char": 0,
972+
"end_char": 20,
973+
"tag": "persoon"
974+
}
975+
]
976+
},
977+
{
978+
"id": 107,
979+
"text": "Prof. Dr. Th. Ir. Bakker",
980+
"annotations": [
981+
{
982+
"text": "Prof. Dr. Th",
983+
"start_char": 0,
984+
"end_char": 12,
985+
"tag": "persoon"
986+
},
987+
{
988+
"text": "Ir. Bakker",
989+
"start_char": 14,
990+
"end_char": 24,
991+
"tag": "persoon"
992+
}
993+
]
904994
}
905995
]
906996
}

tests/unit/test_annotator.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,12 @@ def test_match_is_initial(self):
8282
pattern_position = {"is_initial": True}
8383

8484
assert _PatternPositionMatcher.match(pattern_position, token=token("A"))
85+
assert _PatternPositionMatcher.match(pattern_position, token=token("Ch"))
86+
assert _PatternPositionMatcher.match(pattern_position, token=token("Chr"))
87+
assert _PatternPositionMatcher.match(pattern_position, token=token("Ph"))
88+
assert _PatternPositionMatcher.match(pattern_position, token=token("Th"))
8589
assert not _PatternPositionMatcher.match(pattern_position, token=token("a"))
90+
assert not _PatternPositionMatcher.match(pattern_position, token=token("Ah"))
8691
assert not _PatternPositionMatcher.match(pattern_position, token=token("Abcd"))
8792

8893
def test_match_like_name(self):

0 commit comments

Comments
 (0)