Merge pull request #117 from vmenger/fix-double-initials

vmenger · web-flow · commit 2b2802227e18 · 2023-11-15T15:46:13.000+01:00
Fix double initials
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 2.4.1 (2023-11-15)
+
+### Added
+- detection of multi-letter initials `Ch.`, `Chr.`, `Ph.` and `Th.`
+
 ## 2.4.0 (2023-11-15)
 
 ### Added
diff --git a/deduce/annotator.py b/deduce/annotator.py
@@ -50,8 +50,11 @@ def match(cls, pattern_position: dict, **kwargs) -> bool:  # pylint: disable=R09
             return re.match(value, kwargs.get("token").text) is not None
         if func == "is_initial":
             return (
-                len(kwargs.get("token").text) == 1
-                and kwargs.get("token").text[0].isupper()
+                (
+                    len(kwargs.get("token").text) == 1
+                    and kwargs.get("token").text[0].isupper()
+                )
+                or kwargs.get("token").text in {"Ch", "Chr", "Ph", "Th"}
             ) == value
         if func == "is_initials":
             return (
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "deduce"
-version = "2.4.0"
+version = "2.4.1"
 description = "Deduce: de-identification method for Dutch medical text"
 authors = ["Vincent Menger <vmenger@protonmail.com>"]
 maintainers = ["Vincent Menger <vmenger@protonmail.com>"]
diff --git a/tests/regression/data/names.json b/tests/regression/data/names.json
@@ -901,6 +901,96 @@
                     "tag": "persoon"
                 }
             ]
+        },
+        {
+            "id": 101,
+            "text": "Ph. Van der Laan",
+            "annotations": [
+                {
+                    "text": "Ph. Van der Laan",
+                    "start_char": 0,
+                    "end_char": 16,
+                    "tag": "persoon"
+                }
+            ]
+        },
+        {
+            "id": 102,
+            "text": "A.Th.Chr. Van der Laan",
+            "annotations": [
+                {
+                    "text": "A.Th.Chr. Van der Laan",
+                    "start_char": 0,
+                    "end_char": 22,
+                    "tag": "persoon"
+                }
+            ]
+        },
+        {
+            "id": 103,
+            "text": "Ah. Van der Laan",
+            "annotations": [
+                {
+                    "text": "Van der Laan",
+                    "start_char": 4,
+                    "end_char": 16,
+                    "tag": "persoon"
+                }
+            ]
+        },
+        {
+            "id": 104,
+            "text": "J. Th. Bakker",
+            "annotations": [
+                {
+                    "text": "J. Th. Bakker",
+                    "start_char": 0,
+                    "end_char": 13,
+                    "tag": "persoon"
+                }
+            ]
+        },
+        {
+            "id": 105,
+            "text": "J. Th. A. Bakker",
+            "annotations": [
+                {
+                    "text": "J. Th. A. Bakker",
+                    "start_char": 0,
+                    "end_char": 16,
+                    "tag": "persoon"
+                }
+            ]
+        },
+        {
+            "id": 106,
+            "text": "Prof. Dr. Th. Bakker",
+            "annotations": [
+                {
+                    "text": "Prof. Dr. Th. Bakker",
+                    "start_char": 0,
+                    "end_char": 20,
+                    "tag": "persoon"
+                }
+            ]
+        },
+        {
+            "id": 107,
+            "text": "Prof. Dr. Th. Ir. Bakker",
+            "annotations": [
+                {
+                    "text": "Prof. Dr. Th",
+                    "start_char": 0,
+                    "end_char": 12,
+                    "tag": "persoon"
+                },
+                {
+                    "text": "Ir. Bakker",
+                    "start_char": 14,
+                    "end_char": 24,
+                    "tag": "persoon"
+                }
+            ]
         }
     ]
 }
diff --git a/tests/unit/test_annotator.py b/tests/unit/test_annotator.py
@@ -82,7 +82,12 @@ def test_match_is_initial(self):
         pattern_position = {"is_initial": True}
 
         assert _PatternPositionMatcher.match(pattern_position, token=token("A"))
+        assert _PatternPositionMatcher.match(pattern_position, token=token("Ch"))
+        assert _PatternPositionMatcher.match(pattern_position, token=token("Chr"))
+        assert _PatternPositionMatcher.match(pattern_position, token=token("Ph"))
+        assert _PatternPositionMatcher.match(pattern_position, token=token("Th"))
         assert not _PatternPositionMatcher.match(pattern_position, token=token("a"))
+        assert not _PatternPositionMatcher.match(pattern_position, token=token("Ah"))
         assert not _PatternPositionMatcher.match(pattern_position, token=token("Abcd"))
 
     def test_match_like_name(self):