Merge commit 8d09277 (2 parents: 199bedd + 37fd838)
.gitignore
@@ -124,4 +124,6 @@ ENV/
 # mypy
 .mypy_cache/
 
-.idea
+# ide
+.idea
+.vscode
CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 3.0.2 (2024-02-15)
+
+### Changed
+- recognize 4+ spaces as a token, blocking annotations
+
 ## 3.0.1 (2023-12-20)
 
 ### Fixed
deduce/tokenizer.py
@@ -3,7 +3,7 @@
 import docdeid as dd
 import regex
 
-_TOKENIZER_PATTERN = regex.compile(r"\w+|[\n\r\t]|.(?<! )", flags=regex.I | regex.M)
+_TOKENIZER_PATTERN = regex.compile(r"\w+|[\n\r\t]| {4,}|[^ ]", flags=regex.I | regex.M)
 
 
 class DeduceTokenizer(dd.tokenizer.Tokenizer):  # pylint: disable=R0903
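
The pattern change above is the substance of this release: the old pattern matched any single non-space character via a lookbehind, while the new one uses a plain [^ ] and adds the ` {4,}` alternative so that a run of four or more spaces becomes a single token, which is what "blocking annotations" in the CHANGELOG refers to. A minimal sketch of the new pattern's behaviour, assuming only the published `regex` package; the sample string and print call are illustrative and not part of this commit:

    # Sketch: how the updated tokenizer pattern handles a run of 4+ spaces.
    import regex

    _TOKENIZER_PATTERN = regex.compile(r"\w+|[\n\r\t]| {4,}|[^ ]", flags=regex.I | regex.M)

    # The five-space run is kept as one token; single spaces match nothing
    # and are simply skipped between word tokens.
    print(_TOKENIZER_PATTERN.findall("Zee     Bergen Op Zoom"))
    # ['Zee', '     ', 'Bergen', 'Op', 'Zoom']
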
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "deduce"
-version = "3.0.1"
+version = "3.0.2"
 description = "Deduce: de-identification method for Dutch medical text"
 authors = ["Vincent Menger <vmenger@protonmail.com>"]
 maintainers = ["Vincent Menger <vmenger@protonmail.com>"]
tests/unit/test_tokenizer.py
@@ -47,6 +47,22 @@ def test_split_nonalpha(self):
 
         assert tokenizer._split_text(text=text) == expected_tokens
 
+    def test_split_multiple_spaces(self):
+        tokenizer = DeduceTokenizer()
+        text = "Pieter van der Zee     Bergen Op  Zoom"
+        expected_tokens = [
+            dd.Token(text="Pieter", start_char=0, end_char=6),
+            dd.Token(text="van", start_char=7, end_char=10),
+            dd.Token(text="der", start_char=11, end_char=14),
+            dd.Token(text="Zee", start_char=15, end_char=18),
+            dd.Token(text="     ", start_char=18, end_char=23),
+            dd.Token(text="Bergen", start_char=23, end_char=29),
+            dd.Token(text="Op", start_char=30, end_char=32),
+            dd.Token(text="Zoom", start_char=34, end_char=38),
+        ]
+
+        assert tokenizer._split_text(text=text) == expected_tokens
+
     def test_split_newline(self):
         tokenizer = DeduceTokenizer()
         text = "regel 1 \n gevolgd door regel 2"