
Commit 8d09277

Merge pull request #137 from vmenger/tab-spaces-as-token

Recognize multiple spaces as token

Parents: 199bedd + 37fd838

5 files changed, 26 insertions(+), 3 deletions(-)

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -124,4 +124,6 @@ ENV/
 # mypy
 .mypy_cache/
 
-.idea
+# ide
+.idea
+.vscode

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 3.0.2 (2024-02-15)
+
+### Changed
+- recognize 4+ spaces as a token, blocking annotations
+
 ## 3.0.1 (2023-12-20)
 
 ### Fixed
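
Note: "blocking annotations" refers to the downstream effect of the tokenizer change. Once a run of 4+ spaces is its own token, annotators that match patterns over adjacent tokens no longer bridge large whitespace gaps, such as the column gap between two unrelated names in a table-like layout. A sketch of how this surfaces through the public API; the example text is illustrative and the exact output depends on the installed lookup lists:

from deduce import Deduce

deduce = Deduce()

# The 5-space gap between "Zee" and "Bergen" now tokenizes as a separate
# token, so name annotators will not join the two sides into one annotation.
doc = deduce.deidentify("Pieter van der Zee     Bergen Op Zoom")
print(doc.deidentified_text)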

deduce/tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 import docdeid as dd
 import regex
 
-_TOKENIZER_PATTERN = regex.compile(r"\w+|[\n\r\t]|.(?<! )", flags=regex.I | regex.M)
+_TOKENIZER_PATTERN = regex.compile(r"\w+|[\n\r\t]| {4,}|[^ ]", flags=regex.I | regex.M)
 
 
 class DeduceTokenizer(dd.tokenizer.Tokenizer):  # pylint: disable=R0903
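
The behavioral difference is easiest to see by running both patterns side by side. A minimal sketch, assuming only the third-party regex package from the diff above; the sample string is illustrative:

import regex

# Old pattern: ".(?<! )" matches any single non-space character, so runs
# of spaces were silently skipped and never appeared as tokens.
OLD_PATTERN = regex.compile(r"\w+|[\n\r\t]|.(?<! )", flags=regex.I | regex.M)

# New pattern: " {4,}" emits a run of 4 or more spaces as a single token;
# runs of 1-3 spaces are still skipped, because "[^ ]" excludes spaces.
NEW_PATTERN = regex.compile(r"\w+|[\n\r\t]| {4,}|[^ ]", flags=regex.I | regex.M)

text = "Zee     Bergen Op  Zoom"  # 5 spaces, then 1 space, then 2 spaces

print(OLD_PATTERN.findall(text))  # ['Zee', 'Bergen', 'Op', 'Zoom']
print(NEW_PATTERN.findall(text))  # ['Zee', '     ', 'Bergen', 'Op', 'Zoom']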

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "deduce"
-version = "3.0.1"
+version = "3.0.2"
 description = "Deduce: de-identification method for Dutch medical text"
 authors = ["Vincent Menger <vmenger@protonmail.com>"]
 maintainers = ["Vincent Menger <vmenger@protonmail.com>"]

tests/unit/test_tokenizer.py

Lines changed: 16 additions & 0 deletions
@@ -47,6 +47,22 @@ def test_split_nonalpha(self):
 
         assert tokenizer._split_text(text=text) == expected_tokens
 
+    def test_split_multiple_spaces(self):
+        tokenizer = DeduceTokenizer()
+        text = "Pieter van der Zee     Bergen Op  Zoom"
+        expected_tokens = [
+            dd.Token(text="Pieter", start_char=0, end_char=6),
+            dd.Token(text="van", start_char=7, end_char=10),
+            dd.Token(text="der", start_char=11, end_char=14),
+            dd.Token(text="Zee", start_char=15, end_char=18),
+            dd.Token(text="     ", start_char=18, end_char=23),
+            dd.Token(text="Bergen", start_char=23, end_char=29),
+            dd.Token(text="Op", start_char=30, end_char=32),
+            dd.Token(text="Zoom", start_char=34, end_char=38),
+        ]
+
+        assert tokenizer._split_text(text=text) == expected_tokens
+
     def test_split_newline(self):
         tokenizer = DeduceTokenizer()
         text = "regel 1 \n gevolgd door regel 2"
