Commit 2d71545

Revert "Convert custom user_data to token extension format for Japanese tokenizer (#5652)" (#5665)
This reverts commit 1dd3819.
1 parent 1dd3819 · commit 2d71545

File tree

2 files changed: +15 -35 lines changed


spacy/lang/ja/__init__.py

Lines changed: 7 additions & 6 deletions
@@ -145,7 +145,8 @@ def __call__(self, text):
         dtokens, spaces = get_dtokens_and_spaces(dtokens, text)
 
         # create Doc with tag bi-gram based part-of-speech identification rules
-        words = [dtoken.surface for dtoken in dtokens]
+        words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6
+        sub_tokens_list = list(sub_tokens_list)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         next_pos = None  # for bi-gram rules
         for idx, (token, dtoken) in enumerate(zip(doc, dtokens)):
@@ -157,14 +158,14 @@ def __call__(self, text):
                 token.pos, next_pos = resolve_pos(
                     token.orth_,
                     dtoken.tag,
-                    dtokens[idx + 1].tag if idx + 1 < len(dtokens) else None
+                    tags[idx + 1] if idx + 1 < len(tags) else None
                 )
             # if there's no lemma info (it's an unk) just use the surface
             token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
-            doc.user_data[('._.', 'inflection', token.idx, None)] = dtoken.inf
-            doc.user_data[('._.', 'reading_form', token.idx, None)] = dtoken.reading
-            doc.user_data[('._.', 'sub_tokens', token.idx, None)] = dtoken.sub_tokens
-            doc.user_data[('._.', 'lemma', token.idx, None)] = token.lemma_
+
+        doc.user_data["inflections"] = inflections
+        doc.user_data["reading_forms"] = readings
+        doc.user_data["sub_tokens"] = sub_tokens_list
 
         return doc
 
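The restored line works by transposing the list of DetailedToken entries into parallel per-field tuples. Below is a small standalone sketch of that unpacking pattern; the DetailedToken field order is not shown in this diff and is inferred from the assignment and the dtoken.* attribute accesses above, so treat it as an assumption. The sample strings are taken from the test data further down.

# Standalone illustration of the unpacking pattern restored above.
# The DetailedToken field order (surface, tag, inf, lemma, reading, sub_tokens)
# is an assumption inferred from this diff, not taken from spaCy itself.
from collections import namedtuple

DetailedToken = namedtuple(
    "DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]
)

dtokens = [
    DetailedToken("取っ", "動詞-一般", "五段-ラ行,連用形-促音便", "取る", "トッ", None),
    DetailedToken("て", "助詞-接続助詞", "", "て", "テ", None),
]

# zip(*dtokens) transposes the list of namedtuples into one tuple per field;
# the "[[]] * 6" fallback keeps the 6-way unpacking valid when dtokens is empty.
words, tags, inflections, lemmas, readings, sub_tokens_list = (
    zip(*dtokens) if dtokens else [[]] * 6
)
assert words == ("取っ", "て")
assert inflections == ("五段-ラ行,連用形-促音便", "")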

spacy/tests/lang/ja/test_tokenizer.py

Lines changed: 8 additions & 29 deletions
@@ -5,18 +5,6 @@
 
 from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
 from spacy.lang.ja import Japanese, DetailedToken
-from spacy.tokens import Token
-from spacy.tokens.underscore import Underscore
-
-
-@pytest.fixture(scope="function", autouse=True)
-def clean_underscore():
-    # reset the Underscore object after the test, to avoid having state copied across tests
-    yield
-    Underscore.doc_extensions = {}
-    Underscore.span_extensions = {}
-    Underscore.token_extensions = {}
-
 
 # fmt: off
 TOKENIZER_TESTS = [
@@ -139,33 +127,24 @@ def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_toke
     nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
     nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
 
-    doc = ja_tokenizer(text)
-    doc_a = nlp_a(text)
-    doc_b = nlp_b(text)
-    doc_c = nlp_c(text)
-
-    Token.set_extension("sub_tokens", default="")
-    assert [t._.sub_tokens for t in doc] == sub_tokens_list_a
-    assert [t._.sub_tokens for t in doc_a] == sub_tokens_list_a
-    assert [t._.sub_tokens for t in doc_b] == sub_tokens_list_b
-    assert [t._.sub_tokens for t in doc_c] == sub_tokens_list_c
+    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
+    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
 
 
 @pytest.mark.parametrize("text,inflections,reading_forms",
     [
         (
             "取ってつけた",
-            ["五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"],
-            ["トッ", "テ", "ツケ", "タ"],
+            ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
+            ("トッ", "テ", "ツケ", "タ"),
         ),
     ]
 )
 def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms):
-    Token.set_extension("inflection", default="")
-    Token.set_extension("reading_form", default="")
-    doc = ja_tokenizer(text)
-    assert [t._.inflection for t in doc] == inflections
-    assert [t._.reading_form for t in doc] == reading_forms
+    assert ja_tokenizer(text).user_data["inflections"] == inflections
+    assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
 
 
 def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
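As the updated tests show, after this revert the extra morphological information is read straight from doc-level user_data entries rather than through Token._. extensions. A minimal usage sketch of the same access pattern outside the test suite, assuming the spaCy version containing this commit with SudachiPy installed so the Japanese tokenizer can load:

# Sketch: reading the doc-level user_data entries restored by this revert.
# Assumes a spaCy build containing this commit and SudachiPy available.
from spacy.lang.ja import Japanese

nlp = Japanese()
doc = nlp("取ってつけた")

# Each value is a sequence aligned one-to-one with doc's tokens (see __call__ above).
print(doc.user_data["inflections"])    # inflection strings per token
print(doc.user_data["reading_forms"])  # e.g. ("トッ", "テ", "ツケ", "タ") per the test data
print(doc.user_data["sub_tokens"])     # per-token sub-token lists (None when not split)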
