|
@@ -5,18 +5,6 @@
 
 from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
 from spacy.lang.ja import Japanese, DetailedToken
-from spacy.tokens import Token
-from spacy.tokens.underscore import Underscore
-
-
-@pytest.fixture(scope="function", autouse=True)
-def clean_underscore():
-    # reset the Underscore object after the test, to avoid having state copied across tests
-    yield
-    Underscore.doc_extensions = {}
-    Underscore.span_extensions = {}
-    Underscore.token_extensions = {}
-
 
 # fmt: off
 TOKENIZER_TESTS = [
@@ -139,33 +127,24 @@ def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_toke |
     nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
     nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
 
-    doc = ja_tokenizer(text)
-    doc_a = nlp_a(text)
-    doc_b = nlp_b(text)
-    doc_c = nlp_c(text)
-
-    Token.set_extension("sub_tokens", default="")
-    assert [t._.sub_tokens for t in doc] == sub_tokens_list_a
-    assert [t._.sub_tokens for t in doc_a] == sub_tokens_list_a
-    assert [t._.sub_tokens for t in doc_b] == sub_tokens_list_b
-    assert [t._.sub_tokens for t in doc_c] == sub_tokens_list_c
+    assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a
+    assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b
+    assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c
 
 
 @pytest.mark.parametrize("text,inflections,reading_forms",
     [
         (
             "取ってつけた",
-            ["五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"],
-            ["トッ", "テ", "ツケ", "タ"],
+            ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"),
+            ("トッ", "テ", "ツケ", "タ"),
         ),
     ]
 )
 def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms):
-    Token.set_extension("inflection", default="")
-    Token.set_extension("reading_form", default="")
-    doc = ja_tokenizer(text)
-    assert [t._.inflection for t in doc] == inflections
-    assert [t._.reading_form for t in doc] == reading_forms
+    assert ja_tokenizer(text).user_data["inflections"] == inflections
+    assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms
 
 
 def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
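
For reference, a minimal sketch of how the reworked interface might be consumed outside the test suite, assuming a spaCy v2.x environment with SudachiPy installed (the Japanese tokenizer's runtime dependency). Per the assertions above, the tokenizer now stores its extra analyses on each Doc's user_data dict instead of registering Token._ extensions:

    # Sketch only: the keys ("sub_tokens", "inflections", "reading_forms")
    # are the ones the tests above assert on, not a documented API.
    from spacy.lang.ja import Japanese

    # split_mode mirrors the tokenizer config exercised in the tests (A/B/C).
    nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
    doc = nlp("取ってつけた")

    # Each stored tuple is aligned with the tokens in doc, so it zips cleanly.
    for token, reading in zip(doc, doc.user_data["reading_forms"]):
        print(token.text, reading)

Keeping this state on the Doc is what makes the removed clean_underscore fixture unnecessary: there is no longer a global Token.set_extension registry that has to be reset between tests.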
|