Skip to content
This repository was archived by the owner on Mar 9, 2023. It is now read-only.

Commit 1a6649b

Browse files
author
Sorami Hisamoto
authored
Fix a bug causing … to be converted to "", "", "…" (#121)
* Change a variable name to modified_to_original, to make it align with the original Java implementation * Fix a bug causing … to be converted to "", "", "…" * Fix tests according to the new replace method * Fix comment format
1 parent 80cdf94 commit 1a6649b

File tree

4 files changed

+32
-19
lines changed

4 files changed

+32
-19
lines changed

sudachipy/utf8inputtextbuilder.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __init__(self, text, grammar):
2424
self.grammar = grammar
2525
self.original_text = text
2626
self.modified_text = text
27-
self.text_offsets = list(range(len(self.original_text) + 1))
27+
self.modified_to_original = list(range(len(self.original_text) + 1))
2828
# 注: サロゲートペア文字は考慮していない
2929

3030
def replace(self, begin, end, str_):
@@ -42,15 +42,17 @@ def replace(self, begin, end, str_):
4242

4343
self.modified_text = str_.join([self.modified_text[:begin], self.modified_text[end:]])
4444

45-
offset = self.text_offsets[begin]
45+
modified_begin = self.modified_to_original[begin]
46+
modified_end = self.modified_to_original[end]
4647
length = len(str_)
4748
if end - begin > length:
48-
del self.text_offsets[begin + length:end]
49-
for i in range(length):
49+
del self.modified_to_original[begin + length:end]
50+
self.modified_to_original[begin] = modified_begin
51+
for i in range(1, length):
5052
if begin + i < end:
51-
self.text_offsets[begin + i] = offset
53+
self.modified_to_original[begin + i] = modified_end
5254
else:
53-
self.text_offsets.insert(begin + i, offset)
55+
self.modified_to_original.insert(begin + i, modified_end)
5456

5557
def get_original_text(self):
5658
return self.original_text
@@ -70,10 +72,10 @@ def build(self):
7072
# 注: サロゲートペア文字は考慮していない
7173
for _ in range(self.utf8_byte_length(ord(self.modified_text[i]))):
7274
byte_indexes[j] = i
73-
offsets[j] = self.text_offsets[i]
75+
offsets[j] = self.modified_to_original[i]
7476
j += 1
7577
byte_indexes[length] = len(modified_string_text)
76-
offsets[length] = self.text_offsets[-1]
78+
offsets[length] = self.modified_to_original[-1]
7779

7880
char_categories = self.get_char_category_types(modified_string_text)
7981
char_category_continuities = self.get_char_category_continuities(modified_string_text, length, char_categories)

tests/plugin/test_default_input_text_plugin.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def test_before_rewrite(self):
6060
self.assertEqual(9, text.get_original_index(24))
6161
self.assertEqual(9, text.get_original_index(26))
6262

63-
def test_after_write(self):
63+
def test_after_rewrite(self):
6464
self.assertEqual(self.original_text, self.builder.get_original_text())
6565
self.assertEqual(self.original_text, self.builder.get_text())
6666
self.plugin.rewrite(self.builder)
@@ -76,7 +76,8 @@ def test_after_write(self):
7676
self.assertEqual(1, text.get_original_index(2))
7777
self.assertEqual(2, text.get_original_index(3))
7878
self.assertEqual(4, text.get_original_index(7))
79-
self.assertEqual(4, text.get_original_index(11))
79+
self.assertEqual(5, text.get_original_index(8))
80+
self.assertEqual(5, text.get_original_index(11))
8081
self.assertEqual(7, text.get_original_index(15))
8182
self.assertEqual(7, text.get_original_index(17))
8283

tests/test_tokenizer.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,16 @@ def test_tokenize_kanji_alphabet_word(self):
7070
self.assertEqual(len(self.tokenizer_obj.tokenize('ab')), 1)
7171
self.assertEqual(len(self.tokenizer_obj.tokenize('特ab')), 2)
7272

73+
def test_tokenizer_with_dots(self):
74+
ms = self.tokenizer_obj.tokenize('京都…')
75+
self.assertEqual(4, ms.size())
76+
self.assertEqual(ms[1].surface(), '…')
77+
self.assertEqual(ms[1].normalized_form(), '.')
78+
self.assertEqual(ms[2].surface(), '')
79+
self.assertEqual(ms[2].normalized_form(), '.')
80+
self.assertEqual(ms[3].surface(), '')
81+
self.assertEqual(ms[3].normalized_form(), '.')
82+
7383

7484
if __name__ == '__main__':
7585
unittest.main()

tests/test_utf8inputtext.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,8 @@ def test_replace_with_same_length(self):
117117
self.assertEqual(input_.get_original_index(12), 7)
118118
self.assertEqual(input_.get_original_index(13), 8)
119119
self.assertEqual(input_.get_original_index(15), 8)
120-
self.assertEqual(input_.get_original_index(16), 8)
121-
self.assertEqual(input_.get_original_index(18), 8)
120+
self.assertEqual(input_.get_original_index(16), 10)
121+
self.assertEqual(input_.get_original_index(18), 10)
122122
self.assertEqual(input_.get_original_index(19), 10)
123123
self.assertEqual(input_.get_original_index(22), 10)
124124
self.assertEqual(input_.get_original_index(31), 13)
@@ -147,13 +147,13 @@ def test_replaceWithInsertion(self):
147147
self.assertEqual(input_.get_original_text(), self.TEXT)
148148
self.assertEqual(input_.get_text(), "âbC1あ234あああ𡈽アゴ")
149149
self.assertEqual(len(input_.get_byte_text()), 35)
150-
self.assertEqual(input_.get_original_index(0), 0)
151-
self.assertEqual(input_.get_original_index(12), 7)
152-
self.assertEqual(input_.get_original_index(13), 8)
153-
self.assertEqual(input_.get_original_index(21), 8)
154-
self.assertEqual(input_.get_original_index(22), 10)
155-
self.assertEqual(input_.get_original_index(25), 10)
156-
self.assertEqual(input_.get_original_index(35), 14)
150+
self.assertEqual(input_.get_original_index(0), 0) # â
151+
self.assertEqual(input_.get_original_index(12), 7) # 4
152+
self.assertEqual(input_.get_original_index(13), 8) # >あ< ああ
153+
self.assertEqual(input_.get_original_index(21), 10) # ああ >あ<
154+
self.assertEqual(input_.get_original_index(22), 10) # 𡈽
155+
self.assertEqual(input_.get_original_index(25), 10) # 𡈽
156+
self.assertEqual(input_.get_original_index(35), 14) # ゙
157157

158158
def test_replaceMultiTimes(self):
159159
self.builder.replace(0, 1, "a")

0 commit comments

Comments
 (0)