Skip to content

Commit e002ac5

Browse files
author
José Valim
committed
Incorporate new grapheme rules in Unicode 9
Signed-off-by: José Valim <[email protected]>
1 parent fb89d55 commit e002ac5

File tree

2 files changed

+121
-43
lines changed

2 files changed

+121
-43
lines changed

lib/elixir/test/elixir/string_test.exs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,8 @@ defmodule StringTest do
449449
# CRLF
450450
assert String.graphemes("\r\n\f") == ["\r\n", "\f"]
451451
# Regional indicator
452-
assert String.graphemes("\u{1F1E6}\u{1F1E7}\u{1F1E8}") == ["\u{1F1E6}\u{1F1E7}\u{1F1E8}"]
452+
assert String.graphemes("\u{1F1E6}\u{1F1E7}") == ["\u{1F1E6}\u{1F1E7}"]
453+
assert String.graphemes("\u{1F1E6}\u{1F1E7}\u{1F1E8}") == ["\u{1F1E6}\u{1F1E7}", "\u{1F1E8}"]
453454
# Hangul
454455
assert String.graphemes("\u1100\u115D\uB4A4") == ["ᄀᅝ뒤"]
455456
# Special Marking with Extended

lib/elixir/unicode/unicode.ex

Lines changed: 119 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -52,35 +52,55 @@ defmodule String.Unicode do
5252
end
5353
end
5454

55+
# Handle Regional
56+
for codepoint <- cluster["Regional_Indicator"] do
57+
def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
58+
next_regional_size(rest, unquote(byte_size(codepoint)))
59+
end
60+
end
61+
5562
# Handle Hangul L
5663
for codepoint <- cluster["L"] do
5764
def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
5865
next_hangul_l_size(rest, unquote(byte_size(codepoint)))
5966
end
6067
end
6168

69+
# Handle Hangul V
70+
for codepoint <- cluster["LV"] ++ cluster["V"] do
71+
def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
72+
next_hangul_v_size(rest, unquote(byte_size(codepoint)))
73+
end
74+
end
75+
6276
# Handle Hangul T
63-
for codepoint <- cluster["T"] do
77+
for codepoint <- cluster["LVT"] ++ cluster["T"] do
6478
def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
6579
next_hangul_t_size(rest, unquote(byte_size(codepoint)))
6680
end
6781
end
6882

69-
# Handle Regional
70-
for codepoint <- cluster["Regional_Indicator"] do
83+
# Handle E_Base
84+
for codepoint <- cluster["E_Base"] ++ cluster["E_Base_GAZ"] do
7185
def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
72-
next_regional_size(rest, unquote(byte_size(codepoint)))
86+
next_extend_size(rest, unquote(byte_size(codepoint)), :e_base)
7387
end
7488
end
7589

76-
# Handle extended entries
90+
# Handle ZWJ
91+
for codepoint <- cluster["ZWJ"] do
92+
def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
93+
next_extend_size(rest, unquote(byte_size(codepoint)), :zwj)
94+
end
95+
end
7796

97+
# Handle extended entries
7898
def next_grapheme_size(<<cp::utf8, rest::binary>>) do
7999
case cp do
80-
x when x <= 0x007F -> next_extend_size(rest, 1)
81-
x when x <= 0x07FF -> next_extend_size(rest, 2)
82-
x when x <= 0xFFFF -> next_extend_size(rest, 3)
83-
_ -> next_extend_size(rest, 4)
100+
x when x <= 0x007F -> next_extend_size(rest, 1, :other)
101+
x when x <= 0x07FF -> next_extend_size(rest, 2, :other)
102+
x when x <= 0xFFFF -> next_extend_size(rest, 3, :other)
103+
_ -> next_extend_size(rest, 4, :other)
84104
end
85105
end
86106

@@ -92,82 +112,139 @@ defmodule String.Unicode do
92112
nil
93113
end
94114

95-
# Handle Hangul L
96-
for codepoint <- cluster["L"] do
97-
defp next_hangul_l_size(<<unquote(codepoint), rest::binary>>, size) do
98-
next_hangul_l_size(rest, size + unquote(byte_size(codepoint)))
115+
# Handle Hangul syllable sequences (L, V, LV, LVT, T)
116+
defp next_hangul_l_size(rest, size) do
117+
case next_hangul(rest, size) do
118+
{:l, rest, size} -> next_hangul_l_size(rest, size)
119+
{:v, rest, size} -> next_hangul_v_size(rest, size)
120+
{:lv, rest, size} -> next_hangul_v_size(rest, size)
121+
{:lvt, rest, size} -> next_hangul_t_size(rest, size)
122+
_ -> next_extend_size(rest, size, :other)
99123
end
100124
end
101125

102-
for codepoint <- cluster["LV"] do
103-
defp next_hangul_l_size(<<unquote(codepoint), rest::binary>>, size) do
104-
next_hangul_v_size(rest, size + unquote(byte_size(codepoint)))
126+
defp next_hangul_v_size(rest, size) do
127+
case next_hangul(rest, size) do
128+
{:v, rest, size} -> next_hangul_v_size(rest, size)
129+
{:t, rest, size} -> next_hangul_t_size(rest, size)
130+
_ -> next_extend_size(rest, size, :other)
105131
end
106132
end
107133

108-
for codepoint <- cluster["LVT"] do
109-
defp next_hangul_l_size(<<unquote(codepoint), rest::binary>>, size) do
110-
next_hangul_t_size(rest, size + unquote(byte_size(codepoint)))
134+
defp next_hangul_t_size(rest, size) do
135+
case next_hangul(rest, size) do
136+
{:t, rest, size} -> next_hangul_t_size(rest, size)
137+
_ -> next_extend_size(rest, size, :other)
111138
end
112139
end
113140

114-
defp next_hangul_l_size(rest, size) do
115-
next_hangul_v_size(rest, size)
141+
for codepoint <- cluster["L"] do
142+
defp next_hangul(<<unquote(codepoint), rest::binary>>, size) do
143+
{:l, rest, size + unquote(byte_size(codepoint))}
144+
end
116145
end
117146

118-
# Handle Hangul V
119147
for codepoint <- cluster["V"] do
120-
defp next_hangul_v_size(<<unquote(codepoint), rest::binary>>, size) do
121-
next_hangul_v_size(rest, size + unquote(byte_size(codepoint)))
148+
defp next_hangul(<<unquote(codepoint), rest::binary>>, size) do
149+
{:v, rest, size + unquote(byte_size(codepoint))}
122150
end
123151
end
124152

125-
defp next_hangul_v_size(rest, size) do
126-
next_hangul_t_size(rest, size)
153+
for codepoint <- cluster["T"] do
154+
defp next_hangul(<<unquote(codepoint), rest::binary>>, size) do
155+
{:t, rest, size + unquote(byte_size(codepoint))}
156+
end
127157
end
128158

129-
# Handle Hangul T
130-
for codepoint <- cluster["T"] do
131-
defp next_hangul_t_size(<<unquote(codepoint), rest::binary>>, size) do
132-
next_hangul_t_size(rest, size + unquote(byte_size(codepoint)))
159+
for codepoint <- cluster["LV"] do
160+
defp next_hangul(<<unquote(codepoint), rest::binary>>, size) do
161+
{:lv, rest, size + unquote(byte_size(codepoint))}
133162
end
134163
end
135164

136-
defp next_hangul_t_size(rest, size) do
137-
next_extend_size(rest, size)
165+
for codepoint <- cluster["LVT"] do
166+
defp next_hangul(<<unquote(codepoint), rest::binary>>, size) do
167+
{:lvt, rest, size + unquote(byte_size(codepoint))}
168+
end
169+
end
170+
171+
defp next_hangul(_, _) do
172+
false
138173
end
139174

140175
# Handle regional
141176
for codepoint <- cluster["Regional_Indicator"] do
142177
defp next_regional_size(<<unquote(codepoint), rest::binary>>, size) do
143-
next_regional_size(rest, size + unquote(byte_size(codepoint)))
178+
next_extend_size(rest, size + unquote(byte_size(codepoint)), :other)
144179
end
145180
end
146-
147181
defp next_regional_size(rest, size) do
148-
next_extend_size(rest, size)
182+
next_extend_size(rest, size, :other)
183+
end
184+
185+
# Handle Extend+SpacingMark+ZWJ
186+
for codepoint <- cluster["Extend"] do
187+
defp next_extend_size(<<unquote(codepoint), rest::binary>>, size, marker) do
188+
next_extend_size(rest, size + unquote(byte_size(codepoint)), keep_ebase(marker))
189+
end
149190
end
150191

151-
# Handle Extend+SpacingMark
152-
for codepoint <- cluster["Extend"] ++ cluster["SpacingMark"] do
153-
defp next_extend_size(<<unquote(codepoint), rest::binary>>, size) do
154-
next_extend_size(rest, size + unquote(byte_size(codepoint)))
192+
for codepoint <- cluster["SpacingMark"] do
193+
defp next_extend_size(<<unquote(codepoint), rest::binary>>, size, _marker) do
194+
next_extend_size(rest, size + unquote(byte_size(codepoint)), :other)
155195
end
156196
end
157197

158-
defp next_extend_size(rest, size) do
198+
for codepoint <- cluster["ZWJ"] do
199+
defp next_extend_size(<<unquote(codepoint), rest::binary>>, size, _marker) do
200+
next_extend_size(rest, size + unquote(byte_size(codepoint)), :zwj)
201+
end
202+
end
203+
204+
for codepoint <- cluster["E_Modifier"] do
205+
defp next_extend_size(<<unquote(codepoint), rest::binary>>, size, :e_base) do
206+
next_extend_size(rest, size + unquote(byte_size(codepoint)), :other)
207+
end
208+
end
209+
210+
for codepoint <- cluster["Glue_After_Zwj"] do
211+
defp next_extend_size(<<unquote(codepoint), rest::binary>>, size, :zwj) do
212+
next_extend_size(rest, size + unquote(byte_size(codepoint)), :other)
213+
end
214+
end
215+
216+
for codepoint <- cluster["E_Base_GAZ"] do
217+
defp next_extend_size(<<unquote(codepoint), rest::binary>>, size, :zwj) do
218+
next_extend_size(rest, size + unquote(byte_size(codepoint)), :e_base)
219+
end
220+
end
221+
222+
defp next_extend_size(rest, size, _) do
159223
{size, rest}
160224
end
161225

226+
defp keep_ebase(:e_base), do: :e_base
227+
defp keep_ebase(_), do: :other
228+
162229
# Handle Prepend
163230
for codepoint <- cluster["Prepend"] do
164231
defp next_prepend_size(<<unquote(codepoint), rest::binary>>, size) do
165232
next_prepend_size(rest, size + unquote(byte_size(codepoint)))
166233
end
167234
end
168-
235+
236+
# However, if we see a control character (CR, LF or Control), we have to break before it
237+
for codepoint <- cluster["CR"] ++ cluster["LF"] ++ cluster["Control"] do
238+
defp next_prepend_size(<<unquote(codepoint), _::binary>> = rest, size) do
239+
{size, rest}
240+
end
241+
end
242+
169243
defp next_prepend_size(rest, size) do
170-
{size, rest}
244+
case next_grapheme_size(rest) do
245+
{more, rest} -> {more + size, rest}
246+
nil -> {size, rest}
247+
end
171248
end
172249

173250
# Graphemes

0 commit comments

Comments
 (0)