@@ -52,35 +52,55 @@ defmodule String.Unicode do
52
52
end
53
53
end
54
54
55
# Handle Regional
# Generates one next_grapheme_size/1 clause per codepoint listed under
# cluster["Regional_Indicator"]. The matched codepoint's byte length seeds the
# running size that next_regional_size/2 keeps accumulating.
for indicator <- cluster["Regional_Indicator"] do
  width = byte_size(indicator)

  def next_grapheme_size(<<unquote(indicator), tail::binary>>),
    do: next_regional_size(tail, unquote(width))
end
61
+
55
62
# Handle Hangul L
# One clause per codepoint in cluster["L"]; the rest of the Hangul sequence is
# consumed by next_hangul_l_size/2, starting from this codepoint's byte length.
for jamo <- cluster["L"] do
  width = byte_size(jamo)

  def next_grapheme_size(<<unquote(jamo), tail::binary>>),
    do: next_hangul_l_size(tail, unquote(width))
end
61
68
69
# Handle Hangul V
# Codepoints from the "LV" and "V" clusters both enter the Hangul handling at
# next_hangul_v_size/2, with the matched codepoint's byte length as the size.
for syllable <- cluster["LV"] ++ cluster["V"] do
  width = byte_size(syllable)

  def next_grapheme_size(<<unquote(syllable), tail::binary>>),
    do: next_hangul_v_size(tail, unquote(width))
end
75
+
62
76
# Handle Hangul T
# Codepoints from the "LVT" and "T" clusters both enter the Hangul handling at
# next_hangul_t_size/2, with the matched codepoint's byte length as the size.
#
# The lookup key must be exactly "T" (no padding inside the string), matching
# the key used by the next_hangul/2 clauses; a padded key would not match the
# "T" entry and the `++` below would not receive a proper list.
for codepoint <- cluster["LVT"] ++ cluster["T"] do
  def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
    next_hangul_t_size(rest, unquote(byte_size(codepoint)))
  end
end
68
82
69
# Handle E_Base
# Codepoints from the "E_Base" and "E_Base_GAZ" clusters start a grapheme whose
# extension is tracked with the :e_base marker, which is what allows a
# following E_Modifier codepoint to be absorbed by next_extend_size/3.
#
# The lookup key must be exactly "E_Base_GAZ" (no trailing space), matching the
# key used by the next_extend_size/3 clauses; a padded key would not match that
# entry and the `++` below would not receive a proper list.
for codepoint <- cluster["E_Base"] ++ cluster["E_Base_GAZ"] do
  def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
    next_extend_size(rest, unquote(byte_size(codepoint)), :e_base)
  end
end
75
89
76
- # Handle extended entries
90
# Handle ZWJ
# A grapheme that opens with a codepoint from cluster["ZWJ"] is extended with
# the :zwj marker, so the :zwj-only clauses of next_extend_size/3 can apply to
# whatever follows.
for joiner <- cluster["ZWJ"] do
  width = byte_size(joiner)

  def next_grapheme_size(<<unquote(joiner), tail::binary>>),
    do: next_extend_size(tail, unquote(width), :zwj)
end
77
96
97
# Handle extended entries
# Generic clause for any remaining valid UTF-8 codepoint: work out how many
# bytes the codepoint occupies from its value, then let next_extend_size/3
# absorb what follows, starting with the neutral :other marker.
def next_grapheme_size(<<cp::utf8, rest::binary>>) do
  width =
    cond do
      cp <= 0x007F -> 1
      cp <= 0x07FF -> 2
      cp <= 0xFFFF -> 3
      true -> 4
    end

  next_extend_size(rest, width, :other)
end
86
106
@@ -92,82 +112,139 @@ defmodule String.Unicode do
92
112
nil
93
113
end
94
114
95
# Handle hanguls
# State after an L codepoint. next_hangul/2 classifies the next codepoint:
#   :l         -> stay in the L state
#   :v or :lv  -> move on to the V state
#   :lvt       -> move on to the T state
# Anything else (including the `false` fallback) leaves the input untouched and
# hands the original `rest`/`size` to next_extend_size/3.
defp next_hangul_l_size(rest, size) do
  case next_hangul(rest, size) do
    {:l, tail, total} -> next_hangul_l_size(tail, total)
    {kind, tail, total} when kind in [:v, :lv] -> next_hangul_v_size(tail, total)
    {:lvt, tail, total} -> next_hangul_t_size(tail, total)
    _ -> next_extend_size(rest, size, :other)
  end
end
101
125
102
# State after a V (or LV) codepoint: another :v keeps us here, a :t moves on to
# the T state. Any other classification leaves the input untouched and hands
# the original `rest`/`size` to next_extend_size/3.
defp next_hangul_v_size(rest, size) do
  case next_hangul(rest, size) do
    {:v, tail, total} -> next_hangul_v_size(tail, total)
    {:t, tail, total} -> next_hangul_t_size(tail, total)
    _ -> next_extend_size(rest, size, :other)
  end
end
107
133
108
# State after a T (or LVT) codepoint: only further :t codepoints are consumed.
# Any other classification leaves the input untouched and hands the original
# `rest`/`size` to next_extend_size/3.
defp next_hangul_t_size(rest, size) do
  case next_hangul(rest, size) do
    {:t, tail, total} -> next_hangul_t_size(tail, total)
    _ -> next_extend_size(rest, size, :other)
  end
end
113
140
114
# Classifies the codepoint at the head of the binary against the Hangul
# clusters. On a match it returns {tag, remaining_binary, size + byte_length},
# where tag names the cluster the codepoint came from; otherwise it returns
# false.
#
# Clauses are generated cluster by cluster in the order L, V, T, LV, LVT.
for {property, tag} <- [{"L", :l}, {"V", :v}, {"T", :t}, {"LV", :lv}, {"LVT", :lvt}],
    codepoint <- cluster[property] do
  width = byte_size(codepoint)

  defp next_hangul(<<unquote(codepoint), tail::binary>>, size) do
    {unquote(tag), tail, size + unquote(width)}
  end
end

# Not a Hangul codepoint (or nothing left to read).
defp next_hangul(_, _), do: false
139
174
140
175
# Handle regional
# Called after one Regional_Indicator codepoint has been consumed. At most one
# more Regional_Indicator is absorbed here (this function does not recurse into
# itself); either way the grapheme is then finished by next_extend_size/3.
for indicator <- cluster["Regional_Indicator"] do
  width = byte_size(indicator)

  defp next_regional_size(<<unquote(indicator), tail::binary>>, size),
    do: next_extend_size(tail, size + unquote(width), :other)
end

defp next_regional_size(rest, size), do: next_extend_size(rest, size, :other)
184
+
185
# Handle Extend+SpacingMark+ZWJ
# Absorbs trailing codepoints into the current grapheme. The third argument is
# a marker describing what was seen so far (:e_base, :zwj or :other); some
# codepoints are only absorbed under a specific marker. Returns
# {size, remaining_binary} once nothing more can be absorbed.

# Extend: absorbed under any marker; an :e_base marker is carried across it
# (see keep_ebase/1), every other marker is reset to :other.
for extender <- cluster["Extend"] do
  width = byte_size(extender)

  defp next_extend_size(<<unquote(extender), tail::binary>>, size, marker),
    do: next_extend_size(tail, size + unquote(width), keep_ebase(marker))
end

# SpacingMark: absorbed under any marker, which is then reset to :other.
for mark <- cluster["SpacingMark"] do
  width = byte_size(mark)

  defp next_extend_size(<<unquote(mark), tail::binary>>, size, _marker),
    do: next_extend_size(tail, size + unquote(width), :other)
end

# ZWJ: absorbed under any marker, which then becomes :zwj.
for joiner <- cluster["ZWJ"] do
  width = byte_size(joiner)

  defp next_extend_size(<<unquote(joiner), tail::binary>>, size, _marker),
    do: next_extend_size(tail, size + unquote(width), :zwj)
end

# Marker-dependent clauses, as {cluster, required marker, marker afterwards}:
# E_Modifier only after :e_base; Glue_After_Zwj and E_Base_GAZ only after :zwj.
for {property, required, following} <- [
      {"E_Modifier", :e_base, :other},
      {"Glue_After_Zwj", :zwj, :other},
      {"E_Base_GAZ", :zwj, :e_base}
    ],
    codepoint <- cluster[property] do
  width = byte_size(codepoint)

  defp next_extend_size(<<unquote(codepoint), tail::binary>>, size, unquote(required)),
    do: next_extend_size(tail, size + unquote(width), unquote(following))
end

# Nothing else to absorb: the grapheme ends here.
defp next_extend_size(rest, size, _), do: {size, rest}
161
225
226
# Carries the :e_base marker across an Extend codepoint; any other marker
# collapses to :other.
defp keep_ebase(marker) do
  case marker do
    :e_base -> :e_base
    _ -> :other
  end
end
228
+
162
229
# Handle Prepend
# Keeps absorbing codepoints from cluster["Prepend"], adding each one's byte
# length to the running size.
for prefix <- cluster["Prepend"] do
  width = byte_size(prefix)

  defp next_prepend_size(<<unquote(prefix), tail::binary>>, size),
    do: next_prepend_size(tail, size + unquote(width))
end

# However, if we see a control character, we have to break it: a CR, LF or
# Control codepoint is left unconsumed and the grapheme ends with what was
# accumulated so far.
for breaker <- Enum.concat([cluster["CR"], cluster["LF"], cluster["Control"]]) do
  defp next_prepend_size(<<unquote(breaker), _::binary>> = rest, size), do: {size, rest}
end

# Otherwise the prepend run is glued onto whatever grapheme comes next: its
# size is added to ours. When next_grapheme_size/1 returns nil, the accumulated
# size is returned together with the unchanged `rest`.
defp next_prepend_size(rest, size) do
  case next_grapheme_size(rest) do
    nil -> {size, rest}
    {extra, tail} -> {size + extra, tail}
  end
end
172
249
173
250
# Graphemes
0 commit comments