@@ -174,3 +174,179 @@ defmodule String.Unicode do
174
174
[ ]
175
175
end
176
176
end
177
+
178
+ defmodule String.Graphemes do
179
+ @ moduledoc false
180
+
181
+ cluster_path = Path . join ( __DIR__ , "GraphemeBreakProperty.txt" )
182
+ regex = % r / ( ?: ^ ( [ 0 - 9 A- F ] + ) ( ?: \.\. ( [ 0 - 9 A- F ] +) ) ?) \s+; \s( \w+) /m
183
+
184
+ to_range = fn
185
+ first , "" ->
186
+ [ << binary_to_integer ( first , 16 ) :: utf8 >> ]
187
+ first , last ->
188
+ range = binary_to_integer ( first , 16 ) .. binary_to_integer ( last , 16 )
189
+ Enum . map ( range , fn ( int ) -> << int :: utf8 >> end )
190
+ end
191
+
192
+ cluster = Enum . reduce File . stream! ( cluster_path ) , HashDict . new , fn ( line , dict ) ->
193
+ [ _full , first , last , class ] = Regex . run ( regex , line )
194
+
195
+ # Skip surrogates
196
+ if first == "D800" and last == "DFFF" do
197
+ dict
198
+ else
199
+ list = to_range . ( first , last )
200
+ Dict . update ( dict , class , list , & ( & 1 ++ list ) )
201
+ end
202
+ end
203
+
204
+ # There is no codepoint marked as Prepend by Unicode 6.3.0
205
+ if cluster [ "Prepend" ] do
206
+ raise "It seems this new unicode version has added Prepend items. " <>
207
+ "Please remove this error and uncomment the code below."
208
+ end
209
+
210
+ # Don't break CRLF
211
+ def next_grapheme( << ?\n , ?\r , rest :: binary > > ) do
212
+ { "\n \r " , rest }
213
+ end
214
+
215
+ # Break on control
216
+ lc codepoint inlist cluster [ "CR" ] ++ cluster [ "LF" ] ++ cluster [ "Control" ] do
217
+ def next_grapheme ( << unquote ( codepoint ) , rest :: binary >> = string ) do
218
+ { :binary . part ( string , 0 , unquote ( size ( codepoint ) ) ) , rest }
219
+ end
220
+ end
221
+
222
+ # Break on Prepend*
223
+ # lc codepoint inlist cluster["Prepend"] do
224
+ # def next_grapheme(<< unquote(codepoint), rest :: binary >> = string) do
225
+ # next_prepend(rest, string, unquote(size(codepoint)))
226
+ # end
227
+ # end
228
+
229
+ # Handle Hangul L
230
+ lc codepoint inlist cluster[ "L" ] do
231
+ def next_grapheme( << unquote( codepoint) , rest :: binary > > = string) do
232
+ next_hangul_l ( rest , string , unquote ( size ( codepoint ) ) )
233
+ end
234
+ end
235
+
236
+ # Handle Hangul T
237
+ lc codepoint inlist cluster [ "T" ] do
238
+ def next_grapheme ( << unquote ( codepoint ) , rest :: binary >> = string ) do
239
+ next_hangul_t ( rest , string , unquote ( size ( codepoint ) ) )
240
+ end
241
+ end
242
+
243
+ # Handle Regional
244
+ lc codepoint inlist cluster[ "Regional_Indicator" ] do
245
+ def next_grapheme( << unquote( codepoint) , rest :: binary > > = string) do
246
+ next_regional ( rest , string , unquote ( size ( codepoint ) ) )
247
+ end
248
+ end
249
+
250
+ # Handle extended entries
251
+ def next_grapheme ( << cp :: utf8 , rest :: binary >> = string ) do
252
+ next_extend ( rest , string , byte_size ( << cp :: utf8 >> ) )
253
+ end
254
+
255
+ def next_grapheme( << cp, rest :: binary > > ) do
256
+ { << cp >> , rest }
257
+ end
258
+
259
+ def next_grapheme ( << >> ) do
260
+ nil
261
+ end
262
+
263
+ # Handle Hangul L
264
+ lc codepoint inlist cluster [ "L" ] do
265
+ defp next_hangul_l ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
266
+ next_hangul_l ( rest , string , size + unquote ( size ( codepoint ) ) )
267
+ end
268
+ end
269
+
270
+ lc codepoint inlist cluster[ "LV" ] do
271
+ defp next_hangul_l ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
272
+ next_hangul_v ( rest , string , size + unquote ( size ( codepoint ) ) )
273
+ end
274
+ end
275
+
276
+ lc codepoint inlist cluster[ "LVT" ] do
277
+ defp next_hangul_l ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
278
+ next_hangul_t ( rest , string , size + unquote ( size ( codepoint ) ) )
279
+ end
280
+ end
281
+
282
+ defp next_hangul_l ( rest , string , size ) do
283
+ next_hangul_v ( rest , string , size )
284
+ end
285
+
286
+ # Handle Hangul V
287
+ lc codepoint inlist cluster [ "V" ] do
288
+ defp next_hangul_v ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
289
+ next_hangul_v ( rest , string , size + unquote ( size ( codepoint ) ) )
290
+ end
291
+ end
292
+
293
+ defp next_hangul_v( rest, string , size ) do
294
+ next_hangul_t ( rest , string , size )
295
+ end
296
+
297
+ # Handle Hangul T
298
+ lc codepoint inlist cluster [ "T" ] do
299
+ defp next_hangul_t ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
300
+ next_hangul_t ( rest , string , size + unquote ( size ( codepoint ) ) )
301
+ end
302
+ end
303
+
304
+ defp next_hangul_t( rest, string , size ) do
305
+ next_extend ( rest , string , size )
306
+ end
307
+
308
+ # Handle regional
309
+ lc codepoint inlist cluster [ "Regional_Indicator" ] do
310
+ defp next_regional ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
311
+ next_regional ( rest , string , size + unquote ( size ( codepoint ) ) )
312
+ end
313
+ end
314
+
315
+ defp next_regional( rest, string , size ) do
316
+ next_extend ( rest , string , size )
317
+ end
318
+
319
+ # Handle Extend+SpacingMark
320
+ lc codepoint inlist cluster [ "Extend" ] ++ cluster [ "SpacingMark" ] do
321
+ defp next_extend ( << unquote ( codepoint ) , rest :: binary >> , string , size ) do
322
+ next_extend ( rest , string , size + unquote ( size ( codepoint ) ) )
323
+ end
324
+ end
325
+
326
+ defp next_extend( rest, string , size ) do
327
+ { :binary . part ( string , 0 , size ) , rest }
328
+ end
329
+
330
+ # Handle Prepend
331
+ # lc codepoint inlist cluster["Prepend"] do
332
+ # defp next_prepend(<< unquote(codepoint), rest :: binary >>, string, size) do
333
+ # next_prepend(rest, string, size + unquote(size(codepoint)))
334
+ # end
335
+ # end
336
+ #
337
+ # defp next_prepend(rest, string, size) do
338
+ # { :binary.part(string, 0, size), rest }
339
+ # end
340
+
341
+ def graphemes ( binary ) when is_binary ( binary ) do
342
+ do_graphemes ( next_grapheme ( binary ) )
343
+ end
344
+
345
+ defp do_graphemes ( { c , rest } ) do
346
+ [ c | do_graphemes ( next_grapheme ( rest ) ) ]
347
+ end
348
+
349
+ defp do_graphemes ( nil ) do
350
+ [ ]
351
+ end
352
+ end
0 commit comments