@@ -169,6 +169,45 @@ defmodule String.Unicode do
169
169
)
170
170
end )
171
171
172
+ # The function computes byte lookups based on the prefix. For example,
173
+ # Á, É, etc all have the same prefix <<195>>, so they are lumped
174
+ # together for lookup and then we just do a byte lookup later. We
175
+ # tried doing the byte lookup on a 64-element tuple (since the byte
176
+ # is always within 0b10000000 and 0b10111111) but that's slower,
177
+ # especially because we need to check the byte range for invalid
178
+ # Unicode, so instead the last byte lookup is a case. Grouping the
179
+ # top-level lookup makes the cost of a miss 3x cheaper (albeit a
180
+ # hit is 10% more expensive) and reduces bytecode size.
181
# Builds the data for prefix-grouped lookups from a list of
# `{codepoint, result}` binary pairs. Returns `{singles, tables}`:
#
#   * `singles` - sorted `{codepoint, result}` pairs whose prefix (all
#     bytes but the last) is not shared with any other pair
#   * `tables` - `{prefix, clauses}` pairs, longest prefix first, where
#     `clauses` are quoted `case` clauses mapping the last byte to its
#     result, ending in a catch-all that rebuilds `<<prefix, byte>>`
#
# Pairs whose codepoint is a single byte (empty prefix) are dropped.
compute_lookup = fn key_values ->
  by_prefix =
    Enum.reduce(key_values, %{}, fn {codepoint, result}, groups ->
      leading_size = byte_size(codepoint) - 1
      <<leading::binary-size(leading_size), last>> = codepoint
      entry = {last, result}
      Map.update(groups, leading, [entry], fn entries -> [entry | entries] end)
    end)

  {singles, tables} =
    by_prefix
    |> Map.delete("")
    |> Enum.reduce({[], []}, fn
      # A prefix with exactly one entry is turned back into a full codepoint
      {leading, [{last, result}]}, {singles, tables} ->
        {[{leading <> <<last>>, result} | singles], tables}

      {leading, entries}, {singles, tables} ->
        known =
          Enum.flat_map(entries, fn {last, result} ->
            quote do
              unquote(last) -> unquote(result)
            end
          end)

        # Catch-all: a trailing byte without a mapping is emitted untouched
        passthrough = quote do: (byte -> <<unquote(leading), byte>>)
        {singles, [{leading, known ++ passthrough} | tables]}
    end)

  longest_first = Enum.sort_by(tables, fn {leading, _clauses} -> -byte_size(leading) end)
  {Enum.sort(singles), longest_first}
end
210
+
172
211
# Sigma variants for Greek
173
212
@ letter_sigma << 0x03A3 :: utf8 >>
174
213
@ letter_small_sigma_final << 0x03C2 :: utf8 >>
@@ -214,16 +253,33 @@ defmodule String.Unicode do
214
253
215
254
# Codepoints kept out of the generated downcase lookup (see the `not in` filter below).
# NOTE(review): presumably they are handled by dedicated clauses elsewhere - confirm.
conditional_downcase = [ @ letter_I , @ letter_I_dot_above , @ letter_sigma ]
216
255
217
- for { codepoint , _upper , lower , _title } <- codes ,
218
- lower && lower != codepoint ,
219
- codepoint not in conditional_downcase do
256
# Feed every codepoint that has a distinct lowercase form, and is not in
# the conditional list, into the prefix-grouped lookup builder.
{singles, tables} =
  compute_lookup.(
    for {source, _upper, lower, _title} <- codes,
        lower && lower != source && source not in conditional_downcase do
      {source, lower}
    end
  )
263
+
264
# One clause per codepoint whose prefix is not shared with another mapping
for {source, target} <- singles do
  def downcase(<<unquote(source), tail::bitstring>>, acc, mode),
    do: downcase(tail, [unquote(target) | acc], mode)
end
224
269
225
- def downcase ( << char , rest :: bits >> , acc , mode ) do
226
- downcase ( rest , [ << char >> | acc ] , mode )
270
# One clause per shared prefix; the byte that follows the prefix is
# resolved by the generated `case` clauses.
for {prefix, clauses} <- tables do
  def downcase(<<unquote(prefix), byte, tail::bitstring>>, acc, mode) do
    downcase(tail, [case(byte, do: unquote(clauses)) | acc], mode)
  end
end
276
+
277
# Byte-level fallback: bytes in ?A..?Z are shifted by 32 to their
# lowercase counterpart, every other byte is copied through unchanged.
def downcase(<<byte, rest::bits>>, acc, mode) when byte >= ?A and byte <= ?Z do
  downcase(rest, [byte + 32 | acc], mode)
end

def downcase(<<byte, rest::bits>>, acc, mode) do
  downcase(rest, [byte | acc], mode)
end

# End of input: the accumulator was built back to front
def downcase("", acc, _mode) do
  acc
  |> :lists.reverse()
  |> IO.iodata_to_binary()
end
@@ -284,16 +340,33 @@ defmodule String.Unicode do
284
340
285
341
# Codepoints kept out of the generated upcase lookup (see the `not in` filter below).
# NOTE(review): presumably they are handled by dedicated clauses elsewhere - confirm.
conditional_upcase = [ @ letter_i ]
286
342
287
- for { codepoint , upper , _lower , _title } <- codes ,
288
- upper && upper != codepoint ,
289
- codepoint not in conditional_upcase do
343
# Feed every codepoint that has a distinct uppercase form, and is not in
# the conditional list, into the prefix-grouped lookup builder.
{singles, tables} =
  compute_lookup.(
    for {source, upper, _lower, _title} <- codes,
        upper && upper != source && source not in conditional_upcase do
      {source, upper}
    end
  )
350
+
351
# One clause per codepoint whose prefix is not shared with another mapping
for {source, target} <- singles do
  def upcase(<<unquote(source), tail::bitstring>>, acc, mode),
    do: upcase(tail, [unquote(target) | acc], mode)
end
294
356
295
- def upcase ( << char , rest :: bits >> , acc , mode ) do
296
- upcase ( rest , [ char | acc ] , mode )
357
# One clause per shared prefix; the byte that follows the prefix is
# resolved by the generated `case` clauses.
for {prefix, clauses} <- tables do
  def upcase(<<unquote(prefix), byte, tail::bitstring>>, acc, mode) do
    upcase(tail, [case(byte, do: unquote(clauses)) | acc], mode)
  end
end
363
+
364
# Byte-level fallback: bytes in ?a..?z are shifted by 32 to their
# uppercase counterpart, every other byte is copied through unchanged.
def upcase(<<byte, rest::bits>>, acc, mode) when byte >= ?a and byte <= ?z do
  upcase(rest, [byte - 32 | acc], mode)
end

def upcase(<<byte, rest::bits>>, acc, mode) do
  upcase(rest, [byte | acc], mode)
end

# End of input: the accumulator was built back to front
def upcase("", acc, _mode) do
  acc
  |> :lists.reverse()
  |> IO.iodata_to_binary()
end
@@ -310,16 +383,33 @@ defmodule String.Unicode do
310
383
311
384
# Codepoints kept out of the generated titlecase lookup (see the `not in` filter below).
# NOTE(review): presumably they are handled by dedicated clauses elsewhere - confirm.
conditional_titlecase = [ @ letter_i ]
312
385
313
- for { codepoint , _upper , _lower , title } <- codes ,
314
- title && title != codepoint ,
315
- codepoint not in conditional_titlecase do
316
- def titlecase_once ( unquote ( codepoint ) <> rest , _mode ) do
386
# Feed every codepoint that has a distinct titlecase form, and is not in
# the conditional list, into the prefix-grouped lookup builder.
{singles, tables} =
  compute_lookup.(
    for {source, _upper, _lower, title} <- codes,
        title && title != source && source not in conditional_titlecase do
      {source, title}
    end
  )
393
+
394
# One clause per codepoint whose prefix is not shared with another mapping
for {source, target} <- singles do
  def titlecase_once(<<unquote(source), tail::bitstring>>, _mode),
    do: {unquote(target), tail}
end
320
399
400
# One clause per shared prefix; the byte that follows the prefix is
# resolved by the generated `case` clauses.
for {prefix, clauses} <- tables do
  def titlecase_once(<<unquote(prefix), byte, tail::bitstring>>, _mode) do
    {case(byte, do: unquote(clauses)), tail}
  end
end
406
+
321
407
# Fallback for a leading UTF-8 codepoint: ?a..?z is shifted by 32 to its
# uppercase letter, any other codepoint is re-encoded unchanged.
# Returns `{first_codepoint_as_binary, rest}`.
def titlecase_once(<<char::utf8, rest::binary>>, _mode) when char >= ?a and char <= ?z do
  {<<(char - 32)::utf8>>, rest}
end

def titlecase_once(<<char::utf8, rest::binary>>, _mode) do
  {<<char::utf8>>, rest}
end
324
414
325
415
def titlecase_once ( << char , rest :: binary >> , _mode ) do
0 commit comments