Skip to content

Commit 5a2b24d

Browse files
authored
Group Unicode upcase/downcase by prefix (#11310)
The patch computes byte lookups based on the prefix. For example, Á, É, etc. all have the same prefix <<195>>, so they are lumped together for lookup and then we just do a byte lookup later. We tried doing the byte lookup on a 64-element tuple (since the byte is always within 0b10000000 and 0b10111111) but that's slower, especially because we need to check the byte range for invalid Unicode, so instead the last byte lookup is a case. Grouping the top-level lookup makes the cost of a miss 3x cheaper (albeit a hit is 10% more expensive) and reduces bytecode size.
1 parent d962ddb commit 5a2b24d

File tree

1 file changed

+105
-15
lines changed

1 file changed

+105
-15
lines changed

lib/elixir/unicode/unicode.ex

Lines changed: 105 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,45 @@ defmodule String.Unicode do
169169
)
170170
end)
171171

172+
# The function computes byte lookups based on the prefix. For example,
173+
# Á, É, etc all have the same prefix <<195>>, so they are lumped
174+
# together for lookup and then we just do a byte lookup later. We
175+
# tried doing the byte lookup on 64-element tuple (since the byte
176+
# is always within 0b10000000 and 0b10111111) but that's slower,
177+
# especially because we need to check the byte range for invalid
178+
# Unicode, so instead the last byte lookup is a case. Grouping the
179+
# top-level lookup makes the cost of a miss 3x cheaper albeit a
180+
# hit is 10% more expensive and reduces bytecode size.
181+
compute_lookup = fn key_values ->
182+
prefixes =
183+
Enum.reduce(key_values, %{}, fn {codepoint, result}, acc ->
184+
prefix_size = bit_size(codepoint) - 8
185+
<<prefix::size(prefix_size)-bits, byte>> = codepoint
186+
Map.update(acc, prefix, [{byte, result}], &[{byte, result} | &1])
187+
end)
188+
189+
{singles, tables} =
190+
Enum.reduce(Map.delete(prefixes, ""), {[], []}, fn {prefix, pairs}, {singles, tables} ->
191+
case pairs do
192+
[{byte, result}] ->
193+
{[{prefix <> <<byte>>, result} | singles], tables}
194+
195+
_ ->
196+
clauses =
197+
Enum.flat_map(pairs, fn {byte, result} ->
198+
quote do
199+
unquote(byte) -> unquote(result)
200+
end
201+
end)
202+
203+
clauses = clauses ++ quote do: (byte -> <<unquote(prefix), byte>>)
204+
{singles, [{prefix, clauses} | tables]}
205+
end
206+
end)
207+
208+
{Enum.sort(singles), Enum.sort_by(tables, &(-byte_size(elem(&1, 0))))}
209+
end
210+
172211
# Sigma variants for Greek
173212
@letter_sigma <<0x03A3::utf8>>
174213
@letter_small_sigma_final <<0x03C2::utf8>>
@@ -214,16 +253,33 @@ defmodule String.Unicode do
214253

215254
conditional_downcase = [@letter_I, @letter_I_dot_above, @letter_sigma]
216255

217-
for {codepoint, _upper, lower, _title} <- codes,
218-
lower && lower != codepoint,
219-
codepoint not in conditional_downcase do
256+
{singles, tables} =
257+
compute_lookup.(
258+
for {codepoint, _upper, lower, _title} <- codes,
259+
lower && lower != codepoint,
260+
codepoint not in conditional_downcase,
261+
do: {codepoint, lower}
262+
)
263+
264+
for {codepoint, lower} <- singles do
220265
def downcase(<<unquote(codepoint), rest::bits>>, acc, mode) do
221266
downcase(rest, [unquote(lower) | acc], mode)
222267
end
223268
end
224269

225-
def downcase(<<char, rest::bits>>, acc, mode) do
226-
downcase(rest, [<<char>> | acc], mode)
270+
for {prefix, clauses} <- tables do
271+
def downcase(<<unquote(prefix), byte, rest::bits>>, acc, mode) do
272+
value = case byte, do: unquote(clauses)
273+
downcase(rest, [value | acc], mode)
274+
end
275+
end
276+
277+
def downcase(<<byte, rest::bits>>, acc, mode) do
278+
if byte >= ?A and byte <= ?Z do
279+
downcase(rest, [byte + 32 | acc], mode)
280+
else
281+
downcase(rest, [byte | acc], mode)
282+
end
227283
end
228284

229285
def downcase("", acc, _mode), do: IO.iodata_to_binary(:lists.reverse(acc))
@@ -284,16 +340,33 @@ defmodule String.Unicode do
284340

285341
conditional_upcase = [@letter_i]
286342

287-
for {codepoint, upper, _lower, _title} <- codes,
288-
upper && upper != codepoint,
289-
codepoint not in conditional_upcase do
343+
{singles, tables} =
344+
compute_lookup.(
345+
for {codepoint, upper, _lower, _title} <- codes,
346+
upper && upper != codepoint,
347+
codepoint not in conditional_upcase,
348+
do: {codepoint, upper}
349+
)
350+
351+
for {codepoint, upper} <- singles do
290352
def upcase(<<unquote(codepoint), rest::bits>>, acc, mode) do
291353
upcase(rest, [unquote(upper) | acc], mode)
292354
end
293355
end
294356

295-
def upcase(<<char, rest::bits>>, acc, mode) do
296-
upcase(rest, [char | acc], mode)
357+
for {prefix, clauses} <- tables do
358+
def upcase(<<unquote(prefix), byte, rest::bits>>, acc, mode) do
359+
value = case byte, do: unquote(clauses)
360+
upcase(rest, [value | acc], mode)
361+
end
362+
end
363+
364+
def upcase(<<byte, rest::bits>>, acc, mode) do
365+
if byte >= ?a and byte <= ?z do
366+
upcase(rest, [byte - 32 | acc], mode)
367+
else
368+
upcase(rest, [byte | acc], mode)
369+
end
297370
end
298371

299372
def upcase("", acc, _mode), do: IO.iodata_to_binary(:lists.reverse(acc))
@@ -310,16 +383,33 @@ defmodule String.Unicode do
310383

311384
conditional_titlecase = [@letter_i]
312385

313-
for {codepoint, _upper, _lower, title} <- codes,
314-
title && title != codepoint,
315-
codepoint not in conditional_titlecase do
316-
def titlecase_once(unquote(codepoint) <> rest, _mode) do
386+
{singles, tables} =
387+
compute_lookup.(
388+
for {codepoint, _upper, _lower, title} <- codes,
389+
title && title != codepoint,
390+
codepoint not in conditional_titlecase,
391+
do: {codepoint, title}
392+
)
393+
394+
for {codepoint, title} <- singles do
395+
def titlecase_once(<<unquote(codepoint), rest::bits>>, _mode) do
317396
{unquote(title), rest}
318397
end
319398
end
320399

400+
for {prefix, clauses} <- tables do
401+
def titlecase_once(<<unquote(prefix), byte, rest::bits>>, _mode) do
402+
value = case byte, do: unquote(clauses)
403+
{value, rest}
404+
end
405+
end
406+
321407
def titlecase_once(<<char::utf8, rest::binary>>, _mode) do
322-
{<<char::utf8>>, rest}
408+
if char >= ?a and char <= ?z do
409+
{<<char - 32::utf8>>, rest}
410+
else
411+
{<<char::utf8>>, rest}
412+
end
323413
end
324414

325415
def titlecase_once(<<char, rest::binary>>, _mode) do

0 commit comments

Comments
 (0)