Skip to content

Commit ae2da55

Browse files
author
José Valim
committed
More optimizations and improvements to normalization
1 parent e96fd72 commit ae2da55

File tree

3 files changed

+49
-31
lines changed

3 files changed

+49
-31
lines changed

lib/elixir/lib/string.ex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ defmodule String do
504504
## Examples
505505
506506
iex> String.normalize("yêṩ", :nfd)
507-
"yêṩ"
507+
"yêṩ"
508508
509509
iex> String.normalize("leña", :nfc)
510510
"leña"

lib/elixir/test/elixir/string_test.exs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,13 +314,30 @@ defmodule StringTest do
314314

315315
test "normalize" do
316316
assert String.normalize("ŝ", :nfd) == "ŝ"
317-
assert String.normalize("ḇravô", :nfd) == "ḇravô"
317+
assert String.normalize("ḇravô", :nfd) == "ḇravô"
318318
assert String.normalize("ṩierra", :nfd) == "ṩierra"
319319
assert String.normalize("뢴", :nfd) == "뢴"
320320
assert String.normalize("êchǭ", :nfc) == "êchǭ"
321321
assert String.normalize("거̄", :nfc) == "거̄"
322322
assert String.normalize("뢴", :nfc) == "뢴"
323+
324+
# 05B8 05B9 05B1 0591 05C3 05B0 05AC 059F
325+
# 05B1 05B8 05B9 0591 05C3 05B0 05AC 059F
326+
# HEBREW POINT QAMATS, HEBREW POINT HOLAM, HEBREW POINT HATAF SEGOL,
327+
# HEBREW ACCENT ETNAHTA, HEBREW PUNCTUATION SOF PASUQ, HEBREW POINT SHEVA,
328+
# HEBREW ACCENT ILUY, HEBREW ACCENT QARNEY PARA
323329
assert String.normalize("ֱָֹ֑׃ְ֬֟", :nfc) == "ֱָֹ֑׃ְ֬֟"
330+
331+
# 095D (exclusion list)
332+
# 0922 093C
333+
# DEVANAGARI LETTER RHA
334+
assert String.normalize("ढ़", :nfc) == "ढ़"
335+
336+
# 0061 0315 0300 05AE 0340 0062
337+
# 00E0 05AE 0300 0315 0062
338+
# LATIN SMALL LETTER A, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT,
339+
# HEBREW ACCENT ZINOR, COMBINING GRAVE TONE MARK, LATIN SMALL LETTER B
340+
assert String.normalize("à֮̀̕b", :nfc) == "à֮̀̕b"
324341
end
325342

326343
test "graphemes" do

lib/elixir/unicode/unicode.ex

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -490,18 +490,12 @@ defmodule String.Normalizer do
490490
normalize_nfd(rest, acc <> binary)
491491
end
492492

493-
for {binary, decomposition} <- decompositions do
494-
defp normalize_nfd(unquote(binary) <> rest, acc) do
495-
normalize_nfd(unquote(IO.iodata_to_binary(decomposition)) <> rest, acc)
496-
end
497-
end
498-
499493
defp normalize_nfd(binary, acc) do
500494
{n, rest} = String.Unicode.next_grapheme_size(binary)
501495
part = :binary.part(binary, 0, n)
502496
case n do
503-
1 -> normalize_nfd(rest, acc <> part)
504-
_ -> normalize_nfd(rest, acc <> canonical_order(part))
497+
1 -> normalize_nfd(rest, acc <> part) # one-byte grapheme: nothing to reorder; keep recursing in NFD (not NFC)
498+
_ -> normalize_nfd(rest, acc <> canonical_order(part, []))
505499
end
506500
end
507501

@@ -520,11 +514,21 @@ defmodule String.Normalizer do
520514
end
521515
end
522516

523-
defp canonical_order(binary) do
524-
binary
525-
|> :unicode.characters_to_list()
526-
|> Enum.sort_by(&combining_class/1)
527-
|> :unicode.characters_to_binary()
517+
for {binary, decomposition} <- decompositions do
518+
defp canonical_order(unquote(binary) <> rest, acc) do
519+
canonical_order(unquote(IO.iodata_to_binary(decomposition)) <> rest, acc)
520+
end
521+
end
522+
defp canonical_order(<<h::utf8, t::binary>>, acc) do
523+
canonical_order(t, [{h, combining_class(h)}|acc])
524+
end
525+
defp canonical_order(<<>>, [{x, _}]) do
526+
<<x::utf8>>
527+
end
528+
defp canonical_order(<<>>, acc) do
529+
:lists.keysort(2, Enum.reverse(acc))
530+
|> Enum.map(&<<elem(&1, 0)::utf8>>)
531+
|> IO.iodata_to_binary
528532
end
529533

530534
for {codepoint, class} <- combining_classes do
@@ -533,8 +537,6 @@ defmodule String.Normalizer do
533537

534538
defp combining_class(_), do: 0
535539

536-
defp compose(<<_::utf8>> = binary), do: binary
537-
538540
defp compose(<<lead::utf8, vowel::utf8, rest::binary>>) when lead in 0x1100..0x1112 and vowel in 0x1161..0x1175 do
539541
codepoint = 0xAC00 + ((lead - 0x1100) * 588) + ((vowel - 0x1161) * 28)
540542
case rest do
@@ -545,29 +547,28 @@ defmodule String.Normalizer do
545547
end
546548
end
547549

548-
for {composition, [_, _] = binary} <- compositions do
549-
defp compose(unquote(IO.iodata_to_binary(binary))), do: unquote(composition)
550-
end
551-
552-
defp compose(<<cp::utf8, rest::binary>>) do
553-
compose(rest, <<cp::utf8>>, "", combining_class(cp) - 1)
550+
defp compose(binary) do
551+
compose_one(binary) || (
552+
<<cp::utf8, rest::binary>> = binary
553+
compose_many(rest, <<cp::utf8>>, "", combining_class(cp) - 1)
554+
)
554555
end
555556

556-
defp compose("", base, accents, _), do: base <> accents
557+
defp compose_many("", base, accents, _), do: base <> accents
557558

558-
defp compose(<<cp::utf8, rest::binary>>, base, accents, last_class) do
559+
defp compose_many(<<cp::utf8, rest::binary>>, base, accents, last_class) do
559560
part_class = combining_class(cp)
560561
combined = <<base::binary, cp::utf8>>
561-
if last_class < part_class and composable?(combined) do
562-
compose(rest, compose(combined), accents, last_class)
562+
if composed = (last_class < part_class && compose_one(combined)) do
563+
compose_many(rest, composed, accents, last_class)
563564
else
564-
compose(rest, base, <<accents::binary, cp::utf8>>, part_class)
565+
compose_many(rest, base, <<accents::binary, cp::utf8>>, part_class)
565566
end
566567
end
567568

568-
for {_, [_, _] = binary} <- compositions do
569-
defp composable?(unquote(IO.iodata_to_binary(binary))), do: true
569+
for {composition, [_, _] = binary} <- compositions do
570+
defp compose_one(unquote(IO.iodata_to_binary(binary))), do: unquote(composition)
570571
end
571572

572-
defp composable?(_), do: false
573+
defp compose_one(_), do: nil
573574
end

0 commit comments

Comments
 (0)