Skip to content

Commit 28f553a

Browse files
author
José Valim
committed
Handle compositions with non-zero combining class
1 parent ae2da55 commit 28f553a

File tree

2 files changed

+20
-10
lines changed

2 files changed

+20
-10
lines changed

lib/elixir/test/elixir/string_test.exs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,11 @@ defmodule StringTest do
338338
# LATIN SMALL LETTER A, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT,
339339
# HEBREW ACCENT ZINOR, COMBINING GRAVE TONE MARK, LATIN SMALL LETTER B
340340
assert String.normalize("à֮̀̕b", :nfc) == "à֮̀̕b"
341+
342+
# 0344
343+
# 0308 0301
344+
# COMBINING GREEK DIALYTIKA TONOS
345+
assert String.normalize("\u0344", :nfc) == "\u0308\u0301"
341346
end
342347

343348
test "graphemes" do

lib/elixir/unicode/unicode.ex

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ end
248248
data_path = Path.join(__DIR__, "UnicodeData.txt")
249249

250250
{codes, non_breakable, decompositions, combining_classes} =
251-
Enum.reduce File.stream!(data_path), {[], [], %{}, []}, fn line, {cacc, wacc, dacc, kacc} ->
251+
Enum.reduce File.stream!(data_path), {[], [], %{}, %{}}, fn line, {cacc, wacc, dacc, kacc} ->
252252
[codepoint, _name, _category,
253253
class, _bidi, decomposition,
254254
_numeric_1, _numeric_2, _numeric_3,
@@ -276,16 +276,16 @@ data_path = Path.join(__DIR__, "UnicodeData.txt")
276276
decomposition =
277277
decomposition
278278
|> :binary.split(" ", [:global])
279-
|> Enum.map(&<<String.to_integer(&1, 16)::utf8>>)
280-
Map.put(dacc, to_binary.(codepoint), decomposition)
279+
|> Enum.map(&String.to_integer(&1, 16))
280+
Map.put(dacc, String.to_integer(codepoint, 16), decomposition)
281281
_ ->
282282
dacc
283283
end
284284

285285
kacc =
286286
case Integer.parse(class) do
287287
{0, ""} -> kacc
288-
{n, ""} -> [{String.to_integer(codepoint, 16), n}|kacc]
288+
{n, ""} -> Map.put(kacc, String.to_integer(codepoint, 16), n)
289289
end
290290

291291
{cacc, wacc, dacc, kacc}
@@ -459,7 +459,7 @@ defmodule String.Normalizer do
459459
compositions = Enum.reduce File.stream!(exclusions_path), decompositions, fn
460460
<<h, _::binary>> = line, acc when h in ?0..?9 or h in ?A..?F ->
461461
[codepoint, _] = :binary.split(line, " ")
462-
Map.delete(acc, to_binary.(codepoint))
462+
Map.delete(acc, String.to_integer(codepoint, 16))
463463
_, acc ->
464464
acc
465465
end
@@ -514,9 +514,10 @@ defmodule String.Normalizer do
514514
end
515515
end
516516

517-
for {binary, decomposition} <- decompositions do
518-
defp canonical_order(unquote(binary) <> rest, acc) do
519-
canonical_order(unquote(IO.iodata_to_binary(decomposition)) <> rest, acc)
517+
for {cp, decomposition} <- decompositions do
518+
decomposition = decomposition |> Enum.map(&<<&1::utf8>>) |> IO.iodata_to_binary()
519+
defp canonical_order(unquote(<<cp::utf8>>) <> rest, acc) do
520+
canonical_order(unquote(decomposition) <> rest, acc)
520521
end
521522
end
522523
defp canonical_order(<<h::utf8, t::binary>>, acc) do
@@ -566,8 +567,12 @@ defmodule String.Normalizer do
566567
end
567568
end
568569

569-
for {composition, [_, _] = binary} <- compositions do
570-
defp compose_one(unquote(IO.iodata_to_binary(binary))), do: unquote(composition)
570+
# Compositions:
571+
# 1. We must exclude compositions with a single codepoint
572+
# 2. We must exclude compositions whose first codepoint has a non-zero combining class
573+
for {cp, [fst, snd]} <- compositions,
574+
Map.get(combining_classes, fst, 0) == 0 do
575+
defp compose_one(unquote(<<fst::utf8, snd::utf8>>)), do: unquote(<<cp::utf8>>)
571576
end
572577

573578
defp compose_one(_), do: nil

0 commit comments

Comments
 (0)