|
248 | 248 | data_path = Path.join(__DIR__, "UnicodeData.txt")
|
249 | 249 |
|
250 | 250 | {codes, non_breakable, decompositions, combining_classes} =
|
251 |
| - Enum.reduce File.stream!(data_path), {[], [], %{}, []}, fn line, {cacc, wacc, dacc, kacc} -> |
| 251 | + Enum.reduce File.stream!(data_path), {[], [], %{}, %{}}, fn line, {cacc, wacc, dacc, kacc} -> |
252 | 252 | [codepoint, _name, _category,
|
253 | 253 | class, _bidi, decomposition,
|
254 | 254 | _numeric_1, _numeric_2, _numeric_3,
|
@@ -276,16 +276,16 @@ data_path = Path.join(__DIR__, "UnicodeData.txt")
|
276 | 276 | decomposition =
|
277 | 277 | decomposition
|
278 | 278 | |> :binary.split(" ", [:global])
|
279 |
| - |> Enum.map(&<<String.to_integer(&1, 16)::utf8>>) |
280 |
| - Map.put(dacc, to_binary.(codepoint), decomposition) |
| 279 | + |> Enum.map(&String.to_integer(&1, 16)) |
| 280 | + Map.put(dacc, String.to_integer(codepoint, 16), decomposition) |
281 | 281 | _ ->
|
282 | 282 | dacc
|
283 | 283 | end
|
284 | 284 |
|
285 | 285 | kacc =
|
286 | 286 | case Integer.parse(class) do
|
287 | 287 | {0, ""} -> kacc
|
288 |
| - {n, ""} -> [{String.to_integer(codepoint, 16), n}|kacc] |
| 288 | + {n, ""} -> Map.put(kacc, String.to_integer(codepoint, 16), n) |
289 | 289 | end
|
290 | 290 |
|
291 | 291 | {cacc, wacc, dacc, kacc}
|
@@ -459,7 +459,7 @@ defmodule String.Normalizer do
|
459 | 459 | compositions = Enum.reduce File.stream!(exclusions_path), decompositions, fn
|
460 | 460 | <<h, _::binary>> = line, acc when h in ?0..?9 or h in ?A..?F ->
|
461 | 461 | [codepoint, _] = :binary.split(line, " ")
|
462 |
| - Map.delete(acc, to_binary.(codepoint)) |
| 462 | + Map.delete(acc, String.to_integer(codepoint, 16)) |
463 | 463 | _, acc ->
|
464 | 464 | acc
|
465 | 465 | end
|
@@ -514,9 +514,10 @@ defmodule String.Normalizer do
|
514 | 514 | end
|
515 | 515 | end
|
516 | 516 |
|
517 |
| - for {binary, decomposition} <- decompositions do |
518 |
| - defp canonical_order(unquote(binary) <> rest, acc) do |
519 |
| - canonical_order(unquote(IO.iodata_to_binary(decomposition)) <> rest, acc) |
| 517 | + for {cp, decomposition} <- decompositions do |
| 518 | + decomposition = decomposition |> Enum.map(&<<&1::utf8>>) |> IO.iodata_to_binary() |
| 519 | + defp canonical_order(unquote(<<cp::utf8>>) <> rest, acc) do |
| 520 | + canonical_order(unquote(decomposition) <> rest, acc) |
520 | 521 | end
|
521 | 522 | end
|
522 | 523 | defp canonical_order(<<h::utf8, t::binary>>, acc) do
|
@@ -566,8 +567,12 @@ defmodule String.Normalizer do
|
566 | 567 | end
|
567 | 568 | end
|
568 | 569 |
|
569 |
| - for {composition, [_, _] = binary} <- compositions do |
570 |
| - defp compose_one(unquote(IO.iodata_to_binary(binary))), do: unquote(composition) |
| 570 | + # Compositions: |
| 571 | + # 1. We must exclude compositions whose decomposition is a single codepoint |
| 572 | + # 2. We must exclude compositions whose first codepoint has a non-zero combining class |
| 573 | + for {cp, [fst, snd]} <- compositions, |
| 574 | + Map.get(combining_classes, fst, 0) == 0 do |
| 575 | + defp compose_one(unquote(<<fst::utf8, snd::utf8>>)), do: unquote(<<cp::utf8>>) |
571 | 576 | end
|
572 | 577 |
|
573 | 578 | defp compose_one(_), do: nil
|
|
0 commit comments