Skip to content

Commit e96fd72

Browse files
author
José Valim
committed
Make decomposition recursive and consider exclusion list
1 parent 9d9417e commit e96fd72

File tree

2 files changed

+219
-17
lines changed

2 files changed

+219
-17
lines changed
lib/elixir/unicode/CompositionExclusions.txt

Lines changed: 206 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,206 @@
1+
# CompositionExclusions-8.0.0.txt
2+
# Date: 2015-02-19, 00:30:00 GMT [KW, LI]
3+
#
4+
# This file lists the characters for the Composition Exclusion Table
5+
# defined in UAX #15, Unicode Normalization Forms.
6+
#
7+
# This file is a normative contributory data file in the
8+
# Unicode Character Database.
9+
#
10+
# Copyright (c) 1991-2015 Unicode, Inc.
11+
# For terms of use, see http://www.unicode.org/terms_of_use.html
12+
#
13+
# For more information, see
14+
# http://www.unicode.org/unicode/reports/tr15/#Primary_Exclusion_List_Table
15+
#
16+
# For a full derivation of composition exclusions, see the derived property
17+
# Full_Composition_Exclusion in DerivedNormalizationProps.txt
18+
#
19+
20+
# ================================================
21+
# (1) Script Specifics
22+
#
23+
# This list of characters cannot be derived from the UnicodeData.txt file.
24+
# ================================================
25+
26+
0958 # DEVANAGARI LETTER QA
27+
0959 # DEVANAGARI LETTER KHHA
28+
095A # DEVANAGARI LETTER GHHA
29+
095B # DEVANAGARI LETTER ZA
30+
095C # DEVANAGARI LETTER DDDHA
31+
095D # DEVANAGARI LETTER RHA
32+
095E # DEVANAGARI LETTER FA
33+
095F # DEVANAGARI LETTER YYA
34+
09DC # BENGALI LETTER RRA
35+
09DD # BENGALI LETTER RHA
36+
09DF # BENGALI LETTER YYA
37+
0A33 # GURMUKHI LETTER LLA
38+
0A36 # GURMUKHI LETTER SHA
39+
0A59 # GURMUKHI LETTER KHHA
40+
0A5A # GURMUKHI LETTER GHHA
41+
0A5B # GURMUKHI LETTER ZA
42+
0A5E # GURMUKHI LETTER FA
43+
0B5C # ORIYA LETTER RRA
44+
0B5D # ORIYA LETTER RHA
45+
0F43 # TIBETAN LETTER GHA
46+
0F4D # TIBETAN LETTER DDHA
47+
0F52 # TIBETAN LETTER DHA
48+
0F57 # TIBETAN LETTER BHA
49+
0F5C # TIBETAN LETTER DZHA
50+
0F69 # TIBETAN LETTER KSSA
51+
0F76 # TIBETAN VOWEL SIGN VOCALIC R
52+
0F78 # TIBETAN VOWEL SIGN VOCALIC L
53+
0F93 # TIBETAN SUBJOINED LETTER GHA
54+
0F9D # TIBETAN SUBJOINED LETTER DDHA
55+
0FA2 # TIBETAN SUBJOINED LETTER DHA
56+
0FA7 # TIBETAN SUBJOINED LETTER BHA
57+
0FAC # TIBETAN SUBJOINED LETTER DZHA
58+
0FB9 # TIBETAN SUBJOINED LETTER KSSA
59+
FB1D # HEBREW LETTER YOD WITH HIRIQ
60+
FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH
61+
FB2A # HEBREW LETTER SHIN WITH SHIN DOT
62+
FB2B # HEBREW LETTER SHIN WITH SIN DOT
63+
FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
64+
FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
65+
FB2E # HEBREW LETTER ALEF WITH PATAH
66+
FB2F # HEBREW LETTER ALEF WITH QAMATS
67+
FB30 # HEBREW LETTER ALEF WITH MAPIQ
68+
FB31 # HEBREW LETTER BET WITH DAGESH
69+
FB32 # HEBREW LETTER GIMEL WITH DAGESH
70+
FB33 # HEBREW LETTER DALET WITH DAGESH
71+
FB34 # HEBREW LETTER HE WITH MAPIQ
72+
FB35 # HEBREW LETTER VAV WITH DAGESH
73+
FB36 # HEBREW LETTER ZAYIN WITH DAGESH
74+
FB38 # HEBREW LETTER TET WITH DAGESH
75+
FB39 # HEBREW LETTER YOD WITH DAGESH
76+
FB3A # HEBREW LETTER FINAL KAF WITH DAGESH
77+
FB3B # HEBREW LETTER KAF WITH DAGESH
78+
FB3C # HEBREW LETTER LAMED WITH DAGESH
79+
FB3E # HEBREW LETTER MEM WITH DAGESH
80+
FB40 # HEBREW LETTER NUN WITH DAGESH
81+
FB41 # HEBREW LETTER SAMEKH WITH DAGESH
82+
FB43 # HEBREW LETTER FINAL PE WITH DAGESH
83+
FB44 # HEBREW LETTER PE WITH DAGESH
84+
FB46 # HEBREW LETTER TSADI WITH DAGESH
85+
FB47 # HEBREW LETTER QOF WITH DAGESH
86+
FB48 # HEBREW LETTER RESH WITH DAGESH
87+
FB49 # HEBREW LETTER SHIN WITH DAGESH
88+
FB4A # HEBREW LETTER TAV WITH DAGESH
89+
FB4B # HEBREW LETTER VAV WITH HOLAM
90+
FB4C # HEBREW LETTER BET WITH RAFE
91+
FB4D # HEBREW LETTER KAF WITH RAFE
92+
FB4E # HEBREW LETTER PE WITH RAFE
93+
94+
# Total code points: 67
95+
96+
# ================================================
97+
# (2) Post Composition Version precomposed characters
98+
#
99+
# These characters cannot be derived solely from the UnicodeData.txt file
100+
# in this version of Unicode.
101+
#
102+
# Note that characters added to the standard after the
103+
# Composition Version and which have canonical decomposition mappings
104+
# are not automatically added to this list of Post Composition
105+
# Version precomposed characters.
106+
# ================================================
107+
108+
2ADC # FORKING
109+
1D15E # MUSICAL SYMBOL HALF NOTE
110+
1D15F # MUSICAL SYMBOL QUARTER NOTE
111+
1D160 # MUSICAL SYMBOL EIGHTH NOTE
112+
1D161 # MUSICAL SYMBOL SIXTEENTH NOTE
113+
1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE
114+
1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE
115+
1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
116+
1D1BB # MUSICAL SYMBOL MINIMA
117+
1D1BC # MUSICAL SYMBOL MINIMA BLACK
118+
1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE
119+
1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK
120+
1D1BF # MUSICAL SYMBOL FUSA WHITE
121+
1D1C0 # MUSICAL SYMBOL FUSA BLACK
122+
123+
# Total code points: 14
124+
125+
# ================================================
126+
# (3) Singleton Decompositions
127+
#
128+
# These characters can be derived from the UnicodeData.txt file
129+
# by including all canonically decomposable characters whose
130+
# canonical decomposition consists of a single character.
131+
#
132+
# These characters are simply quoted here for reference.
133+
# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
134+
# ================================================
135+
136+
# 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
137+
# 0343 COMBINING GREEK KORONIS
138+
# 0374 GREEK NUMERAL SIGN
139+
# 037E GREEK QUESTION MARK
140+
# 0387 GREEK ANO TELEIA
141+
# 1F71 GREEK SMALL LETTER ALPHA WITH OXIA
142+
# 1F73 GREEK SMALL LETTER EPSILON WITH OXIA
143+
# 1F75 GREEK SMALL LETTER ETA WITH OXIA
144+
# 1F77 GREEK SMALL LETTER IOTA WITH OXIA
145+
# 1F79 GREEK SMALL LETTER OMICRON WITH OXIA
146+
# 1F7B GREEK SMALL LETTER UPSILON WITH OXIA
147+
# 1F7D GREEK SMALL LETTER OMEGA WITH OXIA
148+
# 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA
149+
# 1FBE GREEK PROSGEGRAMMENI
150+
# 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA
151+
# 1FCB GREEK CAPITAL LETTER ETA WITH OXIA
152+
# 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
153+
# 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA
154+
# 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
155+
# 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA
156+
# 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
157+
# 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA
158+
# 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA
159+
# 1FFD GREEK OXIA
160+
# 2000..2001 [2] EN QUAD..EM QUAD
161+
# 2126 OHM SIGN
162+
# 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN
163+
# 2329 LEFT-POINTING ANGLE BRACKET
164+
# 232A RIGHT-POINTING ANGLE BRACKET
165+
# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
166+
# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
167+
# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
168+
# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
169+
# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
170+
# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
171+
# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
172+
# FA2A..FA6D [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D
173+
# FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
174+
# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
175+
176+
# Total code points: 1035
177+
178+
# ================================================
179+
# (4) Non-Starter Decompositions
180+
#
181+
# These characters can be derived from the UnicodeData.txt file
182+
# by including each expanding canonical decomposition
183+
# (i.e., those which canonically decompose to a sequence
184+
# of characters instead of a single character), such that:
185+
#
186+
# A. The character is not a Starter.
187+
#
188+
# OR (inclusive)
189+
#
190+
# B. The character's canonical decomposition begins
191+
# with a character that is not a Starter.
192+
#
193+
# Note that a "Starter" is any character with a zero combining class.
194+
#
195+
# These characters are simply quoted here for reference.
196+
# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
197+
# ================================================
198+
199+
# 0344 COMBINING GREEK DIALYTIKA TONOS
200+
# 0F73 TIBETAN VOWEL SIGN II
201+
# 0F75 TIBETAN VOWEL SIGN UU
202+
# 0F81 TIBETAN VOWEL SIGN REVERSED II
203+
204+
# Total code points: 4
205+
206+
# EOF

lib/elixir/unicode/unicode.ex

Lines changed: 13 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -449,24 +449,21 @@ defmodule String.Break do
449449

450450
defp add_buffer_to_acc("", acc), do: acc
451451
defp add_buffer_to_acc(buffer, acc), do: [buffer|acc]
452-
453-
# Decompose
454-
455-
def decompose(entries, map) do
456-
entries
457-
|> Enum.map(fn entry ->
458-
case map do
459-
%{^entry => match} -> decompose(match, map)
460-
%{} -> entry
461-
end
462-
end)
463-
|> IO.iodata_to_binary()
464-
end
465452
end
466453

467454
defmodule String.Normalizer do
468455
@moduledoc false
469456

457+
exclusions_path = Path.join(__DIR__, "CompositionExclusions.txt")
458+
459+
compositions = Enum.reduce File.stream!(exclusions_path), decompositions, fn
460+
<<h, _::binary>> = line, acc when h in ?0..?9 or h in ?A..?F ->
461+
[codepoint, _] = :binary.split(line, " ")
462+
Map.delete(acc, to_binary.(codepoint))
463+
_, acc ->
464+
acc
465+
end
466+
470467
# Normalize
471468

472469
def normalize(string, :nfd) when is_binary(string) do
@@ -494,9 +491,8 @@ defmodule String.Normalizer do
494491
end
495492

496493
for {binary, decomposition} <- decompositions do
497-
decomposition = String.Break.decompose(decomposition, decompositions)
498494
defp normalize_nfd(unquote(binary) <> rest, acc) do
499-
normalize_nfd(rest, acc <> unquote(decomposition))
495+
normalize_nfd(unquote(IO.iodata_to_binary(decomposition)) <> rest, acc)
500496
end
501497
end
502498

@@ -549,7 +545,7 @@ defmodule String.Normalizer do
549545
end
550546
end
551547

552-
for {composition, [_, _] = binary} <- decompositions do
548+
for {composition, [_, _] = binary} <- compositions do
553549
defp compose(unquote(IO.iodata_to_binary(binary))), do: unquote(composition)
554550
end
555551

@@ -569,7 +565,7 @@ defmodule String.Normalizer do
569565
end
570566
end
571567

572-
for {_, [_, _] = binary} <- decompositions do
568+
for {_, [_, _] = binary} <- compositions do
573569
defp composable?(unquote(IO.iodata_to_binary(binary))), do: true
574570
end
575571

0 commit comments

Comments
 (0)