Skip to content

Commit fdc9c95

Browse files
authored
Fix the IndexUnicodeProperties Joining_Type (#657)
1 parent bf38a00 commit fdc9c95

File tree

7 files changed

+64
-32
lines changed

7 files changed

+64
-32
lines changed

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@
2929
import org.unicode.props.UcdPropertyValues.NFKC_Quick_Check_Values;
3030
import org.unicode.props.UcdPropertyValues.NFKD_Quick_Check_Values;
3131
import org.unicode.props.UcdPropertyValues.Numeric_Type_Values;
32+
import org.unicode.props.UcdPropertyValues.Other_Joining_Type_Values;
3233
import org.unicode.props.UcdPropertyValues.Script_Values;
3334
import org.unicode.props.UcdPropertyValues.Sentence_Break_Values;
3435
import org.unicode.props.UcdPropertyValues.Vertical_Orientation_Values;
@@ -241,6 +242,8 @@ public enum UcdProperty {
241242
NFKC_Quick_Check(PropertyType.Enumerated, NFKC_Quick_Check_Values.class, null, "NFKC_QC"),
242243
NFKD_Quick_Check(PropertyType.Enumerated, NFKD_Quick_Check_Values.class, null, "NFKD_QC"),
243244
Numeric_Type(PropertyType.Enumerated, Numeric_Type_Values.class, null, "nt"),
245+
Other_Joining_Type(
246+
PropertyType.Enumerated, Other_Joining_Type_Values.class, null, "Other_Joining_Type"),
244247
Sentence_Break(PropertyType.Enumerated, Sentence_Break_Values.class, null, "SB"),
245248
Vertical_Orientation(PropertyType.Enumerated, Vertical_Orientation_Values.class, null, "vo"),
246249
Word_Break(PropertyType.Enumerated, Word_Break_Values.class, null, "WB"),

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1708,6 +1708,40 @@ public static Numeric_Type_Values forName(String name) {
17081708
}
17091709

17101710
// Numeric_Value
1711+
public enum Other_Joining_Type_Values implements Named {
1712+
Join_Causing("C"),
1713+
Dual_Joining("D"),
1714+
Left_Joining("L"),
1715+
Right_Joining("R"),
1716+
Transparent("T"),
1717+
Non_Joining("U"),
1718+
Deduce_From_General_Category("Deduce_From_General_Category");
1719+
private final PropertyNames<Other_Joining_Type_Values> names;
1720+
1721+
private Other_Joining_Type_Values(String shortName, String... otherNames) {
1722+
names =
1723+
new PropertyNames<Other_Joining_Type_Values>(
1724+
Other_Joining_Type_Values.class, this, shortName, otherNames);
1725+
}
1726+
1727+
@Override
1728+
public PropertyNames<Other_Joining_Type_Values> getNames() {
1729+
return names;
1730+
}
1731+
1732+
@Override
1733+
public String getShortName() {
1734+
return names.getShortName();
1735+
}
1736+
1737+
private static final NameMatcher<Other_Joining_Type_Values> NAME_MATCHER =
1738+
PropertyNames.getNameToEnums(Other_Joining_Type_Values.class);
1739+
1740+
public static Other_Joining_Type_Values forName(String name) {
1741+
return NAME_MATCHER.get(name);
1742+
}
1743+
}
1744+
17111745
public enum Script_Values implements Named {
17121746
Adlam("Adlm"),
17131747
Caucasian_Albanian("Aghb"),

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,9 @@ idtype ; Identifier_Type
2727
idns ; Idn_Status
2828
idn8 ; Idn_2008
2929

30+
# Unofficial contributory property used in the derivation of Joining_Type.
31+
Other_Joining_Type ; Other_Joining_Type
32+
3033
# ================================================
3134
# String Properties
3235
# ================================================

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,6 @@
9696
# @missing: 0000..10FFFF; kTraditionalVariant ; <none>
9797

9898
# @missing: 0000..10FFFF; Joining_Group ; No_Joining_Group
99-
# @missing: 0000..10FFFF; Joining_Type ; Non_Joining
10099

101100
# Overrides for bugs
102101

@@ -124,7 +123,6 @@ idn8 ; na ; na
124123

125124
# @missing: 0000..10FFFF; Idn_Mapping ; <code point>
126125

127-
128126
# @missing: 0000..10FFFF; Identifier_Status ; r
129127

130128
idstatus ; r ; Restricted
@@ -162,3 +160,12 @@ sc ; Zxxx ; Unwritten
162160

163161
# TODO: there is no Unicode 13.1, see https://github.com/unicode-org/unicodetools/issues/100
164162
age; 13.1 ; V13_1
163+
164+
# @missing: 0000..10FFFF; Other_Joining_Type ; Deduce_From_General_Category
165+
Other_Joining_Type ; C ; Join_Causing
166+
Other_Joining_Type ; D ; Dual_Joining
167+
Other_Joining_Type ; L ; Left_Joining
168+
Other_Joining_Type ; R ; Right_Joining
169+
Other_Joining_Type ; T ; Transparent
170+
Other_Joining_Type ; U ; Non_Joining
171+
Other_Joining_Type ; Deduce_From_General_Category ; Deduce_From_General_Category

unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -92,7 +92,10 @@ UnicodeData; Simple_Lowercase_Mapping ; 13
9292
UnicodeData; Simple_Titlecase_Mapping ; 14
9393
UnicodeData; Unicode_1_Name ; 10
9494
UnicodeData; ISO_Comment ; 11
95-
ArabicShaping; Joining_Type; 2
95+
# Handle the complex default of ArabicShaping.txt by introducing an unofficial
96+
# contributory property, to be used when deriving Joining_Type.
97+
ArabicShaping; Other_Joining_Type; 2
98+
DerivedJoiningType; Joining_Type; 1
9699
ArabicShaping; Joining_Group; 3
97100
BidiMirroring; Bidi_Mirroring_Glyph;
98101
Blocks ; Block

unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -546,6 +546,17 @@ Let $nonAlphabeticDependentVowels = [\N{ORIYA SIGN OVERLINE}\N{THAI CHARACTER MA
546546
Let $nonDiacriticNuktas = [\u1BE6\U00010A38\U00010A39\U00010A3A\U0001133B]
547547
[\p{InSc=Nukta} - \p{Diacritic}] = $nonDiacriticNuktas
548548

549+
## Joining_Type and Joining_Group
550+
# Where defined, the Joining_Group refines the Joining_Type.
551+
EquivalencesOf \P{Joining_Group=No_Joining_Group} Joining_Group ⇒ Joining_Type
552+
\p{gc=Mn} ⊆ \p{Joining_Type=Transparent}
553+
\p{gc=Me} ⊆ \p{Joining_Type=Transparent}
554+
555+
# Derivation of Joining_Type from the second column of ArabicShaping.txt (unofficially Other_Joining_Type).
556+
In \P{Other_Joining_Type=Deduce_From_General_Category} Joining_Type = Other_Joining_Type
557+
[ \p{Other_Joining_Type=Deduce_From_General_Category} & [\p{gc=Mn}\p{gc=Me}\p{gc=Cf}] ] ⊆ \p{Joining_Type=Transparent}
558+
[ \p{Other_Joining_Type=Deduce_From_General_Category} - [\p{gc=Mn}\p{gc=Me}\p{gc=Cf}] ] ⊆ \p{Joining_Type=Non_Joining}
559+
549560
##########################
550561
# LineBreak property
551562
##########################

unicodetools/src/test/java/org/unicode/propstest/TestProperties.java

Lines changed: 0 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -214,35 +214,6 @@ public void TestAAScripts() {
214214
}
215215
}
216216

217-
@Test
218-
public void TestJoiningGroupConsistency() {
219-
// TODO(egg): I would like to be able to put that in the invariants tests as « the partition
220-
// defined by Joining_Group is finer than that defined by Joining_Type ».
221-
UnicodeMap<String> joiningGroup = iup.load(UcdProperty.Joining_Group);
222-
UnicodeMap<String> joiningType = iup.load(UcdProperty.Joining_Type);
223-
var charactersByJoiningGroup = new HashMap<String, UnicodeSet>();
224-
joiningGroup.addInverseTo(charactersByJoiningGroup).remove("No_Joining_Group");
225-
charactersByJoiningGroup.forEach(
226-
(group, set) -> {
227-
final int first = set.getRangeStart(0);
228-
final String firstType = joiningType.get(first);
229-
set.forEach(
230-
(c) -> {
231-
assertEquals(
232-
"U+"
233-
+ getCodeAndName(Character.toString(first))
234-
+ "\nand\nU+"
235-
+ getCodeAndName(c)
236-
+ "\nhave different joining types but are in the"
237-
+ " same joining group ("
238-
+ group
239-
+ ")\n",
240-
firstType,
241-
joiningType.get(c));
242-
});
243-
});
244-
}
245-
246217
@Test
247218
public void TestScripts() {
248219

0 commit comments

Comments
 (0)