Skip to content

Commit 3182a20

Browse files
authored
Add non-UCD properties to IUP (#1139)
* RGI_Emoji_Qualification
* GenerateEnums
* component
* GenerateEnums
* non-fully-qualified
* GenerateEnums
* Tweak comment
* IDNA2008_Category
* GenerateEnums
* Look for the file in the right place
* A monstrous test.
* RGI_Emoji
* GenerateEnums
* Special handling
* Remove redundant JSP unversioned properties
* spots
* mixed
* redundant alias
* @#$%@#$% break;
* Another one
* new names
1 parent daed3b5 commit 3182a20

File tree

12 files changed

+256
-59
lines changed

12 files changed

+256
-59
lines changed

UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java

Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ public UnicodeProperty getProperty(String propertyAlias) {
8686
}
8787

8888
add(new IDNA2003());
89-
add(new UTS46());
90-
add(new IDNA2008());
9189
add(new IDNA2008c());
9290
// add(new Usage());
9391
add(new HanType());
@@ -196,37 +194,6 @@ public String transform(String source) {
196194
addExamplarProperty(LocaleData.ES_STANDARD, "exem", "exemplar");
197195
addExamplarProperty(LocaleData.ES_AUXILIARY, "exema", "exemplar_aux");
198196
addExamplarProperty(LocaleData.ES_PUNCTUATION, "exemp", "exemplar_punct");
199-
200-
UnicodeSet Basic_Emoji =
201-
getProperty("Basic_Emoji").getSet("Yes", null); // TODO: was .getTrueSet();
202-
UnicodeSet Emoji_Keycap_Sequence =
203-
getProperty("RGI_Emoji_Keycap_Sequence")
204-
.getSet("Yes", null); // TODO: was .getTrueSet();
205-
UnicodeSet RGI_Emoji_Modifier_Sequence =
206-
getProperty("RGI_Emoji_Modifier_Sequence")
207-
.getSet("Yes", null); // TODO: was .getTrueSet();
208-
UnicodeSet RGI_Emoji_Tag_Sequence =
209-
getProperty("RGI_Emoji_Tag_Sequence")
210-
.getSet("Yes", null); // TODO: was .getTrueSet();
211-
UnicodeSet RGI_Emoji_Flag_Sequence =
212-
getProperty("RGI_Emoji_Flag_Sequence")
213-
.getSet("Yes", null); // TODO: was .getTrueSet();
214-
UnicodeSet RGI_Emoji_Zwj_Sequence =
215-
getProperty("RGI_Emoji_Zwj_Sequence")
216-
.getSet("Yes", null); // TODO: was .getTrueSet();
217-
UnicodeSet RGI_Emoji =
218-
new UnicodeSet()
219-
.add(Basic_Emoji)
220-
.add(Emoji_Keycap_Sequence)
221-
.add(RGI_Emoji_Modifier_Sequence)
222-
.add(RGI_Emoji_Flag_Sequence)
223-
.add(RGI_Emoji_Tag_Sequence)
224-
.add(RGI_Emoji_Zwj_Sequence)
225-
.freeze();
226-
add(
227-
new UnicodeSetProperty()
228-
.set(RGI_Emoji)
229-
.setMain("RGI_Emoji", "RGI_Emoji", UnicodeProperty.BINARY, "13.0"));
230197
}
231198

232199
private void addExamplarProperty(
@@ -534,17 +501,6 @@ protected String _getValue(int codepoint) {
534501
}
535502
}
536503

537-
private static class IDNA2008 extends XEnumUnicodeProperty {
538-
public IDNA2008() {
539-
super("idna2008", Idna2008.Idna2008Type.values());
540-
}
541-
542-
@Override
543-
protected String _getValue(int codepoint) {
544-
return Idna2008.getTypeMapping().get(codepoint).toString();
545-
}
546-
}
547-
548504
private static class IDNA2008c extends XEnumUnicodeProperty {
549505
public IDNA2008c() {
550506
super("idna2008c", IdnaType.values());

UnicodeJsps/src/test/java/org/unicode/jsptest/TestJsp.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,8 +1120,8 @@ public void TestBnfGen() {
11201120
public void TestSimpleSet() {
11211121
checkUnicodeSetParseContains("[a-z\u00e4\u03b1]", "\\p{idna2003=valid}");
11221122
checkUnicodeSetParseContains("[a-z\u00e4\u03b1]", "\\p{idna=valid}");
1123-
checkUnicodeSetParseContains("[a-z\u00e4\u03b1]", "\\p{uts46=valid}");
1124-
checkUnicodeSetParseContains("[a-z\u00e4\u03b1]", "\\p{idna2008=PVALID}");
1123+
checkUnicodeSetParseContains("[a-z\u00e4\u03b1]", "\\p{Idn_Status=valid}");
1124+
checkUnicodeSetParseContains("[a-z\u00e4\u03b1]", "\\p{IDNA2008_Category=PVALID}");
11251125
checkUnicodeSetParse("[\\u1234\\uABCD-\\uAC00]", "U+1234 U+ABCD-U+AC00");
11261126
checkUnicodeSetParse("[\\u1234\\uABCD-\\uAC00]", "U+1234 U+ABCD..U+AC00");
11271127
}

unicodetools/src/main/java/org/unicode/props/PropertyParsingInfo.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,6 +1492,13 @@ private static void parseFields(
14921492
}
14931493
}
14941494
}
1495+
if (propInfo.property == UcdProperty.RGI_Emoji) {
1496+
if (value.equals("fully-qualified") || value.equals("component")) {
1497+
value = "Yes";
1498+
} else {
1499+
value = "No";
1500+
}
1501+
}
14951502
propInfo.put(
14961503
data,
14971504
line.getMissingSet(),

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.unicode.props.UcdPropertyValues.General_Category_Values;
1616
import org.unicode.props.UcdPropertyValues.Grapheme_Cluster_Break_Values;
1717
import org.unicode.props.UcdPropertyValues.Hangul_Syllable_Type_Values;
18+
import org.unicode.props.UcdPropertyValues.IDNA2008_Category_Values;
1819
import org.unicode.props.UcdPropertyValues.Identifier_Status_Values;
1920
import org.unicode.props.UcdPropertyValues.Identifier_Type_Values;
2021
import org.unicode.props.UcdPropertyValues.Idn_2008_Values;
@@ -31,6 +32,7 @@
3132
import org.unicode.props.UcdPropertyValues.NFKD_Quick_Check_Values;
3233
import org.unicode.props.UcdPropertyValues.Numeric_Type_Values;
3334
import org.unicode.props.UcdPropertyValues.Other_Joining_Type_Values;
35+
import org.unicode.props.UcdPropertyValues.RGI_Emoji_Qualification_Values;
3436
import org.unicode.props.UcdPropertyValues.Script_Values;
3537
import org.unicode.props.UcdPropertyValues.Sentence_Break_Values;
3638
import org.unicode.props.UcdPropertyValues.Vertical_Orientation_Values;
@@ -620,6 +622,12 @@ public enum UcdProperty {
620622
Hangul_Syllable_Type_Values.class,
621623
null,
622624
"hst"),
625+
IDNA2008_Category(
626+
PropertyType.Enumerated,
627+
DerivedPropertyStatus.NonUCDProperty,
628+
IDNA2008_Category_Values.class,
629+
null,
630+
"IDNA2008_Category"),
623631
Identifier_Status(
624632
PropertyType.Enumerated,
625633
DerivedPropertyStatus.NonUCDProperty,
@@ -716,6 +724,12 @@ public enum UcdProperty {
716724
Other_Joining_Type_Values.class,
717725
null,
718726
"Other_Joining_Type"),
727+
RGI_Emoji_Qualification(
728+
PropertyType.Enumerated,
729+
DerivedPropertyStatus.NonUCDProperty,
730+
RGI_Emoji_Qualification_Values.class,
731+
null,
732+
"RGI_Emoji_Qualification"),
719733
Sentence_Break(
720734
PropertyType.Enumerated,
721735
DerivedPropertyStatus.Approved,
@@ -852,6 +866,12 @@ public enum UcdProperty {
852866
PropertyType.Binary, DerivedPropertyStatus.Approved, Binary.class, null, "PCM"),
853867
Quotation_Mark(
854868
PropertyType.Binary, DerivedPropertyStatus.Approved, Binary.class, null, "QMark"),
869+
RGI_Emoji(
870+
PropertyType.Binary,
871+
DerivedPropertyStatus.NonUCDProperty,
872+
Binary.class,
873+
null,
874+
"RGI_Emoji"),
855875
RGI_Emoji_Flag_Sequence(
856876
PropertyType.Binary,
857877
DerivedPropertyStatus.NonUCDProperty,

unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,6 +1147,38 @@ public static Idn_Status_Values forName(String name) {
11471147
}
11481148
}
11491149

1150+
/**
 * Values of the enumerated, non-UCD property {@code IDNA2008_Category}.
 *
 * <p>Each constant is declared under its long name and passes its short name (the token
 * {@code PVALID}, {@code CONTEXTJ}, ...) to the constructor. The value-alias data file touched
 * by the same change states that the long names follow the text of RFC 5892 and that the
 * {@code @missing} default for this property is {@code Unassigned}.
 *
 * <p>NOTE(review): the commit message lists several GenerateEnums runs, so this enum is
 * presumably generated from the property value alias files - confirm before editing by hand.
 */
public enum IDNA2008_Category_Values implements Named {
1151+
Protocol_Valid("PVALID"),
1152+
Contextual_Rule_Required_Join_Controls("CONTEXTJ"),
1153+
Contextual_Rule_Required_Other("CONTEXTO"),
1154+
Disallowed("DISALLOWED"),
1155+
Unassigned("UNASSIGNED");
1156+
private final PropertyNames<IDNA2008_Category_Values> names;
1157+
1158+
// Builds the PropertyNames record for this constant from its short name and any extra aliases.
private IDNA2008_Category_Values(String shortName, String... otherNames) {
1159+
names =
1160+
new PropertyNames<IDNA2008_Category_Values>(
1161+
IDNA2008_Category_Values.class, this, shortName, otherNames);
1162+
}
1163+
1164+
@Override
1165+
public PropertyNames<IDNA2008_Category_Values> getNames() {
1166+
return names;
1167+
}
1168+
1169+
@Override
1170+
public String getShortName() {
1171+
return names.getShortName();
1172+
}
1173+
1174+
private static final NameMatcher<IDNA2008_Category_Values> NAME_MATCHER =
1175+
PropertyNames.getNameToEnums(IDNA2008_Category_Values.class);
1176+
1177+
// Looks the name up through NAME_MATCHER; the result for an unknown name depends on
// NameMatcher.get, which is not visible here - TODO confirm.
public static IDNA2008_Category_Values forName(String name) {
1178+
return NAME_MATCHER.get(name);
1179+
}
1180+
}
1181+
11501182
public enum Indic_Conjunct_Break_Values implements Named {
11511183
Consonant("Consonant"),
11521184
Extend("Extend"),
@@ -1984,6 +2016,39 @@ public static Other_Joining_Type_Values forName(String name) {
19842016
}
19852017
}
19862018

2019+
/**
 * Values of the enumerated, non-UCD property {@code RGI_Emoji_Qualification}.
 *
 * <p>Each constant is declared under its long name and passes its short name to the
 * constructor. Per the value-alias data file touched by the same change, {@code None} is the
 * {@code @missing} default and the {@code FQE}/{@code MQE}/{@code UQE} aliases come from
 * UTS #51.
 *
 * <p>NOTE(review): the commit message lists several GenerateEnums runs, so this enum is
 * presumably generated from the property value alias files - confirm before editing by hand.
 */
public enum RGI_Emoji_Qualification_Values implements Named {
2020+
None("None"),
2021+
Fully_Qualified("FQE"),
2022+
Minimally_Qualified("MQE"),
2023+
Unqualified("UQE"),
2024+
// Short name "component" matches the "component" field value in the associated data file
// (see the [181-C56] note in ExtraPropertyValueAliases.txt).
Standalone_Component("component"),
2025+
// Per ExtraPropertyValueAliases.txt: value used in 11.0 and earlier, refined in 12.0.
Non_Fully_Qualified("Non_Fully_Qualified");
2026+
private final PropertyNames<RGI_Emoji_Qualification_Values> names;
2027+
2028+
// Builds the PropertyNames record for this constant from its short name and any extra aliases.
private RGI_Emoji_Qualification_Values(String shortName, String... otherNames) {
2029+
names =
2030+
new PropertyNames<RGI_Emoji_Qualification_Values>(
2031+
RGI_Emoji_Qualification_Values.class, this, shortName, otherNames);
2032+
}
2033+
2034+
@Override
2035+
public PropertyNames<RGI_Emoji_Qualification_Values> getNames() {
2036+
return names;
2037+
}
2038+
2039+
@Override
2040+
public String getShortName() {
2041+
return names.getShortName();
2042+
}
2043+
2044+
private static final NameMatcher<RGI_Emoji_Qualification_Values> NAME_MATCHER =
2045+
PropertyNames.getNameToEnums(RGI_Emoji_Qualification_Values.class);
2046+
2047+
// Looks the name up through NAME_MATCHER; the result for an unknown name depends on
// NameMatcher.get, which is not visible here - TODO confirm.
public static RGI_Emoji_Qualification_Values forName(String name) {
2048+
return NAME_MATCHER.get(name);
2049+
}
2050+
}
2051+
19872052
public enum Script_Values implements Named {
19882053
Adlam("Adlm"),
19892054
Caucasian_Albanian("Aghb"),

unicodetools/src/main/java/org/unicode/text/utility/Utility.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1506,13 +1506,25 @@ public static String getMostRecentUnicodeDataFile(
15061506
return null;
15071507
}
15081508
}
1509-
Path path = Settings.UnicodeTools.getDataPath(base, element);
1510-
if (path != null) {
1511-
var filePath = path.resolve(parts[2] + fileType);
1512-
if (filePath.toFile().exists()) {
1513-
result = filePath.toString();
1509+
if (parts[2].equals("Idna2008")
1510+
&& currentVersion.compareTo(VersionInfo.UNICODE_16_0) <= 0) {
1511+
Path path = Settings.UnicodeTools.getDataPath(base, "idna2008derived");
1512+
if (path != null) {
1513+
var filePath = path.resolve(parts[2] + "-" + element + fileType);
1514+
if (filePath.toFile().exists()) {
1515+
result = filePath.toString();
1516+
}
1517+
break;
1518+
}
1519+
} else {
1520+
Path path = Settings.UnicodeTools.getDataPath(base, element);
1521+
if (path != null) {
1522+
var filePath = path.resolve(parts[2] + fileType);
1523+
if (filePath.toFile().exists()) {
1524+
result = filePath.toString();
1525+
}
1526+
break;
15141527
}
1515-
break;
15161528
}
15171529
continue;
15181530
}

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ REFS ; RGI_Emoji_Flag_Sequence ; Emoji_Flag_Sequence ; NonUCDProperty
88
REKS ; RGI_Emoji_Keycap_Sequence ; Emoji_Keycap_Sequence ; Emoji_Combining_Sequence ; NonUCDProperty
99
RETS ; RGI_Emoji_Tag_Sequence ; Emoji_Tag_Sequence ; NonUCDProperty
1010
REZS ; RGI_Emoji_Zwj_Sequence ; Emoji_Zwj_Sequence ; NonUCDProperty
11-
# TODO(egg): Add this.
12-
# RE ; RGI_Emoji ; NonUCDProperty
11+
RGI_Emoji ; RGI_Emoji ; NonUCDProperty
1312

1413
# ================================================
1514
# Enumerated Properties
@@ -21,8 +20,7 @@ ID_Type ; Identifier_Type ; NonUCDProperty
2120
idns ; Idn_Status ; NonUCDNonProperty
2221
idn8 ; Idn_2008 ; NonUCDNonProperty
2322

24-
# TODO(egg): Add this.
25-
# IDNA2008_Category ; IDNA2008_Category ; NonUCDProperty
23+
IDNA2008_Category ; IDNA2008_Category ; NonUCDProperty
2624

2725
# Unofficial contributory property used in the derivation of Joining_Type.
2826
Other_Joining_Type ; Other_Joining_Type ; UCDNonProperty
@@ -31,8 +29,7 @@ Do_Not_Emit_Type ; Do_Not_Emit_Type ; UCDNonProperty
3129

3230
kEH_Core ; kEH_Core ; Provisional
3331

34-
# TODO(egg): Add this.
35-
# RGI_Emoji_Qualification ; RGI_Emoji_Qualification ; NonUCDProperty
32+
RGI_Emoji_Qualification ; RGI_Emoji_Qualification ; NonUCDProperty
3633

3734
# ================================================
3835
# String Properties

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
# @missing: 0000..10FFFF; RGI_Emoji_Modifier_Sequence ; No
7878
# @missing: 0000..10FFFF; RGI_Emoji_Tag_Sequence ; No
7979
# @missing: 0000..10FFFF; RGI_Emoji_Zwj_Sequence ; No
80+
# @missing: 0000..10FFFF; RGI_Emoji ; No
8081

8182
# @missing: 0000..10FFFF; Emoji ; No
8283
# @missing: 0000..10FFFF; Emoji_Presentation ; No
@@ -258,6 +259,14 @@ idn8 ; NV8 ; NV8
258259
idn8 ; XV8 ; XV8
259260
idn8 ; na ; na
260261

262+
# @missing: 0000..10FFFF; IDNA2008_Category ; Unassigned
263+
# Long names after the text of https://www.rfc-editor.org/rfc/rfc5892.txt.
264+
IDNA2008_Category ; PVALID ; Protocol_Valid
265+
IDNA2008_Category ; CONTEXTJ ; Contextual_Rule_Required_Join_Controls
266+
IDNA2008_Category ; CONTEXTO ; Contextual_Rule_Required_Other
267+
IDNA2008_Category ; DISALLOWED ; Disallowed
268+
IDNA2008_Category ; UNASSIGNED ; Unassigned
269+
261270
# @missing: 0000..10FFFF; Idn_Mapping ; <code point>
262271

263272
# @missing: 0000..10FFFF; Identifier_Status ; r
@@ -352,3 +361,17 @@ kEH_Core ; N ; None
352361
# @missing: 0000..10FFFF; kEH_AltSeq ; <none>
353362

354363
# @missing: 0000..10FFFF; Names_List_Cross_Ref ; <none>
364+
365+
# @missing: 0000..10FFFF; RGI_Emoji_Qualification ; None
366+
RGI_Emoji_Qualification ; None ; None
367+
# Value aliases from UTS #51.
368+
RGI_Emoji_Qualification ; FQE ; Fully_Qualified
369+
RGI_Emoji_Qualification ; MQE ; Minimally_Qualified
370+
RGI_Emoji_Qualification ; UQE ; Unqualified
371+
# [181-C56] Consensus: In UTS #51 ED-28, add a new property value with long name
372+
# "Standalone_Component" and short name "component" corresponding to the
373+
# "component" field value in the associated data file. For Unicode Version 17.0.
374+
# See L2/24-224 item 9.1.
375+
RGI_Emoji_Qualification ; component ; Standalone_Component
376+
# Value used in 11.0 and earlier, refined in 12.0.
377+
RGI_Emoji_Qualification ; Non_Fully_Qualified ; Non_Fully_Qualified

unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,8 @@ idna/*/IdnaMappingTable; Idn_Status;
485485
idna/*/IdnaMappingTable; Idn_Mapping; 2
486486
idna/*/IdnaMappingTable; Idn_2008; 3
487487

488+
idna/*/Idna2008; IDNA2008_Category ; 1
489+
488490
security/*/IdentifierStatus; Identifier_Status;
489491
security/*/IdentifierType; Identifier_Type
490492

@@ -526,7 +528,15 @@ emoji/*/emoji-sequences; RGI_Emoji_Keycap_Sequence
526528
emoji/*/emoji-sequences; RGI_Emoji_Tag_Sequence
527529
emoji/*/emoji-zwj-sequences; RGI_Emoji_Zwj_Sequence
528530

529-
#emoji/*/emoji-test ; Emoji_Short_Name
531+
emoji/*/emoji-test ; RGI_Emoji_Qualification ; 1
532+
# This binary property is defined by a derivation, but not provided directly in
533+
# data files. The derivation depends on multiple files, which is highly
534+
# inconvenient for the index. Instead we use a simpler alternate derivation and
535+
# verify in the invariant tests that they are equivalent.
536+
# The simpler derivation is
537+
# \p{RGI_Emoji} := [\p{RGI_Emoji_Qualification=Fully_Qualified}
538+
# \p{RGI_Emoji_Qualification=Standalone_Component}].
539+
emoji/*/emoji-test ; RGI_Emoji ; 1
530540

531541
FileType ; Unikemet ; PropertyValue
532542
Unikemet ; kEH_Cat

0 commit comments

Comments
 (0)