Skip to content

Commit 3743816

Browse files
authored
Supply the field indices and correct the types of NormalizationCorrections data (#1087)
* Supply the field indices and correct the types of NormalizationCorrections data
* A test.
* Better test and comment
* @missing
* Minimal fix to emoji_variation_sequence getSet
* spots
1 parent 86c22fd commit 3743816

File tree

7 files changed

+50
-20
lines changed

7 files changed

+50
-20
lines changed

unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -732,8 +732,7 @@ public IndexUnicodeProperties getFactory() {
732732
@Override
733733
public boolean isTrivial() {
734734
return _getRawUnicodeMap().isEmpty()
735-
|| ((_getRawUnicodeMap().stringKeys() == null
736-
|| _getRawUnicodeMap().stringKeys().isEmpty())
735+
|| (!hasStrings()
737736
&& _getRawUnicodeMap()
738737
.keySet(_getRawUnicodeMap().getValue(0))
739738
.equals(UnicodeSet.ALL_CODE_POINTS));
@@ -798,6 +797,12 @@ protected UnicodeMap<String> _getRawUnicodeMap() {
798797
return load(prop);
799798
}
800799

800+
@Override
801+
protected boolean hasStrings() {
802+
return _getRawUnicodeMap().stringKeys() != null
803+
&& !_getRawUnicodeMap().stringKeys().isEmpty();
804+
}
805+
801806
private UnicodeSet getDiffSet() {
802807
if (diffSet == null) {
803808
diffSet =
@@ -836,10 +841,7 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) {
836841
}
837842
// We only do the delta thing for code points; for strings, we need to do the lookup
838843
// directly (and clean whatever was added by walking through history).
839-
if (baseVersionProperties != null
840-
&& (result.hasStrings()
841-
|| (_getRawUnicodeMap().stringKeys() != null
842-
&& !_getRawUnicodeMap().stringKeys().isEmpty()))) {
844+
if (baseVersionProperties != null && (result.hasStrings() || hasStrings())) {
843845
result.removeAllStrings().addAll(super.getSet(matcher, new UnicodeSet()).strings());
844846
}
845847
return result;

unicodetools/src/main/java/org/unicode/props/UcdProperty.java

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,14 @@ public enum UcdProperty {
9494
null,
9595
ValueCardinality.Unordered,
9696
"cjkTraditionalVariant"),
97+
normalization_correction_corrected(
98+
PropertyType.String,
99+
DerivedPropertyStatus.UCDNonProperty,
100+
"normalization_correction_corrected"),
101+
normalization_correction_original(
102+
PropertyType.String,
103+
DerivedPropertyStatus.UCDNonProperty,
104+
"normalization_correction_original"),
97105

98106
// Miscellaneous
99107
CJK_Radical(
@@ -497,14 +505,6 @@ public enum UcdProperty {
497505
null,
498506
ValueCardinality.Unordered,
499507
"cjkZhuangNumeric"),
500-
normalization_correction_corrected(
501-
PropertyType.Miscellaneous,
502-
DerivedPropertyStatus.UCDNonProperty,
503-
"normalization_correction_corrected"),
504-
normalization_correction_original(
505-
PropertyType.Miscellaneous,
506-
DerivedPropertyStatus.UCDNonProperty,
507-
"normalization_correction_original"),
508508
normalization_correction_version(
509509
PropertyType.Miscellaneous,
510510
DerivedPropertyStatus.UCDNonProperty,

unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -470,10 +470,14 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) {
470470

471471
public static final String UNUSED = "??";
472472

473+
protected boolean hasStrings() {
474+
return false;
475+
}
476+
473477
public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) {
474478
if (result == null) result = new UnicodeSet();
475479
boolean uniformUnassigned = hasUniformUnassigned();
476-
if (isType(STRING_OR_MISC_MASK) && !isMultivalued) {
480+
if (isType(STRING_OR_MISC_MASK) && !isMultivalued && !hasStrings()) {
477481
for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned);
478482
usi.next(); ) { // int i = 0; i <= 0x10FFFF; ++i
479483
int i = usi.codepoint;

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyAliases.txt

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,9 @@ cjkTraditionalVariant ; kTraditionalVariant ; Provisional
5252

5353
Do_Not_Emit_Preferred ; Do_Not_Emit_Preferred ; UCDNonProperty
5454

55+
normalization_correction_original ; normalization_correction_original ; UCDNonProperty
56+
normalization_correction_corrected ; normalization_correction_corrected ; UCDNonProperty
57+
5558
# ================================================
5659
# Miscellaneous Properties
5760
# ================================================
@@ -65,8 +68,6 @@ NS ; Named_Sequences ; UCDNonProperty
6568
NSP ; Named_Sequences_Prov ; UCDNonProperty
6669
SV ; Standardized_Variant ; UCDNonProperty
6770

68-
normalization_correction_original ; normalization_correction_original ; UCDNonProperty
69-
normalization_correction_corrected ; normalization_correction_corrected ; UCDNonProperty
7071
normalization_correction_version ; normalization_correction_version ; UCDNonProperty
7172

7273
emoji_variation_sequence ; emoji_variation_sequence ; UCDNonProperty

unicodetools/src/main/resources/org/unicode/props/ExtraPropertyValueAliases.txt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -326,6 +326,9 @@ Do_Not_Emit_Type ; Discouraged ; Discouraged
326326
Do_Not_Emit_Type ; Preferred_Spelling ; Preferred_Spelling
327327
Do_Not_Emit_Type ; Arabic_Tashkil ; Arabic_Tashkil
328328

329+
# @missing: 0000..10FFFF; normalization_correction_original; <none>
330+
# @missing: 0000..10FFFF; normalization_correction_corrected; <none>
331+
329332
# Values from the old Provisional Indic_Matra_Category (identified with the Informative InPC in the
330333
# tools) and InSC.
331334
Indic_Positional_Category ; Invisible ; Invisible

unicodetools/src/main/resources/org/unicode/props/IndexUnicodeProperties.txt

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -328,9 +328,9 @@ NushuSources ; kReading
328328
TangutSources ; kRSTUnicode
329329
TangutSources ; kTGT_MergedSrc
330330

331-
NormalizationCorrections ; normalization_correction_original
332-
NormalizationCorrections ; normalization_correction_corrected
333-
NormalizationCorrections ; normalization_correction_version
331+
NormalizationCorrections ; normalization_correction_original ; 1
332+
NormalizationCorrections ; normalization_correction_corrected ; 2
333+
NormalizationCorrections ; normalization_correction_version ; 3
334334

335335
# Properties removed from Unihan before 5.1.
336336
# Point to a nonexistent file so that we don’t try to read them from the most recent monolithic

unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -616,6 +616,17 @@ $combiningExclusions ⊇ [$singletons & \p{dt=canonical}]
616616
$combiningExclusions ⊇ [$nonstarter & \p{dt=canonical}]
617617
$combiningExclusions ⊇ [$firstNonStarter & \p{dt=canonical}]
618618

619+
620+
\p{normalization_correction_version≠@none@} = [陋 㛼 当 𤎫 竮 䗗]
621+
# Everything is corrected now.
622+
In \p{normalization_correction_version≠@none@}, normalization_correction_corrected = Decomposition_Mapping
623+
# This all happened between 3.1 and 4.0.
624+
In \p{normalization_correction_version≠@none@}, normalization_correction_corrected = U4.0:Decomposition_Mapping
625+
In \p{normalization_correction_version≠@none@}, normalization_correction_original = U3.1:Decomposition_Mapping
626+
# The version field tells us whether the 3.2 mapping was wrong.
627+
In \p{normalization_correction_version=3.2.0}, U3.2:Decomposition_Mapping = normalization_correction_corrected
628+
In \p{normalization_correction_version=4.0.0}, U3.2:Decomposition_Mapping = normalization_correction_original
629+
619630
##########################
620631
# Other Invariant Tests, not in Stability Policies
621632
##########################
@@ -1097,6 +1108,11 @@ Let $HairComponents := [\U0001F9B0-\U0001F9B3]
10971108
# And nearly all emoji are Extended_Pictographic.
10981109
[\p{Extended_Pictographic}-\p{gc=Cn}] = [\p{Emoji}-\p{Regional_Indicator}-\p{Emoji_Modifier}-\p{Block=Basic Latin}]
10991110

1111+
# TODO(egg): We could check that the variation selectors are used consistently, but this would
1112+
# require UnicodeProperty to support strings properly.
1113+
\p{emoji_variation_sequence=text style} ⊃ [{♈︎}]
1114+
\p{emoji_variation_sequence=emoji style} ⊃ [{♈️}]
1115+
11001116
##########################
11011117
# POSIX Compatibility Properties (UTS#18)
11021118
# http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html
@@ -1321,6 +1337,10 @@ $unikemetScope ⊃ \P{kEH_Core=None}
13211337
\p{kEH_Core=Legacy} ⊂ \p{Block=Egyptian Hieroglyphs}
13221338
[\p{kEH_Core=None} & $unikemetScope] ⊂ \p{Block=Egyptian Hieroglyphs Extended-A}
13231339

1340+
# TODO(egg): We could check that the variation selectors are used consistently, but this would
1341+
# require UnicodeProperty to support strings properly.
1342+
\p{Standardized_Variant=rotated 90 degrees} ⊃ [{𓏲\uFE00}]
1343+
13241344
# InPC-InSC-gc invariants
13251345
# See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf.
13261346
\p{InPC=/(Left|Right)/} ⊆ [\p{gc=Mc}\p{gc=Lo}\p{gc=Lm}]

0 commit comments

Comments
 (0)