Skip to content

Commit 0108d39

Browse files
authored
fix TestUnicodeInvariants failures (#181)
1 parent c3a0ad7 commit 0108d39

File tree

9 files changed

+35
-25
lines changed

9 files changed

+35
-25
lines changed

unicodetools/data/ucd/dev/IndicPositionalCategory.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# IndicPositionalCategory-15.0.0.txt
2-
# Date: 2021-12-07, 20:42:00 GMT [KW, RP]
2+
# Date: 2021-12-10, 23:49:00 GMT [KW, RP]
33
# © 2021 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -158,6 +158,7 @@
158158
0CBE ; Right # Mc KANNADA VOWEL SIGN AA
159159
0CC1..0CC4 ; Right # Mc [4] KANNADA VOWEL SIGN U..KANNADA VOWEL SIGN VOCALIC RR
160160
0CD5..0CD6 ; Right # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
161+
0CF3 ; Right # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
161162
0D02..0D03 ; Right # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
162163
0D3E..0D40 ; Right # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II
163164
0D41..0D42 ; Right # Mn [2] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN UU
@@ -296,9 +297,9 @@ ABEC ; Right # Mc MEETEI MAYEK LUM IYEK
296297
11D93..11D94 ; Right # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU
297298
11D96 ; Right # Mc GUNJALA GONDI SIGN VISARGA
298299
11EF6 ; Right # Mc MAKASAR VOWEL SIGN O
299-
11F03 ; Right # Mn KAWI SIGN VISARGA
300+
11F03 ; Right # Mc KAWI SIGN VISARGA
300301
11F34..11F35 ; Right # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
301-
11F41 ; Right # Mn KAWI SIGN KILLER
302+
11F41 ; Right # Mc KAWI SIGN KILLER
302303

303304
# Indic_Positional_Category=Left
304305

unicodetools/data/ucd/dev/IndicSyllabicCategory.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# IndicSyllabicCategory-15.0.0.txt
2-
# Date: 2021-12-02, 20:43:00 GMT [KW, RP]
2+
# Date: 2021-12-09, 23:43:00 GMT [KW, RP]
33
# © 2021 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -84,6 +84,7 @@
8484
0C80 ; Bindu # Lo KANNADA SIGN SPACING CANDRABINDU
8585
0C81 ; Bindu # Mn KANNADA SIGN CANDRABINDU
8686
0C82 ; Bindu # Mc KANNADA SIGN ANUSVARA
87+
0CF3 ; Bindu # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
8788
0D00..0D01 ; Bindu # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
8889
0D02 ; Bindu # Mc MALAYALAM SIGN ANUSVARA
8990
0D04 ; Bindu # Lo MALAYALAM LETTER VEDIC ANUSVARA

unicodetools/data/ucd/dev/LineBreak.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3117,7 +3117,7 @@ FFFD;AI # So REPLACEMENT CHARACTER
31173117
11F40;SA # Mn KAWI VOWEL SIGN EU
31183118
11F41;SA # Mc KAWI SIGN KILLER
31193119
11F42;SA # Mn KAWI CONJOINER
3120-
11F43..11F4F;SA # Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
3120+
11F43..11F4F;BA # Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
31213121
11F50..11F59;NU # Nd [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
31223122
11FB0;AL # Lo LISU LETTER YHA
31233123
11FC0..11FD4;AL # No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH

unicodetools/data/ucd/dev/UnicodeData.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2975,7 +2975,7 @@
29752975
0CEF;KANNADA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
29762976
0CF1;KANNADA SIGN JIHVAMULIYA;Lo;0;L;;;;;N;;;;;
29772977
0CF2;KANNADA SIGN UPADHMANIYA;Lo;0;L;;;;;N;;;;;
2978-
0CF3;KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT;Mc;0;NSM;;;;;N;;;;;
2978+
0CF3;KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT;Mc;0;L;;;;;N;;;;;
29792979
0D00;MALAYALAM SIGN COMBINING ANUSVARA ABOVE;Mn;0;NSM;;;;;N;;;;;
29802980
0D01;MALAYALAM SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
29812981
0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;

unicodetools/data/ucd/dev/auxiliary/GraphemeBreakProperty.txt

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# GraphemeBreakProperty-15.0.0.txt
2-
# Date: 2021-12-09, 17:39:42 GMT
2+
# Date: 2021-12-11, 03:22:55 GMT
33
# © 2021 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -623,16 +623,13 @@ ABEC ; SpacingMark # Mc MEETEI MAYEK LUM IYEK
623623
11D93..11D94 ; SpacingMark # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU
624624
11D96 ; SpacingMark # Mc GUNJALA GONDI SIGN VISARGA
625625
11EF5..11EF6 ; SpacingMark # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
626-
11F03 ; SpacingMark # Mc KAWI SIGN VISARGA
627-
11F34..11F35 ; SpacingMark # Mc [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
628626
11F3E..11F3F ; SpacingMark # Mc [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
629-
11F41 ; SpacingMark # Mc KAWI SIGN KILLER
630627
16F51..16F87 ; SpacingMark # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
631628
16FF0..16FF1 ; SpacingMark # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
632629
1D166 ; SpacingMark # Mc MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
633630
1D16D ; SpacingMark # Mc MUSICAL SYMBOL COMBINING AUGMENTATION DOT
634631

635-
# Total code points: 395
632+
# Total code points: 391
636633

637634
# ================================================
638635

unicodetools/data/ucd/dev/extracted/DerivedBidiClass.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# DerivedBidiClass-15.0.0.txt
2-
# Date: 2021-12-09, 22:21:24 GMT
2+
# Date: 2021-12-11, 03:22:45 GMT
33
# © 2021 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -219,6 +219,7 @@
219219
0CE0..0CE1 ; L # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
220220
0CE6..0CEF ; L # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
221221
0CF1..0CF2 ; L # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
222+
0CF3 ; L # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
222223
0D02..0D03 ; L # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
223224
0D04..0D0C ; L # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
224225
0D0E..0D10 ; L # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
@@ -1101,7 +1102,7 @@ F0000..FFFFD ; L # Co [65534] <private-use-F0000>..<private-use-FFFFD>
11011102
100000..10FFFD; L # Co [65534] <private-use-100000>..<private-use-10FFFD>
11021103

11031104
# The above property value applies to 821129 code points not listed here.
1104-
# Total code points: 1096275
1105+
# Total code points: 1096276
11051106

11061107
# ================================================
11071108

@@ -2057,7 +2058,6 @@ FFFFE..FFFFF ; BN # Cn [2] <noncharacter-FFFFE>..<noncharacter-FFFFF>
20572058
0CBC ; NSM # Mn KANNADA SIGN NUKTA
20582059
0CCC..0CCD ; NSM # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
20592060
0CE2..0CE3 ; NSM # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
2060-
0CF3 ; NSM # Mc KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
20612061
0D00..0D01 ; NSM # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
20622062
0D3B..0D3C ; NSM # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
20632063
0D41..0D44 ; NSM # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
@@ -2328,7 +2328,7 @@ FE20..FE2F ; NSM # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC
23282328
1E944..1E94A ; NSM # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
23292329
E0100..E01EF ; NSM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
23302330

2331-
# Total code points: 1977
2331+
# Total code points: 1976
23322332

23332333
# ================================================
23342334

unicodetools/data/ucd/dev/extracted/DerivedLineBreak.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# DerivedLineBreak-15.0.0.txt
2-
# Date: 2021-12-09, 17:39:39 GMT
2+
# Date: 2021-12-11, 03:22:52 GMT
33
# © 2021 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use, see https://www.unicode.org/terms_of_use.html
@@ -2471,6 +2471,7 @@ ABEB ; BA # Po MEETEI MAYEK CHEIKHEI
24712471
11A9A..11A9C ; BA # Po [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD
24722472
11AA1..11AA2 ; BA # Po [2] SOYOMBO TERMINAL MARK-1..SOYOMBO TERMINAL MARK-2
24732473
11C41..11C45 ; BA # Po [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2
2474+
11F43..11F4F ; BA # Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
24742475
11FFF ; BA # Po TAMIL PUNCTUATION END OF TEXT
24752476
12470..12474 ; BA # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
24762477
16A6E..16A6F ; BA # Po [2] MRO DANDA..MRO DOUBLE DANDA
@@ -2481,7 +2482,7 @@ ABEB ; BA # Po MEETEI MAYEK CHEIKHEI
24812482
1BC9F ; BA # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
24822483
1DA87..1DA8A ; BA # Po [4] SIGNWRITING COMMA..SIGNWRITING COLON
24832484

2484-
# Total code points: 247
2485+
# Total code points: 260
24852486

24862487
# ================================================
24872488

@@ -2661,9 +2662,8 @@ AADE..AADF ; SA # Po [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI
26612662
11F40 ; SA # Mn KAWI VOWEL SIGN EU
26622663
11F41 ; SA # Mc KAWI SIGN KILLER
26632664
11F42 ; SA # Mn KAWI CONJOINER
2664-
11F43..11F4F ; SA # Po [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
26652665

2666-
# Total code points: 834
2666+
# Total code points: 821
26672667

26682668
# ================================================
26692669

unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,9 @@ public int getMaxWidth(boolean isShort) {
811811
// unicodeMap.setErrorOnReset(true);
812812

813813
// https://www.unicode.org/reports/tr29/#SpacingMark
814+
// Ken: The *postbase* gc=Mc, InPC=Right characters (for these scripts with lb=SA)
815+
// should be excluded from the set GCB=SpacingMark, and instead should fall through to GCB=XX.
816+
// Keep these exceptions in sync with the table in UAX #29.
814817
UnicodeSet gcbSpacingMarkSet =
815818
cat.getSet("Spacing_Mark")
816819
// any of the following (which have General_Category = Other_Letter):
@@ -825,7 +828,8 @@ public int getMaxWidth(boolean isShort) {
825828
// They may have been gc=Spacing_Mark in an earlier version.
826829
"\u19B0-\u19B4\u19B8\u19B9\u19BB-\u19C0\u19C8\u19C9" + // New Tai Lue
827830
"\u1A61\u1A63\u1A64" + // Tai Tham
828-
"\uAA7B\uAA7D]")) // Myanmar
831+
"\uAA7B\uAA7D" + // Myanmar
832+
"\\U00011F03\\U00011F34\\U00011F35\\U00011F41]")) // Kawi Unicode 15
829833
.removeAll(unicodeMap.keySet("Extend"));
830834
if (compositeVersion >= (14 << 16)) {
831835
gcbSpacingMarkSet.remove(0x11720).remove(0x11721); // AHOM VOWEL SIGN A & AA

unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,11 @@ In \P{U-1:GC=Cn} name=U-1:name
176176

177177
# Red Flag: cased and case_ignorable should be disjoint
178178

179-
Let $caseOverlap = [\u02B0-\u02B8\u02C0\u02C1\u02E0-\u02E4\u0345\u037A\u1D2C-\u1D61\u1D78\u1D9B-\u1DBF\u2090-\u2094\u2C7D\uA770[\u1D62-\u1D6A\u2071\u207F\u2095-\u209C\u2C7C\uA7F8\uA7F9 \uA69C-\uA69D\uAB5C-\uAB5F]\U00010780\U00010783-\U00010785\U00010787-\U000107B0\U000107B2-\U000107BA]
179+
Let $caseOverlap = [\u02B0-\u02B8\u02C0\u02C1\u02E0-\u02E4\u0345\u037A\u1D2C-\u1D61\u1D78\u1D9B-\u1DBF\u2090-\u2094\u2C7D\uA770[\u1D62-\u1D6A\u2071\u207F\u2095-\u209C\u2C7C\uA7F8\uA7F9 \uA69C-\uA69D\uAB5C-\uAB5F]\U00010780\U00010783-\U00010785\U00010787-\U000107B0\U000107B2-\U000107BA\U0001E030-\U0001E06C]
180180
# 6.1.0, added 22 overlap characters
181181
# 7.0 added \uA69C-\uA69D\uAB5C-\uAB5F
182182
# 14.0 new modifier letters in Latin Extended-F are Lowercase (U+107xx)
183+
# 15.0 new modifier letters in Cyrillic Extended-D are Lowercase (U+1E030..1E06C)
183184
\p{cased} ∥ [\p{caseignorable} - $caseOverlap]
184185

185186
##########################
@@ -479,13 +480,14 @@ Let $QUInclusions = [\u275F-\u2760 \U0001F676-\U0001F678 \u0022 \u0027 \u275B-\u
479480
# covered by adding them to the exception set $SAScriptExceptions for the test.
480481

481482
# SA are limited to certain scripts:
482-
Let $SAScripts = [\p{script=ahom} \p{script=thai} \p{script=lao} \p{script=myanmar} \p{script=khmer} \p{script=Tai_Le} \p{script=New_Tai_Lue} \p{script=Tai_Tham} \p{script=Tai_Viet}]
483+
Let $SAScripts = [\p{script=ahom} \p{script=thai} \p{script=lao} \p{script=myanmar} \p{script=khmer} \p{script=Tai_Le} \p{script=New_Tai_Lue} \p{script=Tai_Tham} \p{script=Tai_Viet} \p{script=Kawi}]
483484
$SAScripts ⊇ \p{LineBreak=SA}
484485

485486
# And in $SA scripts, they are all the alphabetic spacing characters, plus some odd Cf & Mn, plus the NEW TAI LUE THAM DIGIT ONE
486-
Let $SAScriptExceptions = [\x{1173A}\x{1173B}\x{1173F} \u19DA\u109E\u109F\u19DE\u19DF\u1AA0-\u1AA6\u1AA8-\u1AAD\uAA77-\uAA79\uAADE-\uAADF]
487-
#7.0 Added AA7D
488-
#12.0 Removed Myanmar spacing marks which were assigned Other_Alphabetic: 1063..1064, 1069..106D, 1087..108C, 108F, 109A..109B, AA7B, AA7D
487+
Let $SAScriptExceptions = [\x{1173A}\x{1173B}\x{1173F} \u19DA\u109E\u109F\u19DE\u19DF\u1AA0-\u1AA6\u1AA8-\u1AAD\uAA77-\uAA79\uAADE-\uAADF \U00011F41]
488+
# 7.0 Added AA7D
489+
# 12.0 Removed Myanmar spacing marks which were assigned Other_Alphabetic: 1063..1064, 1069..106D, 1087..108C, 108F, 109A..109B, AA7B, AA7D
490+
# 15.0 Add U+11F41 KAWI SIGN KILLER (gc=Mc, lb=SA)
489491

490492
[$SAScripts & [\p{Alphabetic} \p{gc=cf} \p{gc=Mn} $SAScriptExceptions]] = [$SAScripts & [\p{LineBreak=SA} \p{LineBreak=CM}]]
491493

@@ -563,6 +565,11 @@ Let $EgyptianControls = [\U00013430-\U00013438]
563565

564566
# Post-base spacing combining marks of most SE Asian scripts are explicitly excluded from GCB=SpacingMark
565567
# See https://www.unicode.org/reports/tr29/#SpacingMark
568+
# Ken: The *postbase* gc=Mc, InPC=Right characters (for these scripts with lb=SA)
569+
# should be excluded from the set GCB=SpacingMark, and instead should fall through to GCB=XX.
570+
# That is, when this test case fails, then
571+
# - change ToolUnicodePropertySource.java to remove the offending characters from the gcbSpacingMarkSet
572+
# - add them to the exceptions list in UAX #29
566573
Let $PostBaseSpacingMarks_All = [[[:lb=SA:]-[:sc=Khmr:]] & [:gc=Mc:] & [:InPC=Right:]]
567574
Let $PostBaseSpacingMarks_Tweak = [\u103B \u1056 \u1057 \u1A57 \u1A6D]
568575
Let $PostBaseSpacingMarks_Missed = []

0 commit comments

Comments
 (0)