Skip to content

Commit 6493d0f

Browse files
authored
Word_Break=ALetter for ¸ (#1157)
* In UCD file WordBreakProperty.txt, assign Word_Break=ALetter to U+00B8 ¸ CEDILLA.
* Regenerate UCD
* a strange invariant
1 parent e6e3468 commit 6493d0f

File tree

3 files changed

+11
-6
lines changed

3 files changed

+11
-6
lines changed

unicodetools/data/ucd/dev/auxiliary/WordBreakProperty.txt

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# WordBreakProperty-17.0.0.txt
2-
# Date: 2025-04-25, 16:57:39 GMT
2+
# Date: 2025-05-26, 17:27:16 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -692,6 +692,7 @@ FF71..FF9D ; Katakana # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
692692
0061..007A ; ALetter # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
693693
00AA ; ALetter # Lo FEMININE ORDINAL INDICATOR
694694
00B5 ; ALetter # L& MICRO SIGN
695+
00B8 ; ALetter # Sk CEDILLA
695696
00BA ; ALetter # Lo MASCULINE ORDINAL INDICATOR
696697
00C0..00D6 ; ALetter # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS
697698
00D8..00F6 ; ALetter # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS
@@ -1383,7 +1384,7 @@ FFDA..FFDC ; ALetter # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
13831384
1F150..1F169 ; ALetter # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
13841385
1F170..1F189 ; ALetter # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
13851386

1386-
# Total code points: 34001
1387+
# Total code points: 34002
13871388

13881389
# ================================================
13891390

unicodetools/src/main/java/org/unicode/text/UCD/ToolUnicodePropertySource.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1291,7 +1291,7 @@ public int getMaxWidth(boolean isShort) {
12911291
.remove(0x200D)
12921292
.remove(0x200B)
12931293
.removeAll(tags)
1294-
// 174-CXX.
1294+
// 175-C24.
12951295
.removeAll(gcb.getSet("Prepend")),
12961296
"Format");
12971297
unicodeMap.putAll(
@@ -1324,8 +1324,10 @@ public int getMaxWidth(boolean isShort) {
13241324
// Armenian punctuation marks that occur within words; see
13251325
// http://www.unicode.org/L2/L2018/18115.htm#155-C3
13261326
.addAll(new UnicodeSet("[\\u055B\\u055C\\u055E]"))
1327-
// 174-CXX.
1328-
.add(0x070F),
1327+
// 175-C24.
1328+
.add(0x070F)
1329+
// https://github.com/unicode-org/properties/issues/400.
1330+
.add(0x00B8),
13291331
"ALetter");
13301332
unicodeMap.putAll(
13311333
new UnicodeSet("[\\u00B7\\u0387\\u05F4\\u2027\\u003A\\uFE13\\uFE55\\uFF1A]"),

unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -926,6 +926,8 @@ Let $uax31table3 := [\u0027\u002D\u002E\u003A\u00B7\u058A\u05F3\u05F4\u0F0B\u200
926926

927927
Let $WBRemovals := [\u0387\u0604\u2018\u2024\u202F\u2E2F\uFE13\uFE52\uFE55\uFF07\uFF0E\uFF1A\u200c\u200d'.\:\u00AD\u00B7\u05F3\u05F4\u0600-\u0603\u06DD\u070F\u17B4\u17B5\u200E\u200F\u2019\u2027\u202A-\u202E\u2060-\u2064\u206A-\u206F\uFEFF\uFFF9-\uFFFB\u02C2-\u02C5\u02D2-\u02D6\u02DE-\u02DF\u02ED\u02EF-\u02FF\uA720-\uA721\uA789-\uA78A\uAB5B\u055B\u055C\u055E\U000110BD\U0001D173-\U0001D17A\U000E0001\U000E0020-\U000E007F\p{gc=Cf}\p{Block=Enclosed Alphanumerics}[\u02D7\u0605\u061C\u180E\u2066-\u2069\U0001BCA0-\U0001BCA3\U0001F130-\U0001F149\U0001F150-\U0001F169\U0001F170-\U0001F189\U0001F3FB-\U0001F3FF]]
928928
Let $WBRemovals13 := [\u02E5-\u02EB\u055A\u058A\uA708-\uA716\u055F]
929+
Let $WBRemoval17 := [\u00B8]
930+
929931
# 9.0 corrected \p{gc=Cf} and added 202F
930932
# 10.0 added 34 characters which were absorbed into WB=LE (see http://www.unicode.org/reports/tr29/tr29-30d2.html#ALetter)
931933
# 11.0 added 5 skin tone modifiers which were absorbed into WB=Extend
@@ -936,7 +938,7 @@ Let $WBRemovals13 := [\u02E5-\u02EB\u055A\u058A\uA708-\uA716\u055F]
936938
Let $Uax31Removals := [\-\u058A\u0F0B\u2010\u30A0\u30FB\u2E2F\u17B4-\u17B5]
937939
# 6.1.0 adjust SAMVAT, KHMER VOWEL INHERENT*
938940

939-
[\p{Alpha}\p{WB=Extend}\p{WB=FO}\p{WB=LE}\p{WB=ML}\p{WB=MB}\p{WB=EX}-$WBRemovals-$WBRemovals13] = [$gcAllLetters $gcAllMarks \p{gc=Nl}\p{gc=Pc}-$Uax31Removals]
941+
[\p{Alpha}\p{WB=Extend}\p{WB=FO}\p{WB=LE}\p{WB=ML}\p{WB=MB}\p{WB=EX}-$WBRemovals-$WBRemovals13-$WBRemoval17] = [$gcAllLetters $gcAllMarks \p{gc=Nl}\p{gc=Pc}-$Uax31Removals]
940942

941943
# Don’t break words after GCB=Prepend characters (that would be within a
942944
# grapheme cluster).

0 commit comments

Comments
 (0)