Skip to content

Commit d1bdcb8

Browse files
authored
SegmenterDefault.txt: more remapping, less renaming (#970)
* Use remap rules for word and sentence too
* No CM1 or ZWJ_O
* Regenerate UCD
* ^ rather than a variable called Not, UnicodeSet unions rather than |
* Regenerate UCD
1 parent c4731c8 commit d1bdcb8

File tree

7 files changed

+23074
-22610
lines changed

7 files changed

+23074
-22610
lines changed

unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html

Lines changed: 3308 additions & 3307 deletions
Large diffs are not rendered by default.

unicodetools/data/ucd/dev/auxiliary/LineBreakTest.txt

Lines changed: 17190 additions & 16674 deletions
Large diffs are not rendered by default.

unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.html

Lines changed: 405 additions & 405 deletions
Large diffs are not rendered by default.

unicodetools/data/ucd/dev/auxiliary/SentenceBreakTest.txt

Lines changed: 449 additions & 449 deletions
Large diffs are not rendered by default.

unicodetools/data/ucd/dev/auxiliary/WordBreakTest.html

Lines changed: 111 additions & 111 deletions
Large diffs are not rendered by default.

unicodetools/data/ucd/dev/auxiliary/WordBreakTest.txt

Lines changed: 1590 additions & 1590 deletions
Large diffs are not rendered by default.

unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt

Lines changed: 21 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ $BK=\p{Line_Break=Mandatory_Break}
7272
$CB=\p{Line_Break=Contingent_Break}
7373
$CL=\p{Line_Break=Close_Punctuation}
7474
$CP=\p{Line_Break=CP}
75-
$CM1=\p{Line_Break=Combining_Mark}
75+
$CM=\p{Line_Break=Combining_Mark}
7676
$CR=\p{Line_Break=Carriage_Return}
7777
$EX=\p{Line_Break=Exclamation}
7878
$GL=\p{Line_Break=Glue}
@@ -107,7 +107,6 @@ $CJ=\p{Line_Break=Conditional_Japanese_Starter}
107107
$RI=\p{Line_Break=Regional_Indicator}
108108
$EB=\p{Line_Break=E_Base}
109109
$EM=\p{Line_Break=E_Modifier}
110-
$ZWJ_O=\p{Line_Break=ZWJ}
111110
$ZWJ=\p{Line_Break=ZWJ}
112111

113112
$QU_Pi=[$QU & \p{gc=Pi}]
@@ -116,10 +115,10 @@ $QU_Pf=[$QU & \p{gc=Pf}]
116115
$QUmPi=[$QU - \p{gc=Pi}]
117116
$QUmPf=[$QU - \p{gc=Pf}]
118117

119-
$NotEastAsian = [^\p{ea=F}\p{ea=W}\p{ea=H}]
120-
$NonEastAsianBA = [$BA & $NotEastAsian]
118+
$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}]
119+
$NonEastAsianBA = [$BA & [^$EastAsian]]
121120

122-
$DottedCircle =
121+
$DottedCircle = [◌]
123122
$Hyphen = [\u2010]
124123

125124
$CP30=[$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]]
@@ -135,18 +134,13 @@ $eot=(?!.)
135134

136135
# SPECIAL EXTENSIONS
137136

138-
$CM=[$CM1 $ZWJ]
139137
# LB 1 Assign a line breaking class to each code point of the input.
140138
# Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm.
141139
# NOTE: CB is ok to fall through, but must handle others here.
142140
## show $AL
143141
$AL=[$AI $AL $SG $XX $SA]
144142
$NS=[$NS $CJ]
145143

146-
# MACROS
147-
148-
$Spec3a_=[^ $SP $BA $HY]
149-
150144
# RULES
151145

152146
# LB 4 Always break after hard line breaks (but never between CR and LF).
@@ -164,7 +158,7 @@ $Spec3a_=[^ $SP $BA $HY]
164158
# LB 8 Break before any character following a zero-width space, even if one or more spaces intervene.
165159
8) $ZW $SP* ÷
166160
# LB 8a Don't break between ZWJ and IDs (for use in Emoji ZWJ sequences)
167-
8.1) $ZWJ_O ×
161+
8.1) $ZWJ ×
168162
# LB 9 Do not break a combining character sequence; treat it as if it has the line breaking class
169163
# of the base character in all of the following rules. Treat ZWJ as if it were CM.
170164
9) (?<X>[^$BK $CR $LF $NL $SP $ZW]) ( $CM | $ZWJ )* → ${X}
@@ -176,7 +170,7 @@ $Spec3a_=[^ $SP $BA $HY]
176170
# LB 12 Do not break after NBSP and related characters.
177171
12) $GL ×
178172
# LB 12a Do not break before NBSP and related characters, except after spaces and hyphens.
179-
12.1) $Spec3a_ × $GL
173+
12.1) [^ $SP $BA $HY] × $GL
180174
# LB 13 Do not break before \u2018]\u2019 or \u2018!\u2019 or \u2018;\u2019 or \u2018/\u2019, even after spaces.
181175
13.01) × $EX
182176
13.02) × $CL
@@ -205,10 +199,10 @@ $Spec3a_=[^ $SP $BA $HY]
205199
19.01) × $QUmPi
206200
19.02) $QUmPf ×
207201
# LB 19a Unless surrounded by East Asian Characters, do not break either side of any unresolved quotation marks.
208-
19.10) $NotEastAsian × $QU
209-
19.11) × $QU ( $NotEastAsian | $eot )
210-
19.12) $QU × $NotEastAsian
211-
19.13) ( $sot | $NotEastAsian ) $QU ×
202+
19.10) [^$EastAsian] × $QU
203+
19.11) × $QU ( [^$EastAsian] | $eot )
204+
19.12) $QU × [^$EastAsian]
205+
19.13) ( $sot | [^$EastAsian] ) $QU ×
212206
# LB 20 Break before and after unresolved CB.
213207
20.01) ÷ $CB
214208
20.02) $CB ÷
@@ -306,27 +300,10 @@ $Any=.
306300
## Expresses the negation in rule 8; can't do this with normal regex, but works with UnicodeSet, which is all we need.
307301
## $NotStuff=[^$OLetter $Upper $Lower $Sep]
308302
## # $ATerm and $Sterm are temporary, to match ICU until UTC decides.
309-
310-
# WARNING: For Rule 5, now add format and extend to everything but Sep, Format, and Extend
311-
312-
$FE=[$Format $Extend]
313-
$NotPreLower_=[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]
314-
## $NotSep_=[^ $Sep $CR $LF]
315-
## $FE=$Extend* $Format*
316-
$Sp=($Sp $FE*)
317-
$Lower=($Lower $FE*)
318-
$Upper=($Upper $FE*)
319-
$OLetter=($OLetter $FE*)
320-
$Numeric=($Numeric $FE*)
321-
$ATerm=($ATerm $FE*)
322-
$STerm=($STerm $FE*)
323-
$Close=($Close $FE*)
324-
$SContinue=($SContinue $FE*)
325-
326303
# MACROS
327304

328-
$ParaSep = ($Sep | $CR | $LF)
329-
$SATerm = ($STerm | $ATerm)
305+
$ParaSep = [$Sep $CR $LF]
306+
$SATerm = [$STerm $ATerm]
330307

331308
# RULES
332309

@@ -337,18 +314,16 @@ $SATerm = ($STerm | $ATerm)
337314
4) $ParaSep ÷
338315
## 3.4) ( $Control | $CR | $LF ) ÷
339316
## 3.5) ÷ ( $Control | $CR | $LF )
340-
# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
341-
# WARNING: Implemented as don't break before format (except after linebreaks),
342-
# AND add format and extend in all variables definitions that appear after this point!
343-
## 3.91) [^$Control | $CR | $LF] × $Extend
344-
5) × [$Format $Extend]
317+
# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.)
318+
# This also has the effect of: Any × (Format | Extend)
319+
5) (?<X>[^$ParaSep]) ( $Extend | $Format )* → ${X}
345320
# Do not break after full stop in certain contexts. [See note below.]
346321
# Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter,
347322
# is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase.
348323
# For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
349324
6) $ATerm × $Numeric
350325
7) ($Upper | $Lower) $ATerm × $Upper
351-
8) $ATerm $Close* $Sp* × $NotPreLower_* $Lower
326+
8) $ATerm $Close* $Sp* × [^ $OLetter $Upper $Lower $ParaSep $SATerm]* $Lower
352327
8.1) $SATerm $Close* $Sp* × ($SContinue | $SATerm)
353328
# Break after sentence terminators, but include closing punctuation, trailing spaces, and any paragraph separator. [See note below.] Include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
354329
9) $SATerm $Close* × ( $Close | $Sp | $ParaSep )
@@ -393,38 +368,12 @@ $WSegSpace=\p{Word_Break=WSegSpace}
393368

394369
# MACROS
395370

396-
$AHLetter=($ALetter | $Hebrew_Letter)
397-
$MidNumLetQ=($MidNumLet | $Single_Quote)
371+
$AHLetter=[$ALetter $Hebrew_Letter]
372+
$MidNumLetQ=[$MidNumLet $Single_Quote]
398373
## WARNING: For Rule 4: Fixes for GC, Format
399374
## # Subtract Format from Control, since we don't want to break before/after
400375
## $Control=[$Control-$Format]
401376

402-
# SPECIAL EXTENSIONS
403-
404-
# Add format and extend to everything
405-
$FE=[$Format $Extend $ZWJ]
406-
407-
$NotBreak_=[^ $Newline $CR $LF ]
408-
## $FE= ($Extend | $Format)*
409-
$Katakana=($Katakana $FE*)
410-
$ALetter=($ALetter $FE*)
411-
$MidLetter=($MidLetter $FE*)
412-
$MidNum=($MidNum $FE*)
413-
$MidNumLet=($MidNumLet $FE*)
414-
$Numeric=($Numeric $FE*)
415-
$ExtendNumLet=($ExtendNumLet $FE*)
416-
$RI=($RI $FE*)
417-
$Hebrew_Letter=($Hebrew_Letter $FE*)
418-
$Double_Quote=($Double_Quote $FE*)
419-
$Single_Quote=($Single_Quote $FE*)
420-
## $E_Base=($E_Base $FE*)
421-
## $E_Modifier=($E_Modifier $FE*)
422-
## $ZWJ=($ZWJ $FE*) # don't do this one!
423-
## $Glue_After_Zwj=($Glue_After_Zwj $FE*)
424-
## $EBG=($EBG $FE*)
425-
$AHLetter=($AHLetter $FE*)
426-
$MidNumLetQ=($MidNumLetQ $FE*)
427-
428377
# RULES
429378

430379
# Break at the start and end of text, unless the text is empty.
@@ -440,11 +389,9 @@ $MidNumLetQ=($MidNumLetQ $FE*)
440389
## 3.5) ÷ ( $Control | $CR | $LF )
441390
## 3.9) × $Extend
442391
## 3.91) [^$Control | $CR | $LF] × $Extend
443-
# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
444-
# WARNING: Implemented as don't break before format (except after linebreaks),
445-
# AND add format and extend in all variables definitions that appear after this point!
446-
## 4) × [$Format $Extend]
447-
4) $NotBreak_ × [$Format $Extend $ZWJ]
392+
# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.)
393+
# This also has the effect of: Any × (Format | Extend)
394+
4) (?<X>[^$CR $LF $Newline]) ($Extend | $Format | $ZWJ)* → ${X}
448395

449396
# VANILLA RULES
450397

0 commit comments

Comments
 (0)