@@ -72,7 +72,7 @@ $BK=\p{Line_Break=Mandatory_Break}
 $CB=\p{Line_Break=Contingent_Break}
 $CL=\p{Line_Break=Close_Punctuation}
 $CP=\p{Line_Break=CP}
-$CM1 =\p{Line_Break=Combining_Mark}
+$CM =\p{Line_Break=Combining_Mark}
 $CR=\p{Line_Break=Carriage_Return}
 $EX=\p{Line_Break=Exclamation}
 $GL=\p{Line_Break=Glue}
@@ -107,7 +107,6 @@ $CJ=\p{Line_Break=Conditional_Japanese_Starter}
 $RI=\p{Line_Break=Regional_Indicator}
 $EB=\p{Line_Break=E_Base}
 $EM=\p{Line_Break=E_Modifier}
-$ZWJ_O=\p{Line_Break=ZWJ}
 $ZWJ=\p{Line_Break=ZWJ}
 
 $QU_Pi=[$QU & \p{gc=Pi}]
@@ -116,10 +115,10 @@ $QU_Pf=[$QU & \p{gc=Pf}]
 $QUmPi=[$QU - \p{gc=Pi}]
 $QUmPf=[$QU - \p{gc=Pf}]
 
-$NotEastAsian = [^ \p{ea=F}\p{ea=W}\p{ea=H}]
-$NonEastAsianBA = [$BA & $NotEastAsian ]
+$EastAsian = [\p{ea=F}\p{ea=W}\p{ea=H}]
+$NonEastAsianBA = [$BA & [^$EastAsian] ]
 
-$DottedCircle = ◌
+$DottedCircle = [◌]
 $Hyphen = [\u2010]
 
 $CP30=[$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]]
@@ -135,18 +134,13 @@ $eot=(?!.)
 
 # SPECIAL EXTENSIONS
 
-$CM=[$CM1 $ZWJ]
 # LB 1 Assign a line breaking class to each code point of the input.
 # Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm.
 # NOTE: CB is ok to fall through, but must handle others here.
 ## show $AL
 $AL=[$AI $AL $SG $XX $SA]
 $NS=[$NS $CJ]
 
-# MACROS
-
-$Spec3a_=[^ $SP $BA $HY]
-
 # RULES
 
 # LB 4 Always break after hard line breaks (but never between CR and LF).
@@ -164,7 +158,7 @@ $Spec3a_=[^ $SP $BA $HY]
 # LB 8 Break before any character following a zero-width space, even if one or more spaces intervene.
 8) $ZW $SP* ÷
 # LB 8a Don't break between ZWJ and IDs (for use in Emoji ZWJ sequences)
-8.1) $ZWJ_O ×
+8.1) $ZWJ ×
 # LB 9 Do not break a combining character sequence; treat it as if it has the line breaking class
 # of the base character in all of the following rules. Treat ZWJ as if it were CM.
 9) (?<X>[^$BK $CR $LF $NL $SP $ZW]) ( $CM | $ZWJ )* → ${X}
@@ -176,7 +170,7 @@ $Spec3a_=[^ $SP $BA $HY]
 # LB 12 Do not break after NBSP and related characters.
 12) $GL ×
 # LB 12a Do not break before NBSP and related characters, except after spaces and hyphens.
-12.1) $Spec3a_ × $GL
+12.1) [^ $SP $BA $HY] × $GL
 # LB 13 Do not break before \u2018]\u2019 or \u2018!\u2019 or \u2018;\u2019 or \u2018/\u2019, even after spaces.
 13.01) × $EX
 13.02) × $CL
@@ -205,10 +199,10 @@ $Spec3a_=[^ $SP $BA $HY]
 19.01) × $QUmPi
 19.02) $QUmPf ×
 # LB 19a Unless surrounded by East Asian Characters, do not break either side of any unresolved quotation marks.
-19.10) $NotEastAsian × $QU
-19.11) × $QU ( $NotEastAsian | $eot )
-19.12) $QU × $NotEastAsian
-19.13) ( $sot | $NotEastAsian ) $QU ×
+19.10) [^$EastAsian] × $QU
+19.11) × $QU ( [^$EastAsian] | $eot )
+19.12) $QU × [^$EastAsian]
+19.13) ( $sot | [^$EastAsian] ) $QU ×
 # LB 20 Break before and after unresolved CB.
 20.01) ÷ $CB
 20.02) $CB ÷
@@ -306,27 +300,10 @@ $Any=.
 ## Expresses the negation in rule 8; can't do this with normal regex, but works with UnicodeSet, which is all we need.
 ## $NotStuff=[^$OLetter $Upper $Lower $Sep]
 ## # $ATerm and $Sterm are temporary, to match ICU until UTC decides.
-
-# WARNING: For Rule 5, now add format and extend to everything but Sep, Format, and Extend
-
-$FE=[$Format $Extend]
-$NotPreLower_=[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]
-## $NotSep_=[^ $Sep $CR $LF]
-## $FE=$Extend* $Format*
-$Sp=($Sp $FE*)
-$Lower=($Lower $FE*)
-$Upper=($Upper $FE*)
-$OLetter=($OLetter $FE*)
-$Numeric=($Numeric $FE*)
-$ATerm=($ATerm $FE*)
-$STerm=($STerm $FE*)
-$Close=($Close $FE*)
-$SContinue=($SContinue $FE*)
-
 # MACROS
 
-$ParaSep = ( $Sep | $CR | $LF)
-$SATerm = ( $STerm | $ATerm)
+$ParaSep = [ $Sep $CR $LF]
+$SATerm = [ $STerm $ATerm]
 
 # RULES
 
@@ -337,18 +314,16 @@ $SATerm = ($STerm | $ATerm)
 4) $ParaSep ÷
 ## 3.4) ( $Control | $CR | $LF ) ÷
 ## 3.5) ÷ ( $Control | $CR | $LF )
-# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
-# WARNING: Implemented as don't break before format (except after linebreaks),
-# AND add format and extend in all variables definitions that appear after this point!
-## 3.91) [^$Control | $CR | $LF] × $Extend
-5) × [$Format $Extend]
+# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.)
+# This also has the effect of: Any × (Format | Extend)
+5) (?<X>[^$ParaSep]) ( $Extend | $Format )* → ${X}
 # Do not break after full stop in certain contexts. [See note below.]
 # Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter,
 # is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase.
 # For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
 6) $ATerm × $Numeric
 7) ($Upper | $Lower) $ATerm × $Upper
-8) $ATerm $Close* $Sp* × $NotPreLower_ * $Lower
+8) $ATerm $Close* $Sp* × [^ $OLetter $Upper $Lower $ParaSep $SATerm] * $Lower
 8.1) $SATerm $Close* $Sp* × ($SContinue | $SATerm)
 # Break after sentence terminators, but include closing punctuation, trailing spaces, and any paragraph separator. [See note below.] Include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
 9) $SATerm $Close* × ( $Close | $Sp | $ParaSep )
@@ -393,38 +368,12 @@ $WSegSpace=\p{Word_Break=WSegSpace}
 
 # MACROS
 
-$AHLetter=( $ALetter | $Hebrew_Letter)
-$MidNumLetQ=( $MidNumLet | $Single_Quote)
+$AHLetter=[ $ALetter $Hebrew_Letter]
+$MidNumLetQ=[ $MidNumLet $Single_Quote]
 ## WARNING: For Rule 4: Fixes for GC, Format
 ## # Subtract Format from Control, since we don't want to break before/after
 ## $Control=[$Control-$Format]
 
-# SPECIAL EXTENSIONS
-
-# Add format and extend to everything
-$FE=[$Format $Extend $ZWJ]
-
-$NotBreak_=[^ $Newline $CR $LF ]
-## $FE= ($Extend | $Format)*
-$Katakana=($Katakana $FE*)
-$ALetter=($ALetter $FE*)
-$MidLetter=($MidLetter $FE*)
-$MidNum=($MidNum $FE*)
-$MidNumLet=($MidNumLet $FE*)
-$Numeric=($Numeric $FE*)
-$ExtendNumLet=($ExtendNumLet $FE*)
-$RI=($RI $FE*)
-$Hebrew_Letter=($Hebrew_Letter $FE*)
-$Double_Quote=($Double_Quote $FE*)
-$Single_Quote=($Single_Quote $FE*)
-## $E_Base=($E_Base $FE*)
-## $E_Modifier=($E_Modifier $FE*)
-## $ZWJ=($ZWJ $FE*) # don't do this one!
-## $Glue_After_Zwj=($Glue_After_Zwj $FE*)
-## $EBG=($EBG $FE*)
-$AHLetter=($AHLetter $FE*)
-$MidNumLetQ=($MidNumLetQ $FE*)
-
 # RULES
 
 # Break at the start and end of text, unless the text is empty.
@@ -440,11 +389,9 @@ $MidNumLetQ=($MidNumLetQ $FE*)
 ## 3.5) ÷ ( $Control | $CR | $LF )
 ## 3.9) × $Extend
 ## 3.91) [^$Control | $CR | $LF] × $Extend
-# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
-# WARNING: Implemented as don't break before format (except after linebreaks),
-# AND add format and extend in all variables definitions that appear after this point!
-## 4) × [$Format $Extend]
-4) $NotBreak_ × [$Format $Extend $ZWJ]
+# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.)
+# This also has the effect of: Any × (Format | Extend)
+4) (?<X>[^$CR $LF $Newline]) ($Extend | $Format | $ZWJ)* → ${X}
 
 # VANILLA RULES
 
0 commit comments