Skip to content

Commit 8a7ffdd

Browse files
committed
Fix \b{lb} bug with marks and ZWJ
A combining mark (or ZWJ) usually attaches to the preceding character. That makes sense: an 'a' followed by an acute accent is considered a single unit. But marks do not attach to some classes of characters. If you have a space followed by an acute accent, the accent stands on its own and doesn't hang over the space. What Unicode says to do in that case is to pretend that the mark is actually an alphabetic. The implementation of \b{lb} includes a bunch of DFAs, and several of them didn't implement this properly. This commit fixes that. When parsing backwards in the input to examine the context, some DFAs are supposed to ignore intervening marks. But when the backwards parse reaches the end of the run of marks and the character found there is one that marks don't attach to, it should return alphabetic instead of that character. This commit changes the backwards parse routine to do that. That in turn required some callers of the routine to change so that they handle the marks themselves. The code passed the extensive tests furnished by Unicode for 16.0. Unicode has provided a new test file for 17.0 containing new tests, and the code failed one of them. This fix applies to 16.0 as well as 17.0.
1 parent c88a52f commit 8a7ffdd

File tree

1 file changed

+24
-12
lines changed

1 file changed

+24
-12
lines changed

regexec.c

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5481,8 +5481,11 @@ S_isLB(pTHX_ LB_enum before,
54815481
* the base character of the combining sequence they are part of
54825482
* breaks with that character. Back up to find that base
54835483
* character */
5484-
prev = backup_one_LB_but_over_CM_ZWJ(strbeg, &prev_pos,
5485-
utf8_target);
5484+
while ( isLB_CM(prev)
5485+
|| isLB_ZWJ(prev))
5486+
{
5487+
prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
5488+
}
54865489

54875490
/* Here, 'prev' is the base character. If the CM/ZWJ attaches to
54885491
* it, then it inherits the behavior of 'prev'. If it
@@ -5503,8 +5506,7 @@ S_isLB(pTHX_ LB_enum before,
55035506
|| isLB_CM(prev)
55045507
|| isLB_ZWJ(prev))
55055508
{
5506-
prev = backup_one_LB_but_over_CM_ZWJ(strbeg,
5507-
&prev_pos, utf8_target);
5509+
prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
55085510
}
55095511

55105512
matched = isLB_OP(prev);
@@ -5517,8 +5519,7 @@ S_isLB(pTHX_ LB_enum before,
55175519
|| isLB_CM(prev)
55185520
|| isLB_ZWJ(prev))
55195521
{
5520-
prev = backup_one_LB_but_over_CM_ZWJ(strbeg,
5521-
&prev_pos, utf8_target);
5522+
prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
55225523
}
55235524

55245525
matched = isLB_QU(prev);
@@ -5597,8 +5598,7 @@ S_isLB(pTHX_ LB_enum before,
55975598
|| isLB_CM(prev)
55985599
|| isLB_ZWJ(prev))
55995600
{
5600-
prev = backup_one_LB_but_over_CM_ZWJ(strbeg,
5601-
&prev_pos, utf8_target);
5601+
prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
56025602
}
56035603

56045604
matched = isLB_CL(prev)
@@ -5631,8 +5631,7 @@ S_isLB(pTHX_ LB_enum before,
56315631
|| isLB_CM(prev)
56325632
|| isLB_ZWJ(prev))
56335633
{
5634-
prev = backup_one_LB_but_over_CM_ZWJ(strbeg,
5635-
&prev_pos, utf8_target);
5634+
prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
56365635
}
56375636

56385637
matched = isLB_B2(prev);
@@ -5780,6 +5779,7 @@ S_backup_one_LB_(pTHX_ const U8 * const strbeg,
57805779
PERL_ARGS_ASSERT_BACKUP_ONE_LB_;
57815780

57825781
LB_enum isLB_scratch; /* Used by generated isLB_foo() macros */
5782+
bool skipped_combining = false;
57835783

57845784
if (*curpos < strbeg) {
57855785
return LB_EDGE;
@@ -5810,7 +5810,9 @@ S_backup_one_LB_(pTHX_ const U8 * const strbeg,
58105810
*curpos = (U8 *) strbeg;
58115811
return LB_EDGE;
58125812
}
5813-
} while (skip_CM_ZWJ && (isLB_CM(lb) || isLB_ZWJ(lb)));
5813+
} while ( skip_CM_ZWJ
5814+
&& (isLB_CM(lb) || isLB_ZWJ(lb))
5815+
&& (skipped_combining = true));
58145816
}
58155817
else {
58165818
do {
@@ -5820,7 +5822,17 @@ S_backup_one_LB_(pTHX_ const U8 * const strbeg,
58205822
}
58215823
(*curpos)--;
58225824
lb = getLB_VAL_CP(*(*curpos - 1));
5823-
} while (skip_CM_ZWJ && (isLB_CM(lb) || isLB_ZWJ(lb)));
5825+
} while ( skip_CM_ZWJ
5826+
&& (isLB_CM(lb) || isLB_ZWJ(lb))
5827+
&& (skipped_combining = true));
5828+
}
5829+
5830+
/* Rule LB10 says that combining marks (including ZWJ) do not attach to
5831+
* certain preceding characters, such as SPACE, and that in those
5832+
* circumstances, the mark is to be treated as if it were instead an
5833+
* alphabetic. */
5834+
if (skipped_combining && ! LB_CM_ATTACHES_TO(lb)) {
5835+
lb = LB_Alphabetic;
58245836
}
58255837

58265838
return lb;

0 commit comments

Comments
 (0)