Fix \b{lb} bug with marks and ZWJ

khwilliamson · khwilliamson · commit 8a7ffdd16917 · 2025-09-17T09:31:20.000-06:00
A combining mark (or ZWJ) usually attaches to the preceding character.
That makes sense: an 'a' followed by an acute accent is considered a
single unit.

But marks do not attach to some classes of characters.  If you have a
space followed by an acute accent, the accent stands on its own and
doesn't hang over the space.

What Unicode says to do, then, is to pretend that the mark is actually an
alphabetic.

The implementation of \b{lb} includes a bunch of DFAs.  In several of
them, this rule wasn't implemented properly.

This commit fixes this.  When parsing backwards in the input to examine
the context, some DFAs are supposed to ignore intervening marks.  But
when the backwards parse has skipped over marks and the character it
lands on is one that marks don't attach to, it should return alphabetic
instead of that character's class.

This commit changes the code to do that.

This required changing some callers of the backwards parse routine so
that they skip over the marks themselves instead of having the routine
do it for them.

The code passed the extensive tests furnished by Unicode for 16.0.
Unicode has provided a new test file for 17.0, which contains additional
tests, and the code failed one of them.

This fix applies to 16.0 as well as 17.0.
diff --git a/regexec.c b/regexec.c
@@ -5481,8 +5481,11 @@ S_isLB(pTHX_ LB_enum before,
              * the base character of the combining sequence they are part of
              * breaks with that character.  Backup to find that that base
              * character */
-            prev = backup_one_LB_but_over_CM_ZWJ(strbeg, &prev_pos,
-                                                 utf8_target);
+            while (   isLB_CM(prev)
+                   || isLB_ZWJ(prev))
+            {
+                prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
+            }
 
             /* Here, 'prev' is the base character.  If the CM/ZWJ attaches to
              * it, then it inherits the behavior of 'prev'.  If it
@@ -5503,8 +5506,7 @@ S_isLB(pTHX_ LB_enum before,
                    || isLB_CM(prev)
                    || isLB_ZWJ(prev))
             {
-                prev = backup_one_LB_but_over_CM_ZWJ(strbeg,
-                                                      &prev_pos, utf8_target);
+                prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
             }
 
             matched = isLB_OP(prev);
@@ -5517,8 +5519,7 @@ S_isLB(pTHX_ LB_enum before,
                    || isLB_CM(prev)
                    || isLB_ZWJ(prev))
             {
-                prev = backup_one_LB_but_over_CM_ZWJ(strbeg,
-                                                      &prev_pos, utf8_target);
+                prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
             }
 
             matched = isLB_QU(prev);
@@ -5597,8 +5598,7 @@ S_isLB(pTHX_ LB_enum before,
                    || isLB_CM(prev)
                    || isLB_ZWJ(prev))
             {
-                prev = backup_one_LB_but_over_CM_ZWJ(strbeg,
-                                                      &prev_pos, utf8_target);
+                prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
             }
 
             matched = isLB_CL(prev)
@@ -5631,8 +5631,7 @@ S_isLB(pTHX_ LB_enum before,
                    || isLB_CM(prev)
                    || isLB_ZWJ(prev))
             {
-                prev = backup_one_LB_but_over_CM_ZWJ(strbeg,
-                                                     &prev_pos, utf8_target);
+                prev = backup_one_LB(strbeg, &prev_pos, utf8_target);
             }
 
             matched = isLB_B2(prev);
@@ -5780,6 +5779,7 @@ S_backup_one_LB_(pTHX_ const U8 * const strbeg,
     PERL_ARGS_ASSERT_BACKUP_ONE_LB_;
 
     LB_enum isLB_scratch;   /* Used by generated isLB_foo() macros */
+    bool skipped_combining = false;
 
     if (*curpos < strbeg) {
         return LB_EDGE;
@@ -5810,7 +5810,9 @@ S_backup_one_LB_(pTHX_ const U8 * const strbeg,
                 *curpos = (U8 *) strbeg;
                 return LB_EDGE;
             }
-        } while (skip_CM_ZWJ && (isLB_CM(lb) || isLB_ZWJ(lb)));
+        } while (   skip_CM_ZWJ
+                 && (isLB_CM(lb) || isLB_ZWJ(lb))
+                 && (skipped_combining = true));
     }
     else {
         do {
@@ -5820,7 +5822,17 @@ S_backup_one_LB_(pTHX_ const U8 * const strbeg,
             }
             (*curpos)--;
             lb = getLB_VAL_CP(*(*curpos - 1));
-        } while (skip_CM_ZWJ && (isLB_CM(lb) || isLB_ZWJ(lb)));
+        } while (   skip_CM_ZWJ
+                 && (isLB_CM(lb) || isLB_ZWJ(lb))
+                 && (skipped_combining = true));
+    }
+
+    /* Rule LB10 says that combining marks (including ZWJ) do not attach to
+     * certain preceding characters, such as SPACE.  And that in those
+     * circumstances, the mark is to be treated as if it were instead an
+     * alphabetic. */
+    if (skipped_combining && ! LB_CM_ATTACHES_TO(lb)) {
+        lb = LB_Alphabetic;
     }
 
     return lb;