Skip to content

Commit c0e14d9

Browse files
committed
find_by_class: Use new utf8_to_uv; not utf8_to_uvchr_buf
find_by_class() is used in pattern matching. This is a subtle bug fix when the input is malformed UTF-8. We say we don't support malformed input, but this commit is a step towards better protecting against that eventuality. Prior to this commit, some patterns that use find_by_class() would exhibit different matching behavior on malformed input depending on whether utf8 warnings were enabled or not. This is because utf8_to_uvchr_buf() returns NUL if utf8 warnings are on; and the REPLACEMENT CHARACTER if they are off. If the match criteria accept one but not the other, the behavior would differ. Now, malformed input never matches a class.
1 parent 00f06a3 commit c0e14d9

File tree

1 file changed

+18
-29
lines changed

1 file changed

+18
-29
lines changed

regexec.c

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2303,6 +2303,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
23032303
* ('p8' and 'pb'. */
23042304
switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
23052305
SV * anyofh_list;
2306+
UV cp;
23062307

23072308
case ANYOFPOSIXL_t8_pb:
23082309
case ANYOFPOSIXL_t8_p8:
@@ -2398,10 +2399,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
23982399
anyofh_list = GET_ANYOFH_INVLIST(prog, c);
23992400
REXEC_FBC_UTF8_CLASS_SCAN(
24002401
( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2401-
&& _invlist_contains_cp(anyofh_list,
2402-
utf8_to_uvchr_buf((U8 *) s,
2403-
(U8 *) strend,
2404-
NULL))));
2402+
&& utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
2403+
&& _invlist_contains_cp(anyofh_list, cp)));
24052404
break;
24062405

24072406
case ANYOFHb_t8_pb:
@@ -2412,10 +2411,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24122411

24132412
anyofh_list = GET_ANYOFH_INVLIST(prog, c);
24142413
REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2415-
_invlist_contains_cp(anyofh_list,
2416-
utf8_to_uvchr_buf((U8 *) s,
2417-
(U8 *) strend,
2418-
NULL)));
2414+
( utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
2415+
&& _invlist_contains_cp(anyofh_list, cp)));
24192416
}
24202417
break;
24212418

@@ -2440,10 +2437,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24402437
( inRANGE(NATIVE_UTF8_TO_I8(*s),
24412438
LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
24422439
HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2443-
&& _invlist_contains_cp(anyofh_list,
2444-
utf8_to_uvchr_buf((U8 *) s,
2445-
(U8 *) strend,
2446-
NULL))));
2440+
&& utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
2441+
&& _invlist_contains_cp(anyofh_list, cp)));
24472442
break;
24482443

24492444
case ANYOFHs_t8_pb:
@@ -2453,10 +2448,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24532448
((struct regnode_anyofhs *) c)->string,
24542449
/* Note FLAGS is the string length in this regnode */
24552450
((struct regnode_anyofhs *) c)->string + FLAGS(c),
2456-
_invlist_contains_cp(anyofh_list,
2457-
utf8_to_uvchr_buf((U8 *) s,
2458-
(U8 *) strend,
2459-
NULL)));
2451+
( utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
2452+
&& _invlist_contains_cp(anyofh_list, cp)));
24602453
break;
24612454

24622455
case ANYOFR_tb_pb:
@@ -2469,10 +2462,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24692462
case ANYOFR_t8_p8:
24702463
REXEC_FBC_UTF8_CLASS_SCAN(
24712464
( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2472-
&& withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2473-
(U8 *) strend,
2474-
NULL),
2475-
ANYOFRbase(c), ANYOFRdelta(c))));
2465+
&& utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
2466+
&& withinCOUNT(cp, ANYOFRbase(c), ANYOFRdelta(c))));
24762467
break;
24772468

24782469
case ANYOFRb_tb_pb:
@@ -2487,10 +2478,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
24872478
U8 first_byte = FLAGS(c);
24882479

24892480
REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2490-
withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2491-
(U8 *) strend,
2492-
NULL),
2493-
ANYOFRbase(c), ANYOFRdelta(c)));
2481+
( utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
2482+
&& withinCOUNT(cp, ANYOFRbase(c), ANYOFRdelta(c))));
24942483
}
24952484
break;
24962485

@@ -3201,11 +3190,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
32013190
switch (classnum) {
32023191
default:
32033192
REXEC_FBC_UTF8_CLASS_SCAN(
3204-
to_complement ^ cBOOL(_invlist_contains_cp(
3205-
PL_XPosix_ptrs[classnum],
3206-
utf8_to_uvchr_buf((U8 *) s,
3207-
(U8 *) strend,
3208-
NULL))));
3193+
to_complement ^ cBOOL(
3194+
utf8_to_uv((U8 *) s, (U8 *) strend,
3195+
&cp, NULL)
3196+
&& _invlist_contains_cp(
3197+
PL_XPosix_ptrs[classnum], cp)));
32093198
break;
32103199

32113200
case CC_ENUM_SPACE_:

0 commit comments

Comments
 (0)