Skip to content

Commit 907b053

Browse files
committed
regrepeat: Use new utf8_to_uv_or_die; not utf8_to_uvchr_buf
This is a subtle bug fix for when the input is malformed UTF-8. We say we don't support malformed input, but this commit is a step towards better protecting against that eventuality. Prior to this commit, some patterns that use regrepeat() would exhibit different matching behavior on malformed input depending on whether utf8 warnings were enabled. This is because utf8_to_uvchr_buf() returns NUL if utf8 warnings are on, and the REPLACEMENT CHARACTER if they are off. If the match criteria accept one but not the other, the behavior would differ. Now, malformed input is treated as a runtime error, like division by 0.
1 parent ddfa1d4 commit 907b053

File tree

1 file changed

+19
-19
lines changed

1 file changed

+19
-19
lines changed

regexec.c

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10566,9 +10566,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1056610566
&& scan < this_eol
1056710567
&& NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
1056810568
&& _invlist_contains_cp(anyofh_list,
10569-
utf8_to_uvchr_buf((U8 *) scan,
10570-
(U8 *) this_eol,
10571-
NULL)))
10569+
utf8_to_uv_or_die((U8 *) scan,
10570+
(U8 *) this_eol,
10571+
NULL)))
1057210572
{
1057310573
scan += UTF8SKIP(scan);
1057410574
hardcount++;
@@ -10582,9 +10582,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1058210582
&& scan < this_eol
1058310583
&& (U8) *scan == ANYOF_FLAGS(p)
1058410584
&& _invlist_contains_cp(anyofh_list,
10585-
utf8_to_uvchr_buf((U8 *) scan,
10586-
(U8 *) this_eol,
10587-
NULL)))
10585+
utf8_to_uv_or_die((U8 *) scan,
10586+
(U8 *) this_eol,
10587+
NULL)))
1058810588
{
1058910589
scan += UTF8SKIP(scan);
1059010590
hardcount++;
@@ -10612,9 +10612,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1061210612
HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
1061310613
&& NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
1061410614
&& _invlist_contains_cp(anyofh_list,
10615-
utf8_to_uvchr_buf((U8 *) scan,
10616-
(U8 *) this_eol,
10617-
NULL)))
10615+
utf8_to_uv_or_die((U8 *) scan,
10616+
(U8 *) this_eol,
10617+
NULL)))
1061810618
{
1061910619
scan += UTF8SKIP(scan);
1062010620
hardcount++;
@@ -10627,9 +10627,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1062710627
&& scan + FLAGS(p) < this_eol
1062810628
&& memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
1062910629
&& _invlist_contains_cp(anyofh_list,
10630-
utf8_to_uvchr_buf((U8 *) scan,
10631-
(U8 *) this_eol,
10632-
NULL)))
10630+
utf8_to_uv_or_die((U8 *) scan,
10631+
(U8 *) this_eol,
10632+
NULL)))
1063310633
{
1063410634
scan += UTF8SKIP(scan);
1063510635
hardcount++;
@@ -10640,9 +10640,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1064010640
while ( hardcount < max
1064110641
&& scan < this_eol
1064210642
&& NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10643-
&& withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10644-
(U8 *) this_eol,
10645-
NULL),
10643+
&& withinCOUNT(utf8_to_uv_or_die((U8 *) scan,
10644+
(U8 *) this_eol,
10645+
NULL),
1064610646
ANYOFRbase(p), ANYOFRdelta(p)))
1064710647
{
1064810648
scan += UTF8SKIP(scan);
@@ -10664,9 +10664,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1066410664
while ( hardcount < max
1066510665
&& scan < this_eol
1066610666
&& (U8) *scan == ANYOF_FLAGS(p)
10667-
&& withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10668-
(U8 *) this_eol,
10669-
NULL),
10667+
&& withinCOUNT(utf8_to_uv_or_die((U8 *) scan,
10668+
(U8 *) this_eol,
10669+
NULL),
1067010670
ANYOFRbase(p), ANYOFRdelta(p)))
1067110671
{
1067210672
scan += UTF8SKIP(scan);
@@ -10780,7 +10780,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1078010780
while ( hardcount < max && scan < this_eol
1078110781
&& to_complement
1078210782
^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10783-
utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10783+
utf8_to_uv_or_die((U8 *) scan, (U8 *) this_eol, NULL))))
1078410784
{
1078510785
scan += UTF8SKIP(scan);
1078610786
hardcount++;

0 commit comments

Comments
 (0)