Skip to content

Commit 4312203

Browse files
committed
Merge branch rm-UTF8-skips into blead
These commits change the calls to certain functions so that the functions return the number of bytes they consumed. This value is then used instead of UTF8SKIP. When everything goes right, the value is the same as UTF8SKIP, but it is better practice to use the function's return value. Not only do we avoid having to recalculate it, but the functions have already verified every byte that comprises the code point, so this value is more trustworthy than UTF8SKIP, which looks at just the first byte.
2 parents 8cfa17a + f82bfff commit 4312203

File tree

4 files changed

+33
-22
lines changed

4 files changed

+33
-22
lines changed

locale.c

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -944,9 +944,16 @@ S_get_displayable_string(pTHX_
944944
SAVEFREEPV(ret);
945945

946946
while (t < e) {
947-
UV cp = (is_utf8)
948-
? utf8_to_uv_or_die((const U8 *) t, (const U8 *) e, NULL)
949-
: * (U8 *) t;
947+
UV cp;
948+
Size_t advance;
949+
if (is_utf8) {
950+
cp = utf8_to_uv_or_die((const U8 *) t, (const U8 *) e, &advance);
951+
}
952+
else {
953+
cp = *t;
954+
advance = 1;
955+
}
956+
950957
if (isPRINT(cp)) {
951958
if (! prev_was_printable) {
952959
my_strlcat(ret, " ", size);
@@ -966,7 +973,7 @@ S_get_displayable_string(pTHX_
966973
my_strlcat(ret, form("%02" UVXf, cp), size);
967974
prev_was_printable = FALSE;
968975
}
969-
t += (is_utf8) ? UTF8SKIP(t) : 1;
976+
t += advance;
970977
first_time = FALSE;
971978
}
972979

regcomp.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8183,10 +8183,11 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
81838183
p++;
81848184
}
81858185
else {
8186+
Size_t advance;
81868187
input_text[name_len++] = utf8_to_uv_or_die((const U8 *) p,
81878188
(const U8 *) e,
8188-
NULL);
8189-
p+= UTF8SKIP(p);
8189+
&advance);
8190+
p += advance;
81908191
}
81918192

81928193
/* The declaration of 'input_text' is how long we allow a potential

regexec.c

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10461,6 +10461,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1046110461

1046210462
switch (with_t_UTF8ness(OP(p), utf8_target)) {
1046310463
SV * anyofh_list;
10464+
Size_t advance;
1046410465

1046510466
case REG_ANY_t8:
1046610467
while (scan < this_eol && hardcount < max && *scan != '\n') {
@@ -10746,9 +10747,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1074610747
&& _invlist_contains_cp(anyofh_list,
1074710748
utf8_to_uv_or_die((U8 *) scan,
1074810749
(U8 *) this_eol,
10749-
NULL)))
10750+
&advance)))
1075010751
{
10751-
scan += UTF8SKIP(scan);
10752+
scan += advance;
1075210753
hardcount++;
1075310754
}
1075410755
break;
@@ -10762,9 +10763,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1076210763
&& _invlist_contains_cp(anyofh_list,
1076310764
utf8_to_uv_or_die((U8 *) scan,
1076410765
(U8 *) this_eol,
10765-
NULL)))
10766+
&advance)))
1076610767
{
10767-
scan += UTF8SKIP(scan);
10768+
scan += advance;
1076810769
hardcount++;
1076910770
}
1077010771
break;
@@ -10792,9 +10793,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1079210793
&& _invlist_contains_cp(anyofh_list,
1079310794
utf8_to_uv_or_die((U8 *) scan,
1079410795
(U8 *) this_eol,
10795-
NULL)))
10796+
&advance)))
1079610797
{
10797-
scan += UTF8SKIP(scan);
10798+
scan += advance;
1079810799
hardcount++;
1079910800
}
1080010801
break;
@@ -10807,9 +10808,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1080710808
&& _invlist_contains_cp(anyofh_list,
1080810809
utf8_to_uv_or_die((U8 *) scan,
1080910810
(U8 *) this_eol,
10810-
NULL)))
10811+
&advance)))
1081110812
{
10812-
scan += UTF8SKIP(scan);
10813+
scan += advance;
1081310814
hardcount++;
1081410815
}
1081510816
break;
@@ -10820,10 +10821,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1082010821
&& NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
1082110822
&& withinCOUNT(utf8_to_uv_or_die((U8 *) scan,
1082210823
(U8 *) this_eol,
10823-
NULL),
10824+
&advance),
1082410825
ANYOFRbase(p), ANYOFRdelta(p)))
1082510826
{
10826-
scan += UTF8SKIP(scan);
10827+
scan += advance;
1082710828
hardcount++;
1082810829
}
1082910830
break;
@@ -10844,10 +10845,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
1084410845
&& (U8) *scan == ANYOF_FLAGS(p)
1084510846
&& withinCOUNT(utf8_to_uv_or_die((U8 *) scan,
1084610847
(U8 *) this_eol,
10847-
NULL),
10848+
&advance),
1084810849
ANYOFRbase(p), ANYOFRdelta(p)))
1084910850
{
10850-
scan += UTF8SKIP(scan);
10851+
scan += advance;
1085110852
hardcount++;
1085210853
}
1085310854
break;

toke.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2964,14 +2964,15 @@ Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
29642964
s += 2;
29652965
}
29662966
else {
2967+
Size_t advance;
29672968
if (! _invlist_contains_cp(PL_utf8_charname_begin,
29682969
utf8_to_uv_or_die((const U8 *) s,
29692970
(const U8 *) e,
2970-
NULL)))
2971+
&advance)))
29712972
{
29722973
goto bad_charname;
29732974
}
2974-
s += UTF8SKIP(s);
2975+
s += advance;
29752976
}
29762977

29772978
while (s < e) {
@@ -2992,14 +2993,15 @@ Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
29922993
s += 2;
29932994
}
29942995
else {
2996+
Size_t advance;
29952997
if (! _invlist_contains_cp(PL_utf8_charname_continue,
29962998
utf8_to_uv_or_die((const U8 *) s,
29972999
(const U8 *) e,
2998-
NULL)))
3000+
&advance)))
29993001
{
30003002
goto bad_charname;
30013003
}
3002-
s += UTF8SKIP(s);
3004+
s += advance;
30033005
}
30043006
}
30053007
}

0 commit comments

Comments
 (0)