Skip to content

Commit 0187f3d

Browse files
committed
Add utf8_to_uv_msgs()
This is the first of several functions named in the style of utf8_to_uv(), which are designed to be used instead of the problematic current ones named like utf8_to_uvchr(). The existing ones basically throw away crucial information in their return values upon failure, creating hassles for the caller. With them, it is hard to recover from malformed input and continue parsing, which is what modern UTF-8 handlers have settled on doing. Originally I planned to replace just the most problematic one, utf8_to_uvchr_buf(), but I realized that each level threw away information, so it would be better to start with the base-level function, which utf8_to_uvchr_buf() eventually calls with a bunch of 0 parameters. The previous functions all had to disambiguate failure returns. This stops that at the root. The functions in the new series all return a boolean indicating whether they succeeded, with a consistent API throughout. The old series had one outlier, again utf8_to_uvchr_buf(), which had a different calling convention and different return values. The basic logic in the base-level function, which this commit handles, was sound. It just failed to return relevant information upon failure. The new API has somewhat different formal parameter names and uses Size_t instead of STRLEN for one of the parameters. It also takes the end-of-string position instead of a length. A length is problematic because, in cases where it would go negative, it instead becomes a huge positive number. The old base function now merely calls the new one, and throws away the relevant information, as it always has.
1 parent ddfa240 commit 0187f3d

File tree

5 files changed

+111
-68
lines changed

5 files changed

+111
-68
lines changed

embed.fnc

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3684,13 +3684,6 @@ ATdip |UV |utf8n_to_uvchr_msgs \
36843684
|const U32 flags \
36853685
|NULLOK U32 *errors \
36863686
|NULLOK AV **msgs
3687-
CTp |UV |_utf8n_to_uvchr_msgs_helper \
3688-
|NN const U8 *s \
3689-
|STRLEN curlen \
3690-
|NULLOK STRLEN *retlen \
3691-
|const U32 flags \
3692-
|NULLOK U32 *errors \
3693-
|NULLOK AV **msgs
36943687
CDbdp |UV |utf8n_to_uvuni |NN const U8 *s \
36953688
|STRLEN curlen \
36963689
|NULLOK STRLEN *retlen \
@@ -3740,6 +3733,21 @@ AMdip |UV |utf8_to_uvchr_buf \
37403733
|NN const U8 *s \
37413734
|NN const U8 *send \
37423735
|NULLOK STRLEN *retlen
3736+
ATip |bool |utf8_to_uv_msgs|NN const U8 * const s0 \
3737+
|NN const U8 *e \
3738+
|NN UV *cp_p \
3739+
|NULLOK Size_t *advance_p \
3740+
|const U32 flags \
3741+
|NULLOK U32 *errors \
3742+
|NULLOK AV **msgs
3743+
CTp |bool |utf8_to_uv_msgs_helper_ \
3744+
|NN const U8 * const s0 \
3745+
|NN const U8 * const e \
3746+
|NN UV *cp_p \
3747+
|NULLOK Size_t *advance_p \
3748+
|const U32 flags \
3749+
|NULLOK U32 *errors \
3750+
|NULLOK AV **msgs
37433751
CDbdp |UV |utf8_to_uvuni |NN const U8 *s \
37443752
|NULLOK STRLEN *retlen
37453753
: Used in perly.y

embed.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,6 @@
125125
# define _to_utf8_lower_flags(a,b,c,d,e) Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e)
126126
# define _to_utf8_title_flags(a,b,c,d,e) Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e)
127127
# define _to_utf8_upper_flags(a,b,c,d,e) Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e)
128-
# define _utf8n_to_uvchr_msgs_helper Perl__utf8n_to_uvchr_msgs_helper
129128
# define amagic_call(a,b,c,d) Perl_amagic_call(aTHX_ a,b,c,d)
130129
# define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b)
131130
# define apply_attrs_string(a,b,c,d) Perl_apply_attrs_string(aTHX_ a,b,c,d)
@@ -863,6 +862,8 @@
863862
# define utf8_to_bytes_new_pv(a,b,c) Perl_utf8_to_bytes_new_pv(aTHX,a,b,c)
864863
# define utf8_to_bytes_overwrite(a,b) Perl_utf8_to_bytes_overwrite(aTHX,a,b)
865864
# define utf8_to_bytes_temp_pv(a,b) Perl_utf8_to_bytes_temp_pv(aTHX,a,b)
865+
# define utf8_to_uv_msgs Perl_utf8_to_uv_msgs
866+
# define utf8_to_uv_msgs_helper_ Perl_utf8_to_uv_msgs_helper_
866867
# define utf8n_to_uvchr Perl_utf8n_to_uvchr
867868
# define utf8n_to_uvchr_error Perl_utf8n_to_uvchr_error
868869
# define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs

inline.h

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3048,21 +3048,22 @@ Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
30483048
|| is_utf8_valid_partial_char_flags(*ep, s + len, flags);
30493049
}
30503050

3051-
PERL_STATIC_INLINE UV
3052-
Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
3053-
STRLEN curlen,
3054-
STRLEN *retlen,
3055-
const U32 flags,
3056-
U32 * errors,
3057-
AV ** msgs)
3051+
PERL_STATIC_INLINE bool
3052+
Perl_utf8_to_uv_msgs(const U8 * const s0,
3053+
const U8 * const e,
3054+
UV * cp_p,
3055+
Size_t *advance_p,
3056+
const U32 flags,
3057+
U32 * errors,
3058+
AV ** msgs)
30583059
{
3059-
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
3060+
PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS;
30603061

3061-
/* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
3062-
* simple cases, and, if necessary calls a helper function to deal with the
3063-
* more complex ones. Almost all well-formed non-problematic code points
3064-
* are considered simple, so that it's unlikely that the helper function
3065-
* will need to be called. */
3062+
/* This is the inlined portion of utf8_to_uv_msgs. It handles the simple
3063+
* cases, and, if necessary calls a helper function to deal with the more
3064+
* complex ones. Almost all well-formed non-problematic code points are
3065+
* considered simple, so that it's unlikely that the helper function will
3066+
* need to be called. */
30663067

30673068
/* Assume that isn't malformed; the vast majority of calls won't be */
30683069
if (errors) {
@@ -3075,25 +3076,25 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
30753076

30763077
/* No calls from core pass in an empty string; non-core need a check */
30773078
#ifdef PERL_CORE
3078-
assert(curlen > 0);
3079+
assert(e > s0);
30793080
#else
3080-
if (LIKELY(curlen > 0))
3081+
if (LIKELY(e > s0))
30813082
#endif
30823083

30833084
{
30843085
/* UTF-8 invariants are returned unchanged. The code below is quite
30853086
* capable of handling this, but this shortcuts this very common case
30863087
* */
30873088
if (UTF8_IS_INVARIANT(*s0)) {
3088-
if (retlen) {
3089-
*retlen = 1;
3089+
if (advance_p) {
3090+
*advance_p = 1;
30903091
}
30913092

3092-
return *s0;
3093+
*cp_p = *s0;
3094+
return true;
30933095
}
30943096

30953097
const U8 * s = s0;
3096-
const U8 * send = s + curlen;
30973098

30983099
/* This dfa is fast. If it accepts the input, it was for a
30993100
* well-formed, non-problematic code point, which can be returned
@@ -3116,27 +3117,52 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
31163117
PERL_UINT_FAST16_T state = PL_strict_utf8_dfa_tab[256 + type];
31173118
UV uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
31183119

3119-
while (state > 1 && ++s < send) {
3120+
while (state > 1 && ++s < e) {
31203121
type = PL_strict_utf8_dfa_tab[*s];
31213122
state = PL_strict_utf8_dfa_tab[256 + state + type];
31223123

31233124
uv = UTF8_ACCUMULATE(uv, *s);
31243125
}
31253126

31263127
if (LIKELY(state == 0)) {
3127-
if (retlen) {
3128-
*retlen = s - s0 + 1;
3128+
if (advance_p) {
3129+
*advance_p = s - s0 + 1;
31293130
}
31303131

3131-
return UNI_TO_NATIVE(uv);
3132+
*cp_p = UNI_TO_NATIVE(uv);
3133+
return true;
31323134
}
31333135
}
31343136

31353137
/* Here is potentially problematic. Use the full mechanism */
3136-
return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
3137-
errors, msgs);
3138+
return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs);
3139+
}
3140+
3141+
PERL_STATIC_INLINE UV
3142+
Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
3143+
STRLEN curlen,
3144+
STRLEN *retlen,
3145+
const U32 flags,
3146+
U32 * errors,
3147+
AV ** msgs)
3148+
{
3149+
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
3150+
3151+
UV cp;
3152+
if (LIKELY(utf8_to_uv_msgs(s0, s0 + curlen, &cp, retlen, flags, errors,
3153+
msgs)))
3154+
{
3155+
return cp;
3156+
}
3157+
3158+
if ((flags & UTF8_CHECK_ONLY) && retlen) {
3159+
*retlen = ((STRLEN) -1);
3160+
}
3161+
3162+
return 0;
31383163
}
31393164

3165+
31403166
PERL_STATIC_INLINE UV
31413167
Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
31423168
{

proto.h

Lines changed: 10 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

utf8.c

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1318,28 +1318,30 @@ The caller, of course, is responsible for freeing any returned AV.
13181318
=cut
13191319
*/
13201320

1321-
UV
1322-
Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1323-
STRLEN curlen,
1324-
STRLEN *retlen,
1325-
const U32 flags,
1326-
U32 * errors,
1327-
AV ** msgs)
1321+
bool
1322+
Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1323+
const U8 * const e,
1324+
UV *cp_p,
1325+
Size_t *advance_p,
1326+
const U32 flags,
1327+
U32 * errors,
1328+
AV ** msgs)
13281329
{
1329-
const U8 * const s0 = s;
1330-
const U8 * send = s0 + curlen;
1330+
PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_;
1331+
1332+
const U8 * s = s0;
1333+
const U8 * send = e;
1334+
SSize_t curlen = send - s0;
13311335
U32 possible_problems; /* A bit is set here for each potential problem
13321336
found as we go along */
13331337
UV uv;
1334-
STRLEN expectlen; /* How long should this sequence be? */
1335-
STRLEN avail_len; /* When input is too short, gives what that is */
1338+
SSize_t expectlen; /* How long should this sequence be? */
1339+
SSize_t avail_len; /* When input is too short, gives what that is */
13361340
U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
13371341
gets set and discarded */
13381342

13391343
dTHX;
13401344

1341-
PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1342-
13431345
/* Here, is one of: a) malformed; b) a problematic code point (surrogate,
13441346
* non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
13451347
* syllables that the dfa doesn't properly handle. Quickly dispose of the
@@ -1356,13 +1358,14 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
13561358
/* Each of the affected Hanguls starts with \xED */
13571359

13581360
if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
1359-
if (retlen) {
1360-
*retlen = 3;
1361+
if (advance_p) {
1362+
*advance_p = 3;
13611363
}
13621364

1363-
return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1364-
| ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1365-
| (s0[2] & UTF_CONTINUATION_MASK);
1365+
*cp_p = ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1366+
| ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1367+
| (s0[2] & UTF_CONTINUATION_MASK);
1368+
return true;
13661369
}
13671370

13681371
/* In conjunction with the exhaustive tests that can be enabled in
@@ -1403,7 +1406,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
14031406
* We also should not consume too few bytes, otherwise someone could inject
14041407
* things. For example, an input could be deliberately designed to
14051408
* overflow, and if this code bailed out immediately upon discovering that,
1406-
* returning to the caller C<*retlen> pointing to the very next byte (one
1409+
* returning to the caller C<*advance_p> pointing to the very next byte (one
14071410
* which is actually part of the overflowing sequence), that could look
14081411
* legitimate to the caller, which could discard the initial partial
14091412
* sequence and process the rest, inappropriately.
@@ -1415,7 +1418,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
14151418
* allowed one, we could allow in something that shouldn't have been.
14161419
*/
14171420

1418-
if (UNLIKELY(curlen == 0)) {
1421+
if (UNLIKELY(curlen <= 0)) {
14191422
possible_problems |= UTF8_GOT_EMPTY;
14201423
curlen = 0;
14211424
uv = UNICODE_REPLACEMENT;
@@ -1433,8 +1436,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
14331436
* function will be for, has this expected length. For efficiency, set
14341437
* things up here to return it. It will be overridden only in those rare
14351438
* cases where a malformation is found */
1436-
if (retlen) {
1437-
*retlen = expectlen;
1439+
if (advance_p) {
1440+
*advance_p = expectlen;
14381441
}
14391442

14401443
/* A continuation character can't start a valid sequence */
@@ -1606,7 +1609,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
16061609
}
16071610
}
16081611

1609-
ready_to_handle_errors:
1612+
ready_to_handle_errors: ;
16101613

16111614
/* At this point:
16121615
* curlen contains the number of bytes in the sequence that
@@ -1629,6 +1632,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
16291632
* us should be in it, but no further than s0 +
16301633
* avail_len
16311634
*/
1635+
bool success = true;
16321636

16331637
if (UNLIKELY(possible_problems)) {
16341638
bool disallowed = FALSE;
@@ -2047,19 +2051,18 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
20472051
/* Since there was a possible problem, the returned length may need to
20482052
* be changed from the one stored at the beginning of this function.
20492053
* Instead of trying to figure out if it has changed, just do it. */
2050-
if (retlen) {
2051-
*retlen = curlen;
2054+
if (advance_p) {
2055+
*advance_p = curlen;
20522056
}
20532057

20542058
if (disallowed) {
2055-
if (flags & UTF8_CHECK_ONLY && retlen) {
2056-
*retlen = ((STRLEN) -1);
2057-
}
2058-
return 0;
2059+
success = false;
2060+
uv = UNICODE_REPLACEMENT;
20592061
}
20602062
}
20612063

2062-
return UNI_TO_NATIVE(uv);
2064+
*cp_p = UNI_TO_NATIVE(uv);
2065+
return success;
20632066
}
20642067

20652068
/*

0 commit comments

Comments
 (0)