diff --git a/pp_pack.c b/pp_pack.c
index 0b53611c7bb1..465d302ff982 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -253,12 +253,14 @@ utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype)
     if (*s >= end) {
         goto croak;
     }
-    val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen,
-                         ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-    if (retlen == (STRLEN) -1)
+    if (! utf8_to_uv_flags((U8 *) *s, (U8 *) end, &val, &retlen,
+                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY))
+    {
       croak:
         croak("Malformed UTF-8 string in '%c' format in unpack",
               (int) TYPE_NO_MODIFIERS(datumtype));
+    }
+
     if (val >= 0x100) {
         ck_warner(packWARN(WARN_UNPACK),
                   "Character in '%c' format wrapped in unpack",
@@ -279,49 +281,87 @@ S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t
     UV val;
     STRLEN retlen;
     const char *from = *s;
-    int bad = 0;
-    const U32 flags = ckWARN(WARN_UTF8) ?
-        UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY);
+    bool bad = false;
+    const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
     const bool needs_swap = NEEDS_SWAP(datumtype);
 
     if (UNLIKELY(needs_swap))
         buf += buf_len;
 
-    for (;buf_len > 0; buf_len--) {
+    AV * msgs = NULL;
+    for (; buf_len > 0; buf_len--) {
         if (from >= end) return FALSE;
-        val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags);
-        if (retlen == (STRLEN) -1) {
-            from += UTF8_SAFE_SKIP(from, end);
-            bad |= 1;
-        } else from += retlen;
-        if (val >= 0x100) {
-            bad |= 2;
-            val = (U8) val;
+
+        AV * this_msgs = NULL;
+        if (utf8_to_uv_msgs((U8 *) from, (U8 *) end, &val, &retlen, flags,
+                            NULL, &this_msgs))
+        {
+            if (val >= 0x100) {
+                bad = true;
+                val = (U8) val;
+            }
         }
+
+        from += retlen;
+
+        /* Add any messages from this conversion to the list for later output
+         * */
+        if (this_msgs) {
+            /* Create the accumulator lazily, so 'msgs' remains NULL
+             * when no conversion produced a message */
+            if (msgs == NULL) {
+                msgs = newAV();
+            }
+
+            while (av_count(this_msgs) > 0) {
+                av_push(msgs, av_shift(this_msgs));
+            }
+
+            /* An AV is a reference-counted SV; release it as one */
+            SvREFCNT_dec_NN((SV *) this_msgs);
+        }
+
         if (UNLIKELY(needs_swap))
             *(U8 *)--buf = (U8)val;
         else
             *(U8 *)buf++ = (U8)val;
     }
+
     /* We have enough characters for the buffer. Did we have problems ? */
-    if (bad) {
-        if (bad & 1) {
-            /* Rewalk the string fragment while warning */
-            const char *ptr;
-            const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
-            for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) {
-                if (ptr >= end) break;
-                utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags);
-            }
-            if (from > end) from = end;
+    if (msgs) {
+        while (av_count(msgs) > 0) {
+            /* NOTE(review): av_shift() transfers ownership of the element
+             * to us and nothing below releases it; also confirm whether
+             * the elements are HVs or references to HVs */
+            HV * msg_hash = (HV *) av_shift(msgs);
+            SV ** packed_categories_p = hv_fetchs(msg_hash, "warn_categories", 0);
+            if (packed_categories_p == NULL) {
+                continue;
+            }
+
+            UV packed_categories = SvUV(*packed_categories_p);
+            if (packed_categories == 0) {
+                continue;
+            }
+
+            SV ** warn_text_p = hv_fetchs(msg_hash, "text", 0);
+            if (warn_text_p) {
+                warner(packed_categories, "%s", SvPV_nolen(*warn_text_p));
+            }
         }
-        if ((bad & 2))
-            ck_warner(packWARN(datumtype & TYPE_IS_PACK ?
-                               WARN_PACK : WARN_UNPACK),
-                      "Character(s) in '%c' format wrapped in %s",
-                      (int) TYPE_NO_MODIFIERS(datumtype),
-                      datumtype & TYPE_IS_PACK ? "pack" : "unpack");
+
+        /* An AV is a reference-counted SV; release it as one */
+        SvREFCNT_dec_NN((SV *) msgs);
+    }
+
+    if (bad) {
+        ck_warner(packWARN(datumtype & TYPE_IS_PACK ?
+                           WARN_PACK : WARN_UNPACK),
+                  "Character(s) in '%c' format wrapped in %s",
+                  (int) TYPE_NO_MODIFIERS(datumtype),
+                  datumtype & TYPE_IS_PACK ? "pack" : "unpack");
     }
+
     *s = from;
     return TRUE;
 }
@@ -408,16 +448,16 @@ STMT_START { \
 } STMT_END
 
 /* Only to be used inside a loop (see the break) */
-#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags)                \
-STMT_START {                                                        \
-    STRLEN retlen;                                                  \
-    if (str >= end) break;                                          \
-    val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \
-    if (retlen == (STRLEN) -1) {                                    \
-        *cur = '\0';                                                \
-        croak("Malformed UTF-8 string in pack");                    \
-    }                                                               \
-    str += retlen;                                                  \
+#define NEXT_UNI_VAL(val, cur, str, end, utf8_flags)                \
+STMT_START {                                                        \
+    STRLEN retlen;                                                  \
+    if (str >= end) break;                                          \
+    if (! utf8_to_uv_flags((U8 *) str, (U8 *) end, &val, &retlen,   \
+                           utf8_flags)) {                           \
+        *cur = '\0';                                                \
+        croak("Malformed UTF-8 string in pack");                    \
+    }                                                               \
+    str += retlen;                                                  \
 } STMT_END
 
 static const char *_action( const tempsym_t* symptr )
@@ -1230,17 +1270,28 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
         case 'c':
             while (len-- > 0 && s < strend) {
                 int aint;
-                if (utf8)
-                  {
+                if (utf8) {
                     STRLEN retlen;
-                    aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen,
-                                 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-                    if (retlen == (STRLEN) -1)
+                    UV auv;
+                    if (! utf8_to_uv_flags((U8 *) s, (U8 *) strend,
+                                           &auv, &retlen,
+                                           (ckWARN(WARN_UTF8))
+                                            ? 0
+                                            : UTF8_ALLOW_ANY))
+                    {
                         croak("Malformed UTF-8 string in unpack");
+                    }
+
+                    aint = auv;
+                    if ( (UV) aint != auv) {
+                        croak("Malformed UTF-8 string in unpack");
+                    }
+
                     s += retlen;
-                  }
-                else
-                  aint = *(U8 *)(s)++;
+                }
+                else {
+                    aint = *(U8 *)(s)++;
+                }
                 if (aint >= 128 && datumtype != 'C') /* fake up signed chars */
                     aint -= 256;
                 if (!checksum)
@@ -1310,15 +1361,16 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                         break;
                     len = UTF8SKIP(result);
                     if (!S_utf8_to_bytes(aTHX_ &ptr, strend,
-                                      (char *) &result[1], len-1, 'U')) break;
-                    auv = utf8n_to_uvchr(result, len, &retlen,
-                                         UTF8_ALLOW_DEFAULT);
+                                      (char *) &result[1], len - 1, 'U'))
+                    {
+                        break;
+                    }
+
+                    auv = utf8_to_uv_or_die(result, result + len, &retlen);
                     s = ptr;
-                } else {
-                    auv = utf8n_to_uvchr((U8*)s, strend - s, &retlen,
-                                         UTF8_ALLOW_DEFAULT);
-                    if (retlen == (STRLEN) -1)
-                        croak("Malformed UTF-8 string in unpack");
+                }
+                else {
+                    (void) utf8_to_uv((U8 *) s, (U8 *) strend, &auv, &retlen);
                     s += retlen;
                 }
                 if (!checksum)