@@ -1949,21 +1949,21 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
19491949 * malformation is an overlong (which allows it to be fully
19501950 * computed). Or it may have been "cured" as best it can by the
19511951 * loop just above. */
1952- if (UNLIKELY (UNICODE_IS_SURROGATE (uv ))) {
1953- if (flags & (UTF8_DISALLOW_SURROGATE |UTF8_WARN_SURROGATE )) {
1954- possible_problems |= UTF8_GOT_SURROGATE ;
1955- }
1952+ if (UNLIKELY (UNICODE_IS_SURROGATE (uv ))) {
1953+ if (flags & (UTF8_DISALLOW_SURROGATE |UTF8_WARN_SURROGATE )) {
1954+ possible_problems |= UTF8_GOT_SURROGATE ;
19561955 }
1957- else if ( UNLIKELY ( UNICODE_IS_SUPER ( uv ))) {
1958- if (flags & ( UTF8_DISALLOW_SUPER | UTF8_WARN_SUPER )) {
1959- possible_problems |= UTF8_GOT_SUPER ;
1960- }
1956+ }
1957+ else if (UNLIKELY ( UNICODE_IS_SUPER ( uv ) )) {
1958+ if ( flags & ( UTF8_DISALLOW_SUPER | UTF8_WARN_SUPER )) {
1959+ possible_problems |= UTF8_GOT_SUPER ;
19611960 }
1962- else if ( UNLIKELY ( UNICODE_IS_NONCHAR ( uv ))) {
1963- if (flags & ( UTF8_DISALLOW_NONCHAR | UTF8_WARN_NONCHAR )) {
1964- possible_problems |= UTF8_GOT_NONCHAR ;
1965- }
1961+ }
1962+ else if (UNLIKELY ( UNICODE_IS_NONCHAR ( uv ) )) {
1963+ if ( flags & ( UTF8_DISALLOW_NONCHAR | UTF8_WARN_NONCHAR )) {
1964+ possible_problems |= UTF8_GOT_NONCHAR ;
19661965 }
1966+ }
19671967 } /* End of ! must_be_super */
19681968 } /* End of checking if is a special code point */
19691969
@@ -2258,26 +2258,27 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22582258 case UTF8_GOT_EMPTY :
22592259 COMMON_DEFAULT_REJECTS (,);
22602260
2261- /* This so-called malformation is now treated as a bug in
2262- * the caller. If you have nothing to decode, skip calling
2263- * this function */
2264- assert ( 0 );
2265- message = Perl_form ( aTHX_ "%s (empty string)" ,
2266- malformed_text );
2261+ /* This so-called malformation is now treated as a bug in the
2262+ * caller. If you have nothing to decode, skip calling this
2263+ * function */
2264+
2265+ assert ( 0 );
2266+ message = Perl_form ( aTHX_ "%s (empty string)" , malformed_text );
22672267 break ;
22682268
22692269 case UTF8_GOT_CONTINUATION :
22702270 COMMON_DEFAULT_REJECTS (,);
2271- message = Perl_form (aTHX_
2271+ message = Perl_form (aTHX_
22722272 "%s: %s (unexpected continuation byte 0x%02x,"
22732273 " with no preceding start byte)" ,
22742274 malformed_text ,
2275- _byte_dump_string (s0 , 1 , 0 ), * s0 );
2275+ _byte_dump_string (s0 , 1 , 0 ),
2276+ * s0 );
22762277 break ;
22772278
22782279 case UTF8_GOT_SHORT :
22792280 COMMON_DEFAULT_REJECTS (,);
2280- message = Perl_form (aTHX_
2281+ message = Perl_form (aTHX_
22812282 "%s: %s (too short; %d byte%s available, need %d)" ,
22822283 malformed_text ,
22832284 _byte_dump_string (s0 , avail_len , 0 ),
@@ -2289,14 +2290,15 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22892290 case UTF8_GOT_NON_CONTINUATION :
22902291 {
22912292 COMMON_DEFAULT_REJECTS (,);
2292- /* If we don't know for sure that the input length is
2293- * valid, avoid as much as possible reading past the
2294- * end of the buffer */
2295- int printlen = (flags & UTF8_NO_CONFIDENCE_IN_CURLEN_ )
2296- ? (int ) (s - s0 )
2297- : (int ) (avail_len );
2298- message = Perl_form (aTHX_ "%s" ,
2299- unexpected_non_continuation_text (s0 ,
2293+
2294+ /* If we don't know for sure that the input length is valid,
2295+ * avoid as much as possible reading past the end of the buffer
2296+ * */
2297+ int printlen = (flags & UTF8_NO_CONFIDENCE_IN_CURLEN_ )
2298+ ? (int ) (s - s0 )
2299+ : (int ) (avail_len );
2300+ message = Perl_form (aTHX_ "%s" ,
2301+ unexpected_non_continuation_text (s0 ,
23002302 printlen ,
23012303 s - s0 ,
23022304 (int ) expectlen ));
@@ -2307,37 +2309,34 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23072309 case UTF8_GOT_LONG_WITH_VALUE :
23082310 COMMON_DEFAULT_REJECTS (,);
23092311
2310- /* These error types cause 'input_uv' to be something
2311- * that isn't what was intended, so can't use it in the
2312- * message. The other error types either can't
2313- * generate an overlong, or else the 'input_uv' is
2314- * valid */
2315- if (orig_problems &
2316- (UTF8_GOT_TOO_SHORT |UTF8_GOT_OVERFLOW ))
2317- {
2318- message = Perl_form (aTHX_
2319- "%s: %s (any UTF-8 sequence that starts"
2320- " with \"%s\" is overlong which can and"
2321- " should be represented with a"
2322- " different, shorter sequence)" ,
2323- malformed_text ,
2324- _byte_dump_string (s0 , send - s0 , 0 ),
2325- _byte_dump_string (s0 , curlen , 0 ));
2326- }
2327- else {
2328- U8 tmpbuf [UTF8_MAXBYTES + 1 ];
2329- const U8 * const e = uvoffuni_to_utf8_flags (tmpbuf ,
2312+ /* These error types cause 'input_uv' to be something that
2313+ * isn't what was intended, so can't use it in the message.
2314+ * The other error types either can't generate an overlong, or
2315+ * else the 'input_uv' is valid */
2316+ if (orig_problems & (UTF8_GOT_TOO_SHORT |UTF8_GOT_OVERFLOW )) {
2317+ message = Perl_form (aTHX_
2318+ "%s: %s (any UTF-8 sequence that starts with"
2319+ " \"%s\" is overlong which can and should be"
2320+ " represented with a different, shorter sequence)" ,
2321+ malformed_text ,
2322+ _byte_dump_string (s0 , send - s0 , 0 ),
2323+ _byte_dump_string (s0 , curlen , 0 ));
2324+ }
2325+ else {
2326+ U8 tmpbuf [UTF8_MAXBYTES + 1 ];
2327+ const U8 * const e = uvoffuni_to_utf8_flags (tmpbuf ,
23302328 input_uv , 0 );
2331- /* Don't use U+ for non-Unicode code points, which
2332- * includes those in the Latin1 range */
2333- const char * preface = ( UNICODE_IS_SUPER (input_uv )
2329+
2330+ /* Don't use U+ for non-Unicode code points, which includes
2331+ * those in the Latin1 range */
2332+ const char * preface = ( UNICODE_IS_SUPER (input_uv )
23342333#ifdef EBCDIC
2335- || input_uv <= 0xFF
2334+ || input_uv <= 0xFF
23362335#endif
2337- )
2338- ? "0x"
2339- : "U+" ;
2340- message = Perl_form (aTHX_
2336+ )
2337+ ? "0x"
2338+ : "U+" ;
2339+ message = Perl_form (aTHX_
23412340 "%s: %s (overlong; instead use %s to represent"
23422341 " %s%0*" UVXf ")" ,
23432342 malformed_text ,
@@ -2348,7 +2347,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23482347 for small code
23492348 points */
23502349 UNI_TO_NATIVE (input_uv ));
2351- }
2350+ }
23522351 break ;
23532352
23542353/* PACK_WARN returns:
@@ -2384,33 +2383,30 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23842383 COMMON_DEFAULT_ACCEPTEDS (UTF8_WARN_SURROGATE ,
23852384 WARN_SURROGATE ,,);
23862385
2387- /* This is the only error that can occur with a
2388- * surrogate when the 'input_uv' isn't valid */
2389- if (orig_problems & UTF8_GOT_TOO_SHORT ) {
2390- message = Perl_form (aTHX_
2391- "UTF-16 surrogate (any UTF-8 sequence that"
2392- " starts with \"%s\" is for a surrogate)" ,
2393- _byte_dump_string (s0 , curlen , 0 ));
2394- }
2395- else {
2396- message = Perl_form (aTHX_ surrogate_cp_format ,
2397- input_uv );
2398- }
2386+ /* This is the only error that can occur with a surrogate when
2387+ * the 'input_uv' isn't valid */
2388+ if (orig_problems & UTF8_GOT_TOO_SHORT ) {
2389+ message = Perl_form (aTHX_
2390+ "UTF-16 surrogate (any UTF-8 sequence that"
2391+ " starts with \"%s\" is for a surrogate)" ,
2392+ _byte_dump_string (s0 , curlen , 0 ));
2393+ }
2394+ else {
2395+ message = Perl_form (aTHX_ surrogate_cp_format , input_uv );
2396+ }
23992397
24002398 break ;
24012399
24022400 case UTF8_GOT_NONCHAR :
2403-
24042401 COMMON_DEFAULT_ACCEPTEDS (UTF8_WARN_NONCHAR , WARN_NONCHAR ,,);
24052402
2406- /* The code above should have guaranteed that we don't
2407- * get here with conditions other than these */
2408- assert (! (orig_problems & ~( UTF8_GOT_LONG
2409- |UTF8_GOT_LONG_WITH_VALUE
2410- |UTF8_GOT_PERL_EXTENDED
2411- |UTF8_GOT_NONCHAR )));
2412-
2413- message = Perl_form (aTHX_ nonchar_cp_format , input_uv );
2403+ /* The code above should have guaranteed that we don't get here
2404+ * with conditions other than these */
2405+ assert (! (orig_problems & ~( UTF8_GOT_LONG
2406+ |UTF8_GOT_LONG_WITH_VALUE
2407+ |UTF8_GOT_PERL_EXTENDED
2408+ |UTF8_GOT_NONCHAR )));
2409+ message = Perl_form (aTHX_ nonchar_cp_format , input_uv );
24142410
24152411 break ;
24162412
@@ -2524,50 +2520,50 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
25242520 /* We only get here if there is a message to be displayed or
25252521 * returned; each case statement in the switch above does a
25262522 * continue if no message for it need be generated. */
2527- if (msgs ) {
2528- if (msgs_return == NULL ) {
2529- msgs_return = newAV ();
2523+ if (msgs ) {
2524+ if (msgs_return == NULL ) {
2525+ msgs_return = newAV ();
2526+ }
2527+
2528+ av_push (msgs_return ,
2529+ /* Negative 'pack_warn' really means 0 here. But this
2530+ * converts that to UTF-8 to preserve broken behavior
2531+ * depended upon by Encode. */
2532+ newRV_noinc ((SV * ) new_msg_hv (message ,
2533+ ((pack_warn <= 0 )
2534+ ? packWARN (WARN_UTF8 )
2535+ : pack_warn ),
2536+ this_flag_bit )));
2537+ }
2538+ else {
2539+ if (UNLIKELY (flags & ( UTF8_DIE_IF_MALFORMED
2540+ |UTF8_FORCE_WARN_IF_MALFORMED )))
2541+ {
2542+ ENTER ;
2543+ SAVEI8 (PL_dowarn );
2544+ SAVESPTR (PL_curcop );
2545+
2546+ PL_dowarn = G_WARN_ALL_ON |G_WARN_ON ;
2547+ if (PL_curcop ) {
2548+ SAVECURCOPWARNINGS ();
2549+ PL_curcop -> cop_warnings = pWARN_ALL ;
25302550 }
2551+ }
25312552
2532- av_push (msgs_return ,
2533- /* Negative 'pack_warn' really means 0 here. But
2534- * this converts that to UTF-8 to preserve broken
2535- * behavior depended upon by Encode. */
2536- newRV_noinc ((SV * ) new_msg_hv (message ,
2537- ((pack_warn <= 0 )
2538- ? packWARN (WARN_UTF8 )
2539- : pack_warn ),
2540- this_flag_bit )));
2553+ if (PL_op ) {
2554+ Perl_warner (aTHX_ pack_warn , "%s in %s" , message ,
2555+ OP_DESC (PL_op ));
25412556 }
25422557 else {
2543- if (UNLIKELY (flags & ( UTF8_DIE_IF_MALFORMED
2544- |UTF8_FORCE_WARN_IF_MALFORMED )))
2545- {
2546- ENTER ;
2547- SAVEI8 (PL_dowarn );
2548- SAVESPTR (PL_curcop );
2549-
2550- PL_dowarn = G_WARN_ALL_ON |G_WARN_ON ;
2551- if (PL_curcop ) {
2552- SAVECURCOPWARNINGS ();
2553- PL_curcop -> cop_warnings = pWARN_ALL ;
2554- }
2555- }
2556-
2557- if (PL_op ) {
2558- Perl_warner (aTHX_ pack_warn , "%s in %s" , message ,
2559- OP_DESC (PL_op ));
2560- }
2561- else {
2562- Perl_warner (aTHX_ pack_warn , "%s" , message );
2563- }
2558+ Perl_warner (aTHX_ pack_warn , "%s" , message );
2559+ }
25642560
2565- if (UNLIKELY (flags & ( UTF8_DIE_IF_MALFORMED
2566- |UTF8_FORCE_WARN_IF_MALFORMED )))
2567- {
2568- LEAVE ;
2569- }
2561+ if (UNLIKELY (flags & ( UTF8_DIE_IF_MALFORMED
2562+ |UTF8_FORCE_WARN_IF_MALFORMED )))
2563+ {
2564+ LEAVE ;
25702565 }
2566+ }
25712567 } /* End of 'while (possible_problems)' */
25722568
25732569 if (msgs_return ) {
0 commit comments