@@ -2090,41 +2090,49 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2090
2090
U32 replaces = ( UTF8_ALLOW_ANY |UTF8_ALLOW_EMPTY )
2091
2091
|(flags & UTF8_DISALLOW_ILLEGAL_INTERCHANGE );
2092
2092
2093
- /* The following macro returns 0 if no message needs to be generated
2094
- * for this problem even if everything else says to. Otherwise returns
2095
- * the warning category to use for the message..
2093
+ /* The following macro returns:
2094
+ * 0 when there is no reason to generate a message for this
2095
+ * condition, because the appropriate warnings categories are
2096
+ * off and not overridden
2097
+ * < 0 when the only reason would be to return a message in an AV
2098
+ * structure. This happens when the macro would otherwise
2099
+ * return 0, but detects there is an AV structure to fill in.
2100
+ * > 0 when there are warning categories effectively enabled. If
2101
+ * so, the value is the result of calling the appropriate
2102
+ * packWARN macro on those categories.
2096
2103
*
2097
- * No message need be generated if the UTF8_CHECK_ONLY flag has been
2098
- * set by the caller. Otherwise, a message should be generated if:
2099
- * 1) the caller has furnished a structure into which messages should
2100
- * be returned to it (so it itself can decide what to do); or
2101
- * 2) warnings are enabled for either of the category parameters to
2102
- * the macro; or
2103
- * 3) the special MALFORMED flags have been passed
2104
+ * The first parameter 'warning' is a warnings category that applies to
2105
+ * the condition. The following tests are checked in this priority
2106
+ * order; the first that matches is taken:
2104
2107
*
2105
- * The 'warning' parameter is the higher priority warning category to
2106
- * check. The macro calls ckWARN_d(warning), so warnings for it are
2107
- * considered to be on by default.
2108
- *
2109
- * The second, lower priority category is optional. To specify not to
2110
- * use one, call the macro
2111
- * like: NEED_MESSAGE(WARN_FOO,,)
2112
- * Otherwise like: NEED_MESSAGE(WARN_FOO, ckWARN_d, WARN_BAR)
2113
- *
2114
- * The second parameter could also have been ckWARN to specify that the
2115
- * second category isn't on by default.
2108
+ * 1) 'warning' is considered enabled if the UTF8_DIE_IF_MALFORMED
2109
+ * flag is set.
2110
+ * 2) 'warning' is considered disabled if the UTF8_CHECK_ONLY flag is
2111
+ * set.
2112
+ * 3) 'warning' is considered enabled if the
2113
+ * UTF8_FORCE_WARN_IF_MALFORMED flag is set
2114
+ * 4) 'warning is considered enabled if ckWARN_d(warning) is true
2115
+ * 5) A secondary warning category is optionally passed, along with
2116
+ * either to use ckWARN or ckWARN_d on it. This is considered
2117
+ * enabled if that returns true.
2118
+ * 6) -1 is returned if 'msgs' isn't NULL, which means the caller
2119
+ * wants any message stored into it
2120
+ * 7) 0 is returned.
2116
2121
*
2117
2122
* When called without a second category, the macro outputs a bunch of
2118
2123
* zeroes that the compiler should fold to nothing */
2119
- #define NEED_MESSAGE (warning , extra_ckWARN , extra_category ) \
2120
- ((flags & UTF8_CHECK_ONLY) ? 0 : \
2121
- ((ckWARN_d(warning)) ? warning : \
2122
- ((extra_ckWARN(extra_category +0)) ? extra_category +0 : \
2123
- ((flags & ( UTF8_DIE_IF_MALFORMED \
2124
- |UTF8_FORCE_WARN_IF_MALFORMED)) ? warning : \
2125
- ((msgs) ? warning : 0)))))
2124
+ #define PACK_WARN (warning , extra_ckWARN , extra_category ) \
2125
+ (UNLIKELY(flags & UTF8_DIE_IF_MALFORMED) ? packWARN(warning) \
2126
+ : (flags & UTF8_CHECK_ONLY) ? 0 \
2127
+ : UNLIKELY(flags & UTF8_FORCE_WARN_IF_MALFORMED) ? packWARN(warning)\
2128
+ : ckWARN_d(warning) ? packWARN(warning) \
2129
+ : extra_ckWARN(extra_category +0) ? packWARN2(warning, \
2130
+ extra_category +0) \
2131
+ : (msgs) ? -1 \
2132
+ : 0)
2126
2133
2127
2134
while (possible_problems ) { /* Handle each possible problem */
2135
+ IV pack_warn = 0 ;
2128
2136
char * message = NULL ;
2129
2137
2130
2138
/* The lowest bit positions, as #defined in utf8.h, are handled
@@ -2158,10 +2166,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2158
2166
* handling of any generated warning message. That means that if a
2159
2167
* case: finds there is no message, it can 'continue' to the next
2160
2168
* loop iteration instead of doing a 'break', whose only purpose
2161
- * would be to handle the message. */
2162
-
2163
- /* Most case:s use this; overridden in a few */
2164
- U32 pack_warn = packWARN (WARN_UTF8 );
2169
+ * would be to handle the message.
2170
+ */
2165
2171
2166
2172
switch (this_problem ) {
2167
2173
default :
@@ -2179,7 +2185,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2179
2185
* this function */
2180
2186
assert (0 );
2181
2187
2182
- if (NEED_MESSAGE (WARN_UTF8 ,,)) {
2188
+ if (PACK_WARN (WARN_UTF8 ,,)) {
2183
2189
message = Perl_form (aTHX_ "%s (empty string)" ,
2184
2190
malformed_text );
2185
2191
}
@@ -2189,7 +2195,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2189
2195
2190
2196
case UTF8_GOT_CONTINUATION :
2191
2197
if (! (flags & UTF8_ALLOW_CONTINUATION )) {
2192
- if (NEED_MESSAGE (WARN_UTF8 ,,)) {
2198
+ if (PACK_WARN (WARN_UTF8 ,,)) {
2193
2199
message = Perl_form (aTHX_
2194
2200
"%s: %s (unexpected continuation byte 0x%02x,"
2195
2201
" with no preceding start byte)" ,
@@ -2202,7 +2208,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2202
2208
2203
2209
case UTF8_GOT_SHORT :
2204
2210
if (! (flags & UTF8_ALLOW_SHORT )) {
2205
- if (NEED_MESSAGE (WARN_UTF8 ,,)) {
2211
+ if (PACK_WARN (WARN_UTF8 ,,)) {
2206
2212
message = Perl_form (aTHX_
2207
2213
"%s: %s (too short; %d byte%s available, need %d)" ,
2208
2214
malformed_text ,
@@ -2217,7 +2223,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2217
2223
2218
2224
case UTF8_GOT_NON_CONTINUATION :
2219
2225
if (! (flags & UTF8_ALLOW_NON_CONTINUATION )) {
2220
- if (NEED_MESSAGE (WARN_UTF8 ,,)) {
2226
+ if (PACK_WARN (WARN_UTF8 ,,)) {
2221
2227
2222
2228
/* If we don't know for sure that the input length is
2223
2229
* valid, avoid as much as possible reading past the
@@ -2240,7 +2246,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2240
2246
if (! (flags & ( UTF8_ALLOW_LONG
2241
2247
|UTF8_ALLOW_LONG_AND_ITS_VALUE )))
2242
2248
{
2243
- if (NEED_MESSAGE (WARN_UTF8 ,,)) {
2249
+ if (PACK_WARN (WARN_UTF8 ,,)) {
2244
2250
2245
2251
/* These error types cause 'input_uv' to be something
2246
2252
* that isn't what was intended, so can't use it in the
@@ -2296,8 +2302,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2296
2302
* this case are true */
2297
2303
2298
2304
if (flags & UTF8_WARN_SURROGATE ) {
2299
- if (NEED_MESSAGE (WARN_SURROGATE ,,)) {
2300
- pack_warn = packWARN (WARN_SURROGATE );
2305
+ if (PACK_WARN (WARN_SURROGATE ,,)) {
2301
2306
2302
2307
/* These are the only errors that can occur with a
2303
2308
* surrogate when the 'input_uv' isn't valid */
@@ -2323,15 +2328,14 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2323
2328
* this case are true */
2324
2329
2325
2330
if (flags & UTF8_WARN_NONCHAR ) {
2326
- if (NEED_MESSAGE (WARN_NONCHAR ,,)) {
2331
+ if (PACK_WARN (WARN_NONCHAR ,,)) {
2327
2332
/* The code above should have guaranteed that we don't
2328
2333
* get here with conditions other than these */
2329
2334
assert (! (orig_problems & ~( UTF8_GOT_LONG
2330
2335
|UTF8_GOT_LONG_WITH_VALUE
2331
2336
|UTF8_GOT_PERL_EXTENDED
2332
2337
|UTF8_GOT_NONCHAR )));
2333
2338
2334
- pack_warn = packWARN (WARN_NONCHAR );
2335
2339
message = Perl_form (aTHX_ nonchar_cp_format , input_uv );
2336
2340
}
2337
2341
}
@@ -2424,7 +2428,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2424
2428
* is enabled, but which category to use? Historically, we've
2425
2429
* used 'utf8' if it is enabled; and that seems like the more
2426
2430
* severe category, more befitting a malformation. */
2427
- pack_warn = NEED_MESSAGE (WARN_UTF8 , ckWARN_d , WARN_NON_UNICODE );
2431
+ pack_warn = PACK_WARN (WARN_UTF8 , ckWARN_d , WARN_NON_UNICODE );
2428
2432
if (pack_warn ) {
2429
2433
message = Perl_form (aTHX_ non_cp_format ,
2430
2434
_byte_dump_string (s0 , curlen , 0 ));
@@ -2486,8 +2490,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2486
2490
2487
2491
/* These code points are non-portable, so warn if either
2488
2492
* category is enabled */
2489
- if (NEED_MESSAGE (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE )) {
2490
- pack_warn = packWARN2 (WARN_NON_UNICODE , WARN_PORTABLE );
2493
+ if (PACK_WARN (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE )) {
2491
2494
if (cp_format ) {
2492
2495
message = Perl_form (aTHX_ cp_format , input_uv );
2493
2496
}
@@ -2532,8 +2535,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2532
2535
* enabled). */
2533
2536
error_flags_return |= this_flag_bit ;
2534
2537
if (flags & UTF8_WARN_SUPER ) {
2535
- if (NEED_MESSAGE (WARN_NON_UNICODE ,,)) {
2536
- pack_warn = packWARN (WARN_NON_UNICODE );
2538
+ if (PACK_WARN (WARN_NON_UNICODE ,,)) {
2537
2539
if (cp_format ) {
2538
2540
message = Perl_form (aTHX_ cp_format , input_uv );
2539
2541
}
@@ -2558,7 +2560,13 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2558
2560
}
2559
2561
2560
2562
av_push (msgs_return ,
2561
- newRV_noinc ((SV * ) new_msg_hv (message , pack_warn ,
2563
+ /* Negative 'pack_warn' really means 0 here. But
2564
+ * this converts that to UTF-8 to preserve broken
2565
+ * behavior depended upon by Encode. */
2566
+ newRV_noinc ((SV * ) new_msg_hv (message ,
2567
+ ((pack_warn <= 0 )
2568
+ ? packWARN (WARN_UTF8 )
2569
+ : pack_warn ),
2562
2570
this_flag_bit )));
2563
2571
}
2564
2572
else if (! (flags & UTF8_CHECK_ONLY )) {
0 commit comments