Skip to content

Commit 9540191

Browse files
committed
utf8_to_uv_msgs: Revise and rename macro
This macro is used to hide the details of determining if an abnormal condition should raise a warning or not. But I found it more convenient to expand the macro to return the packed warnings category(ies) if a warning should be raised or not. That information is known inside the macro and was being discarded, and then having to be recalculated. The new name reflects its expanded purpose, PACK_WARN. 0 is returned if no warnings need be raised; and importantly fixing a bug in the old code, it returns < 0 if no warning should be raised directly, but that an entry needs to be added to the AV array returned by the function (if the parameter requesting that has been passed in) But Encode, for which this form of the translation function was created, and may be the only user of it, depends on not getting a zero return. So this has an override until Encode can be fixed. I introduced the DIE_IF_MALFORMED flag in the previous development release, making it subservient to the CHECK_ONLY flag. I have since realized that the precedence should be reversed. If a developer inadvertently passes both flags, it is better to honor the one saying you need to quit, than the one saying ignore any problems.
1 parent 78cced2 commit 9540191

File tree

1 file changed

+55
-47
lines changed

1 file changed

+55
-47
lines changed

utf8.c

Lines changed: 55 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -2090,41 +2090,49 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
20902090
U32 replaces = ( UTF8_ALLOW_ANY|UTF8_ALLOW_EMPTY)
20912091
|(flags & UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
20922092

2093-
/* The following macro returns 0 if no message needs to be generated
2094-
* for this problem even if everything else says to. Otherwise returns
2095-
* the warning category to use for the message..
2093+
/* The following macro returns:
2094+
* 0 when there is no reason to generate a message for this
2095+
* condition, because the appropriate warnings categories are
2096+
* off and not overridden
2097+
* < 0 when the only reason would be to return a message in an AV
2098+
* structure. This happens when the macro would otherwise
2099+
* return 0, but detects there is an AV structure to fill in.
2100+
* > 0 when there are warning categories effectively enabled. If
2101+
* so, the value is the result of calling the appropriate
2102+
* packWARN macro on those categories.
20962103
*
2097-
* No message need be generated if the UTF8_CHECK_ONLY flag has been
2098-
* set by the caller. Otherwise, a message should be generated if:
2099-
* 1) the caller has furnished a structure into which messages should
2100-
* be returned to it (so it itself can decide what to do); or
2101-
* 2) warnings are enabled for either of the category parameters to
2102-
* the macro; or
2103-
* 3) the special MALFORMED flags have been passed
2104+
* The first parameter 'warning' is a warnings category that applies to
2105+
* the condition. The following tests are checked in this priority
2106+
* order; the first that matches is taken:
21042107
*
2105-
* The 'warning' parameter is the higher priority warning category to
2106-
* check. The macro calls ckWARN_d(warning), so warnings for it are
2107-
* considered to be on by default.
2108-
*
2109-
* The second, lower priority category is optional. To specify not to
2110-
* use one, call the macro
2111-
* like: NEED_MESSAGE(WARN_FOO,,)
2112-
* Otherwise like: NEED_MESSAGE(WARN_FOO, ckWARN_d, WARN_BAR)
2113-
*
2114-
* The second parameter could also have been ckWARN to specify that the
2115-
* second category isn't on by default.
2108+
* 1) 'warning' is considered enabled if the UTF8_DIE_IF_MALFORMED
2109+
* flag is set.
2110+
* 2) 'warning' is considered disabled if the UTF8_CHECK_ONLY flag is
2111+
* set.
2112+
* 3) 'warning' is considered enabled if the
2113+
* UTF8_FORCE_WARN_IF_MALFORMED flag is set
2114+
* 4) 'warning is considered enabled if ckWARN_d(warning) is true
2115+
* 5) A secondary warning category is optionally passed, along with
2116+
* either to use ckWARN or ckWARN_d on it. This is considered
2117+
* enabled if that returns true.
2118+
* 6) -1 is returned if 'msgs' isn't NULL, which means the caller
2119+
* wants any message stored into it
2120+
* 7) 0 is returned.
21162121
*
21172122
* When called without a second category, the macro outputs a bunch of
21182123
* zeroes that the compiler should fold to nothing */
2119-
#define NEED_MESSAGE(warning, extra_ckWARN, extra_category) \
2120-
((flags & UTF8_CHECK_ONLY) ? 0 : \
2121-
((ckWARN_d(warning)) ? warning : \
2122-
((extra_ckWARN(extra_category +0)) ? extra_category +0 : \
2123-
((flags & ( UTF8_DIE_IF_MALFORMED \
2124-
|UTF8_FORCE_WARN_IF_MALFORMED)) ? warning : \
2125-
((msgs) ? warning : 0)))))
2124+
#define PACK_WARN(warning, extra_ckWARN, extra_category) \
2125+
(UNLIKELY(flags & UTF8_DIE_IF_MALFORMED) ? packWARN(warning) \
2126+
: (flags & UTF8_CHECK_ONLY) ? 0 \
2127+
: UNLIKELY(flags & UTF8_FORCE_WARN_IF_MALFORMED) ? packWARN(warning)\
2128+
: ckWARN_d(warning) ? packWARN(warning) \
2129+
: extra_ckWARN(extra_category +0) ? packWARN2(warning, \
2130+
extra_category +0) \
2131+
: (msgs) ? -1 \
2132+
: 0)
21262133

21272134
while (possible_problems) { /* Handle each possible problem */
2135+
IV pack_warn = 0;
21282136
char * message = NULL;
21292137

21302138
/* The lowest bit positions, as #defined in utf8.h, are handled
@@ -2158,10 +2166,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
21582166
* handling of any generated warning message. That means that if a
21592167
* case: finds there is no message, it can 'continue' to the next
21602168
* loop iteration instead of doing a 'break', whose only purpose
2161-
* would be to handle the message. */
2162-
2163-
/* Most case:s use this; overridden in a few */
2164-
U32 pack_warn = packWARN(WARN_UTF8);
2169+
* would be to handle the message.
2170+
*/
21652171

21662172
switch (this_problem) {
21672173
default:
@@ -2179,7 +2185,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
21792185
* this function */
21802186
assert(0);
21812187

2182-
if (NEED_MESSAGE(WARN_UTF8,,)) {
2188+
if (PACK_WARN(WARN_UTF8,,)) {
21832189
message = Perl_form(aTHX_ "%s (empty string)",
21842190
malformed_text);
21852191
}
@@ -2189,7 +2195,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
21892195

21902196
case UTF8_GOT_CONTINUATION:
21912197
if (! (flags & UTF8_ALLOW_CONTINUATION)) {
2192-
if (NEED_MESSAGE(WARN_UTF8,,)) {
2198+
if (PACK_WARN(WARN_UTF8,,)) {
21932199
message = Perl_form(aTHX_
21942200
"%s: %s (unexpected continuation byte 0x%02x,"
21952201
" with no preceding start byte)",
@@ -2202,7 +2208,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22022208

22032209
case UTF8_GOT_SHORT:
22042210
if (! (flags & UTF8_ALLOW_SHORT)) {
2205-
if (NEED_MESSAGE(WARN_UTF8,,)) {
2211+
if (PACK_WARN(WARN_UTF8,,)) {
22062212
message = Perl_form(aTHX_
22072213
"%s: %s (too short; %d byte%s available, need %d)",
22082214
malformed_text,
@@ -2217,7 +2223,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22172223

22182224
case UTF8_GOT_NON_CONTINUATION:
22192225
if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
2220-
if (NEED_MESSAGE(WARN_UTF8,,)) {
2226+
if (PACK_WARN(WARN_UTF8,,)) {
22212227

22222228
/* If we don't know for sure that the input length is
22232229
* valid, avoid as much as possible reading past the
@@ -2240,7 +2246,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22402246
if (! (flags & ( UTF8_ALLOW_LONG
22412247
|UTF8_ALLOW_LONG_AND_ITS_VALUE)))
22422248
{
2243-
if (NEED_MESSAGE(WARN_UTF8,,)) {
2249+
if (PACK_WARN(WARN_UTF8,,)) {
22442250

22452251
/* These error types cause 'input_uv' to be something
22462252
* that isn't what was intended, so can't use it in the
@@ -2296,8 +2302,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22962302
* this case are true */
22972303

22982304
if (flags & UTF8_WARN_SURROGATE) {
2299-
if (NEED_MESSAGE(WARN_SURROGATE,,)) {
2300-
pack_warn = packWARN(WARN_SURROGATE);
2305+
if (PACK_WARN(WARN_SURROGATE,,)) {
23012306

23022307
/* These are the only errors that can occur with a
23032308
* surrogate when the 'input_uv' isn't valid */
@@ -2323,15 +2328,14 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23232328
* this case are true */
23242329

23252330
if (flags & UTF8_WARN_NONCHAR) {
2326-
if (NEED_MESSAGE(WARN_NONCHAR,,)) {
2331+
if (PACK_WARN(WARN_NONCHAR,,)) {
23272332
/* The code above should have guaranteed that we don't
23282333
* get here with conditions other than these */
23292334
assert (! (orig_problems & ~( UTF8_GOT_LONG
23302335
|UTF8_GOT_LONG_WITH_VALUE
23312336
|UTF8_GOT_PERL_EXTENDED
23322337
|UTF8_GOT_NONCHAR)));
23332338

2334-
pack_warn = packWARN(WARN_NONCHAR);
23352339
message = Perl_form(aTHX_ nonchar_cp_format, input_uv);
23362340
}
23372341
}
@@ -2424,7 +2428,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
24242428
* is enabled, but which category to use? Historically, we've
24252429
* used 'utf8' if it is enabled; and that seems like the more
24262430
* severe category, more befitting a malformation. */
2427-
pack_warn = NEED_MESSAGE(WARN_UTF8, ckWARN_d, WARN_NON_UNICODE);
2431+
pack_warn = PACK_WARN(WARN_UTF8, ckWARN_d, WARN_NON_UNICODE);
24282432
if (pack_warn) {
24292433
message = Perl_form(aTHX_ non_cp_format,
24302434
_byte_dump_string(s0, curlen, 0));
@@ -2486,8 +2490,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
24862490

24872491
/* These code points are non-portable, so warn if either
24882492
* category is enabled */
2489-
if (NEED_MESSAGE(WARN_NON_UNICODE, ckWARN, WARN_PORTABLE)) {
2490-
pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
2493+
if (PACK_WARN(WARN_NON_UNICODE, ckWARN, WARN_PORTABLE)) {
24912494
if (cp_format) {
24922495
message = Perl_form(aTHX_ cp_format, input_uv);
24932496
}
@@ -2532,8 +2535,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
25322535
* enabled). */
25332536
error_flags_return |= this_flag_bit;
25342537
if (flags & UTF8_WARN_SUPER) {
2535-
if (NEED_MESSAGE(WARN_NON_UNICODE,,)) {
2536-
pack_warn = packWARN(WARN_NON_UNICODE);
2538+
if (PACK_WARN(WARN_NON_UNICODE,,)) {
25372539
if (cp_format) {
25382540
message = Perl_form(aTHX_ cp_format, input_uv);
25392541
}
@@ -2558,7 +2560,13 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
25582560
}
25592561

25602562
av_push(msgs_return,
2561-
newRV_noinc((SV*) new_msg_hv(message, pack_warn,
2563+
/* Negative 'pack_warn' really means 0 here. But
2564+
* this converts that to UTF-8 to preserve broken
2565+
* behavior depended upon by Encode. */
2566+
newRV_noinc((SV*) new_msg_hv(message,
2567+
((pack_warn <= 0)
2568+
? packWARN(WARN_UTF8)
2569+
: pack_warn),
25622570
this_flag_bit)));
25632571
}
25642572
else if (! (flags & UTF8_CHECK_ONLY)) {

0 commit comments

Comments
 (0)