@@ -1735,7 +1735,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
17351735 int overlong_detect_length = 0 ;
17361736
17371737 /* Gives how many bytes are available, which may turn out to be less than
1738- * the expected length */
1738+ * (but never more than) the expected length */
17391739 Size_t avail_len ;
17401740
17411741 /* The ending position, plus 1, of the first character in the sequence
@@ -1756,7 +1756,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
17561756 avail_len = send - s0 ;
17571757
17581758 /* We now know we can examine the first byte of the input. A continuation
1759- * character can't start a valid sequence */
1759+ * byte can't start a valid sequence */
17601760 if (UNLIKELY (UTF8_IS_CONTINUATION (* s0 ))) {
17611761 possible_problems |= UTF8_GOT_CONTINUATION ;
17621762 curlen = 1 ;
@@ -1975,7 +1975,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
19751975 * expected to occupy, based on the value of the
19761976 * presumed start byte in s0. This will be 0 if the
19771977 * sequence is empty, or 1 if s0 isn't actually a
1978- * start byte.
1978+ * start byte. CAUTION: this could be beyond the end
1979+ * of the buffer.
19791980 * avail_len gives the number of bytes in the sequence this
19801981 * call can look at, one character's worth at most.
19811982 * curlen gives the number of bytes in the sequence that
@@ -2013,6 +2014,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
20132014 * 2) returning information about the problem to the caller in
20142015 * *errors and/or *msgs; and/or
20152016 * 3) raising appropriate warnings.
2017+ * 4) potentially croaking if the input is a forbidden sequence, and
2018+ * the flag that says to croak on those has been set.
20162019 *
20172020 * There are two main categories of potential problems.
20182021 *
@@ -2035,39 +2038,96 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
20352038 * otherwise the function returns the Unicode REPLACEMENT
20362039 * CHARACTER as the translation of these.
20372040 *
2041+ * These all have the same results unless flags are passed to
2042+ * change the behavior. Without flags the behavior is:
2043+ *
2044+ * 1) The function returns failure.
2045+ * 2) *cp_p is set to the REPLACEMENT_CHARACTER.
2046+ * 3) For each problem, a bit is set in *errors denoting the
2047+ * error, if errors is not NULL.
2048+ * 4) For each problem, an entry is generated in *msgs, if msgs
2049+ * is not NULL.
2050+ * 5) A warning is raised if msgs is NULL and the appropriate
2051+ * warning category(ies) are enabled.
2052+ *
2053+ * Various flags change the behavior:
2054+ *
2055+ * UTF8_FORCE_WARN_IF_MALFORMED is forbidden if msgs is not
2056+ * NULL, and is ignored if UTF8_CHECK_ONLY is also
2057+ * specified; otherwise it turns on all warning
2058+ * categories for the duration of the function.
2059+ *
2060+ * UTF8_DIE_IF_MALFORMED is forbidden if msgs is not NULL;
2061+ * otherwise it acts as if UTF8_FORCE_WARN_IF_MALFORMED
2062+ * has also been specified, and also croaks rather than
2063+ * returning.
2064+ *
2065+ * UTF8_CHECK_ONLY is ignored if msgs is not NULL or if
2066+ * UTF8_DIE_IF_MALFORMED is also set; otherwise it
2067+ * suppresses any warnings; behaviors 1) through 4) above
2068+ * are unchanged.
2069+ *
2070+ * Also there is a flag associated with each possible condition,
2071+ * for example, UTF8_ALLOW_LONG. If set, the behavior is modified
2072+ * so that the corresponding condition:
2073+ * 1) doesn't cause the function to return failure
2074+ * 2) the REPLACEMENT_CHARACTER is still stored in *cp_p,
2075+ * except for the flag UTF8_ALLOW_LONG_AND_ITS_VALUE,
2076+ * which returns the calculated code point, even if plain
2077+ * UTF8_ALLOW_LONG is also set.
2078+ * 3) *errors still has a bit set.
2079+ * 4) no entry is generated in *msgs.
2080+ * 5) no warning is raised
2081+ *
2082+ * Note that this means the UTF8_CHECK_ONLY flag has the same
2083+ * effect on warnings as passing an ALLOW flag for every condition.
2084+ *
2085+ * Note also that an entry is placed in *errors for each condition
2086+ * found, regardless of the other flags. The caller can rely on
2087+ * this being an accurate accounting of all conditions found, even
2088+ * if they aren't otherwise reported.
2089+ *
20382090 * b) The other type is by default not considered to be a problem.
2039- * These are for when the input was syntactically valid
2040- * Perl- extended-UTF-8 for a code point that is representable on
2091+ * These are for when the input was syntactically valid UTF-8 (as
2092+ * extended by Perl) for a code point that is representable on
20412093 * this platform, but that code point isn't considered by Unicode
2042- * to be freely exchangeable between applications. To get here,
2043- * code earlier in this function has determined both that this
2044- * sequence is for such a code point, and that the 'flags'
2045- * parameter indicates that these are to be considered
2046- * problematic, meaning this sequence should be rejected, merely
2047- * warned about, or both. *errors will be set for each of these.
2048- *
2049- * If the caller to this function has set the corresponding
2050- * DISALLOW bit in 'flags', the translation of this sequence will
2051- * be the Unicode REPLACEMENT CHARACTER.
2094+ * to be freely exchangeable between applications.
20522095 *
2053- * If the caller to this function has set the corresponding WARN
2054- * bit in 'flags' potentially a warning message will be generated,
2055- * using the rules common to both types of problems, and detailed
2056- * below.
2096+ * The 'flags' parameter to this function must contain an
2097+ * appropriate set bit in order for this function to consider them
2098+ * to be problems. And to get here, code earlier in this function
2099+ * has determined one of those flags applies to this sequence.
2100+ * This means that we know already that this input is problematic,
2101+ * unlike the type a) items.
20572102 *
2058- * In all cases the corresponding bit in *errors is set. This is
2059- * in contrast to the other type of problem where the input
2060- * 'flags' affect if the bit is set or not.
2103+ * Each of these problematic sequences has two independent flags
2104+ * associated with it. The DISALLOW flag causes this code point
2105+ * to be rejected; the WARN flag causes an attempt to raise a
2106+ * warning about it. To do both, specify both flags. This is
2107+ * different from the type a) items, where the ALLOW flag affects
2108+ * both the rejection and warning. The same 5 actions as type a)
2109+ * have to be done, but the conditions differ. The actions when
2110+ * the UTF8_CHECK_ONLY flag is not included are:
20612111 *
2062- * The default is to generate a warning for each of these. If the
2063- * input 'flags' has a corresponding ALLOW flag, warnings are
2064- * suppressed. The only other thing the ALLOW flags do is
2065- * determine if the function returns sucess or failure
2112+ * 1) If the DISALLOW flag is set, the function returns failure,
2113+ * or croaks if the UTF8_DIE_IF_MALFORMED flag is included.
2114+ * 2) If the DISALLOW flag is set, the REPLACEMENT_CHARACTER is
2115+ * substituted for the returned code point.
2116+ * 3) A bit is set in *errors if errors is not NULL.
2117+ * 4) An entry in *msgs is generated if msgs is not NULL. Since
2118+ * to get here, we know the input is problematic, an entry is
2119+ * unconditionally made. The warnings category for it will be
2120+ * zero if neither the corresponding WARN flag nor the
2121+ * UTF8_FORCE_WARN_IF_MALFORMED flag is included.
2122+ * 5) A warning is raised if msgs is NULL and either:
2123+ * i) the flag UTF8_FORCE_WARN_IF_MALFORMED is included; or
2124+ * ii) the corresponding WARN flag is included, and the
2125+ * appropriate warning category(ies) are enabled.
20662126 *
2067- * For both types of problems, if warnings are called for by the input
2068- * flags, also setting the UTF8_CHECK_ONLY flag overrides
2069- * generating them. If 'msgs' is not NULL, they all will be returned
2070- * there; otherwise they will be raised if warnings are enabled .
2127+ * Including the UTF8_CHECK_ONLY flag has no effect if
2128+ * UTF8_DIE_IF_MALFORMED is also included; otherwise it changes
2129+ * the above actions only to not do 5); so no warnings get
2130+ * generated .
20712131 */
20722132
20732133 bool disallowed = FALSE;
@@ -2169,7 +2229,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
21692229 }
21702230
21712231 /* The code is structured so that there is a case: in a switch()
2172- * for each problem type, so as to handle the different details of
2232+ * for each condition type, so as to handle the different details of
21732233 * each. The only common part after setting things up is the
21742234 * handling of any generated warning message. That means that if a
21752235 * case: finds there is no message, it can 'continue' to the next
@@ -2222,7 +2282,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22222282 malformed_text ,
22232283 _byte_dump_string (s0 , send - s0 , 0 ),
22242284 (int )avail_len ,
2225- avail_len == 1 ? "" : "s" ,
2285+ avail_len == 1 ? "" : "s" , /* Pluralize */
22262286 (int )expectlen );
22272287 break ;
22282288
@@ -2324,8 +2384,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23242384 COMMON_DEFAULT_ACCEPTEDS (UTF8_WARN_SURROGATE ,
23252385 WARN_SURROGATE ,,);
23262386
2327- /* These are the only errors that can occur with a
2328- * surrogate when the 'input_uv' isn't valid */
2387+ /* This is the only error that can occur with a
2388+ * surrogate when the 'input_uv' isn't valid */
23292389 if (orig_problems & UTF8_GOT_TOO_SHORT ) {
23302390 message = Perl_form (aTHX_
23312391 "UTF-16 surrogate (any UTF-8 sequence that"
0 commit comments