@@ -1735,7 +1735,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1735
1735
int overlong_detect_length = 0 ;
1736
1736
1737
1737
/* Gives how many bytes are available, which may turn out to be less than
1738
- * the expected length */
1738
+ * (but never more than) the expected length */
1739
1739
Size_t avail_len ;
1740
1740
1741
1741
/* The ending position, plus 1, of the first character in the sequence
@@ -1756,7 +1756,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1756
1756
avail_len = send - s0 ;
1757
1757
1758
1758
/* We now know we can examine the first byte of the input. A continuation
1759
- * character can't start a valid sequence */
1759
+ * byte can't start a valid sequence */
1760
1760
if (UNLIKELY (UTF8_IS_CONTINUATION (* s0 ))) {
1761
1761
possible_problems |= UTF8_GOT_CONTINUATION ;
1762
1762
curlen = 1 ;
@@ -1975,7 +1975,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1975
1975
* expected to occupy, based on the value of the
1976
1976
* presumed start byte in s0. This will be 0 if the
1977
1977
* sequence is empty, or 1 if s0 isn't actually a
1978
- * start byte.
1978
+ * start byte. CAUTION: this length could extend past
1979
+ * the end of the buffer.
1979
1980
* avail_len gives the number of bytes in the sequence this
1980
1981
* call can look at, one character's worth at most.
1981
1982
* curlen gives the number of bytes in the sequence that
@@ -2013,6 +2014,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2013
2014
* 2) returning information about the problem to the caller in
2014
2015
* *errors and/or *msgs; and/or
2015
2016
* 3) raising appropriate warnings; and/or
2017
+ * 4) potentially croaking, if the input is a forbidden sequence and
2018
+ * the flag requesting a croak on those has been set.
2016
2019
*
2017
2020
* There are two main categories of potential problems.
2018
2021
*
@@ -2035,39 +2038,96 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2035
2038
* otherwise the function returns the Unicode REPLACEMENT
2036
2039
* CHARACTER as the translation of these.
2037
2040
*
2041
+ * These all have the same results unless flags are passed to
2042
+ * change the behavior. Without flags the behavior is:
2043
+ *
2044
+ * 1) The function returns failure.
2045
+ * 2) *cp_p is set to the REPLACEMENT_CHARACTER.
2046
+ * 3) For each problem, a bit is set in *errors denoting the
2047
+ * error, if errors is not NULL.
2048
+ * 4) For each problem, an entry is generated in *msgs, if msgs
2049
+ * is not NULL.
2050
+ * 5) A warning is raised if msgs is NULL and the appropriate
2051
+ * warning category(ies) are enabled.
2052
+ *
2053
+ * Various flags change the behavior:
2054
+ *
2055
+ * UTF8_FORCE_WARN_IF_MALFORMED is forbidden if msgs is not
2056
+ * NULL, and is ignored if UTF8_CHECK_ONLY is also
2057
+ * specified; otherwise it turns on all warnings
2058
+ * categories for the duration of the function.
2059
+ *
2060
+ * UTF8_DIE_IF_MALFORMED is forbidden if msgs is not NULL;
2061
+ * otherwise it acts as if UTF8_FORCE_WARN_IF_MALFORMED
2062
+ * has also been specified, and also croaks rather than
2063
+ * returning.
2064
+ *
2065
+ * UTF8_CHECK_ONLY is ignored if msgs is not NULL or if
2066
+ * UTF8_DIE_IF_MALFORMED is also set; otherwise it
2067
+ * suppresses any warnings; behaviors 1) through 4) above
2068
+ * are unchanged.
2069
+ *
2070
+ * Also there is a flag associated with each possible condition,
2071
+ * for example, UTF8_ALLOW_LONG. If set, the behavior is modified
2072
+ * so that the corresponding condition:
2073
+ * 1) doesn't cause the function to return failure
2074
+ * 2) the REPLACEMENT_CHARACTER is still stored in *cp_p,
2075
+ * except for the flag UTF8_ALLOW_LONG_AND_ITS_VALUE,
2076
+ * which returns the calculated code point, even if plain
2077
+ * UTF8_ALLOW_LONG is also set.
2078
+ * 3) *errors still has a bit set.
2079
+ * 4) no entry is generated in *msgs.
2080
+ * 5) no warning is raised.
2081
+ *
2082
+ * Note that this means the UTF8_CHECK_ONLY flag has the same
2083
+ * effect on warnings as passing an ALLOW flag for every condition.
2084
+ *
2085
+ * Note also that an entry is placed in *errors for each condition
2086
+ * found, regardless of the other flags. The caller can rely on
2087
+ * this being an accurate accounting of all conditions found, even
2088
+ * if they aren't otherwise reported.
2089
+ *
2038
2090
* b) The other type is by default not considered to be a problem.
2039
- * These are for when the input was syntactically valid
2040
- * Perl- extended-UTF-8 for a code point that is representable on
2091
+ * These are for when the input was syntactically valid UTF-8 (as
2092
+ * extended by Perl) for a code point that is representable on
2041
2093
* this platform, but that code point isn't considered by Unicode
2042
- * to be freely exchangeable between applications. To get here,
2043
- * code earlier in this function has determined both that this
2044
- * sequence is for such a code point, and that the 'flags'
2045
- * parameter indicates that these are to be considered
2046
- * problematic, meaning this sequence should be rejected, merely
2047
- * warned about, or both. *errors will be set for each of these.
2048
- *
2049
- * If the caller to this function has set the corresponding
2050
- * DISALLOW bit in 'flags', the translation of this sequence will
2051
- * be the Unicode REPLACEMENT CHARACTER.
2094
+ * to be freely exchangeable between applications.
2052
2095
*
2053
- * If the caller to this function has set the corresponding WARN
2054
- * bit in 'flags' potentially a warning message will be generated,
2055
- * using the rules common to both types of problems, and detailed
2056
- * below.
2096
+ * The 'flags' parameter to this function must contain an
2097
+ * appropriate set bit in order for this function to consider them
2098
+ * to be problems. And to get here, code earlier in this function
2099
+ * has determined one of those flags applies to this sequence.
2100
+ * This means that we know already that this input is problematic,
2101
+ * unlike the type a) items.
2057
2102
*
2058
- * In all cases the corresponding bit in *errors is set. This is
2059
- * in contrast to the other type of problem where the input
2060
- * 'flags' affect if the bit is set or not.
2103
+ * Each of these problematic sequences has two independent flags
2104
+ * associated with it. The DISALLOW flag causes this code point
2105
+ * to be rejected; the WARN flag causes the function to try to raise a
2106
+ * warning about it. To do both, specify both flags. This is
2107
+ * different from the type a) items, where the ALLOW flag affects
2108
+ * both the rejection and warning. The same 5 actions as type a)
2109
+ * have to be done, but the conditions differ. The actions when
2110
+ * the UTF8_CHECK_ONLY flag is not included are:
2061
2111
*
2062
- * The default is to generate a warning for each of these. If the
2063
- * input 'flags' has a corresponding ALLOW flag, warnings are
2064
- * suppressed. The only other thing the ALLOW flags do is
2065
- * determine if the function returns sucess or failure
2112
+ * 1) If the DISALLOW flag is set, the function returns failure,
2113
+ * or croaks if the UTF8_DIE_IF_MALFORMED flag is included.
2114
+ * 2) If the DISALLOW flag is set, the REPLACEMENT_CHARACTER is
2115
+ * substituted for the returned code point.
2116
+ * 3) A bit is set in *errors if errors is not NULL.
2117
+ * 4) An entry in *msgs is generated if msgs is not NULL. Since
2118
+ * to get here, we know the input is problematic, an entry is
2119
+ * unconditionally made. The warnings category for it will be
2120
+ * zero if neither the corresponding WARN flag nor the
2121
+ * UTF8_FORCE_WARN_IF_MALFORMED flag is included.
2122
+ * 5) A warning is raised if msgs is NULL and either:
2123
+ * i) the flag UTF8_FORCE_WARN_IF_MALFORMED is included; or
2124
+ * ii) the corresponding WARN flag is included, and the
2125
+ * appropriate warning category(ies) are enabled.
2066
2126
*
2067
- * For both types of problems, if warnings are called for by the input
2068
- * flags, also setting the UTF8_CHECK_ONLY flag overrides
2069
- * generating them. If 'msgs' is not NULL, they all will be returned
2070
- * there; otherwise they will be raised if warnings are enabled .
2127
+ * Including the UTF8_CHECK_ONLY flag has no effect if the
2128
+ * UTF8_DIE_IF_MALFORMED flag is also included; otherwise it changes
2129
+ * the above actions only to not do 5); so no warnings get
2130
+ * generated.
2071
2131
*/
2072
2132
2073
2133
bool disallowed = FALSE;
@@ -2169,7 +2229,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2169
2229
}
2170
2230
2171
2231
/* The code is structured so that there is a case: in a switch()
2172
- * for each problem type, so as to handle the different details of
2232
+ * for each condition type, so as to handle the different details of
2173
2233
* each. The only common part after setting things up is the
2174
2234
* handling of any generated warning message. That means that if a
2175
2235
* case: finds there is no message, it can 'continue' to the next
@@ -2222,7 +2282,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2222
2282
malformed_text ,
2223
2283
_byte_dump_string (s0 , send - s0 , 0 ),
2224
2284
(int )avail_len ,
2225
- avail_len == 1 ? "" : "s" ,
2285
+ avail_len == 1 ? "" : "s" , /* Pluralize */
2226
2286
(int )expectlen );
2227
2287
break ;
2228
2288
@@ -2324,8 +2384,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2324
2384
COMMON_DEFAULT_ACCEPTEDS (UTF8_WARN_SURROGATE ,
2325
2385
WARN_SURROGATE ,,);
2326
2386
2327
- /* These are the only errors that can occur with a
2328
- * surrogate when the 'input_uv' isn't valid */
2387
+ /* This is the only error that can occur with a
2388
+ * surrogate when the 'input_uv' isn't valid */
2329
2389
if (orig_problems & UTF8_GOT_TOO_SHORT ) {
2330
2390
message = Perl_form (aTHX_
2331
2391
"UTF-16 surrogate (any UTF-8 sequence that"
0 commit comments