@@ -1951,58 +1951,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
1951
1951
U32 pack_warn = packWARN (WARN_UTF8 );
1952
1952
1953
1953
switch (this_problem ) {
1954
- case UTF8_GOT_OVERFLOW :
1955
-
1956
- /* Overflow means also got a super and are using Perl's
1957
- * extended UTF-8, but we handle all three cases here */
1958
- possible_problems &= ~(UTF8_GOT_SUPER |UTF8_GOT_PERL_EXTENDED );
1959
- * errors |= UTF8_GOT_OVERFLOW ;
1960
- uv = UNICODE_REPLACEMENT ;
1961
-
1962
- /* But the API says we flag all errors found */
1963
- if (flags & (UTF8_WARN_SUPER |UTF8_DISALLOW_SUPER )) {
1964
- * errors |= UTF8_GOT_SUPER ;
1965
- }
1966
- if (flags
1967
- & (UTF8_WARN_PERL_EXTENDED |UTF8_DISALLOW_PERL_EXTENDED ))
1968
- {
1969
- * errors |= UTF8_GOT_PERL_EXTENDED ;
1970
- }
1971
-
1972
- /* Disallow if any of the three categories say to */
1973
- if ( ! (flags & UTF8_ALLOW_OVERFLOW )
1974
- || (flags & ( UTF8_DISALLOW_SUPER
1975
- |UTF8_DISALLOW_PERL_EXTENDED )))
1976
- {
1977
- disallowed = TRUE;
1978
- }
1979
-
1980
- /* Likewise, warn if any say to */
1981
- if ( ! (flags & UTF8_ALLOW_OVERFLOW )
1982
- || (flags & (UTF8_WARN_SUPER |UTF8_WARN_PERL_EXTENDED )))
1983
- {
1984
-
1985
- /* Overflow is a hybrid. If the word size on this platform
1986
- * were wide enough for this to not overflow, a non-Unicode
1987
- * code point would have been generated. If the caller
1988
- * wanted warnings for such code points, the warning
1989
- * category would be WARN_NON_UNICODE, On the other hand,
1990
- * overflow is considered a malformation, which is serious,
1991
- * and the category would be just WARN_UTF8. We clearly
1992
- * should warn if either category is enabled, but which
1993
- * category to use? Historically, we've used 'utf8' if it
1994
- * is enabled; and that seems like the more severe
1995
- * category, more befitting a malformation. */
1996
- pack_warn = NEED_MESSAGE (WARN_UTF8 ,
1997
- ckWARN_d , WARN_NON_UNICODE );
1998
- if (pack_warn ) {
1999
- message = Perl_form (aTHX_ "%s: %s (overflows)" ,
2000
- malformed_text ,
2001
- _byte_dump_string (s0 , curlen , 0 ));
2002
- this_flag_bit = UTF8_GOT_OVERFLOW ;
2003
- }
2004
- }
2005
-
1954
+ default :
1955
+ Perl_croak (aTHX_ "panic: Unexpected case value in "
1956
+ " utf8n_to_uvchr_msgs() %d" , this_problem );
1957
+ /* NOTREACHED */
2006
1958
break ;
2007
1959
2008
1960
case UTF8_GOT_EMPTY :
@@ -2117,78 +2069,6 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2117
2069
2118
2070
break ;
2119
2071
2120
- case UTF8_GOT_SUPER :
2121
-
2122
- if (flags & UTF8_WARN_SUPER ) {
2123
- * errors |= UTF8_GOT_SUPER ;
2124
-
2125
- if (NEED_MESSAGE (WARN_NON_UNICODE ,,)) {
2126
- pack_warn = packWARN (WARN_NON_UNICODE );
2127
-
2128
- if (orig_problems & UTF8_GOT_TOO_SHORT ) {
2129
- message = Perl_form (aTHX_
2130
- "Any UTF-8 sequence that starts with"
2131
- " \"%s\" is for a non-Unicode code point,"
2132
- " may not be portable" ,
2133
- _byte_dump_string (s0 , curlen , 0 ));
2134
- }
2135
- else {
2136
- message = Perl_form (aTHX_ super_cp_format , uv );
2137
- }
2138
- this_flag_bit = UTF8_GOT_SUPER ;
2139
- }
2140
- }
2141
-
2142
- /* Test for Perl's extended UTF-8 after the regular SUPER ones,
2143
- * and before possibly bailing out, so that the more dire
2144
- * warning will override the regular one. */
2145
- if (UNLIKELY (UTF8_IS_PERL_EXTENDED (s0 ))) {
2146
- if ( (flags & (UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER ))
2147
- && NEED_MESSAGE (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE ))
2148
- {
2149
- pack_warn = packWARN2 (WARN_NON_UNICODE , WARN_PORTABLE );
2150
-
2151
- /* If it is an overlong that evaluates to a code point
2152
- * that doesn't have to use the Perl extended UTF-8, it
2153
- * still used it, and so we output a message that
2154
- * doesn't refer to the code point. The same is true
2155
- * if there was a SHORT malformation where the code
2156
- * point is not valid. In that case, 'uv' will have
2157
- * been set to the REPLACEMENT CHAR, and the message
2158
- * below without the code point in it will be selected
2159
- * */
2160
- if (UNICODE_IS_PERL_EXTENDED (uv )) {
2161
- message = Perl_form (aTHX_
2162
- PL_extended_cp_format , uv );
2163
- }
2164
- else {
2165
- message = Perl_form (aTHX_
2166
- "Any UTF-8 sequence that starts with"
2167
- " \"%s\" is a Perl extension, and"
2168
- " so is not portable" ,
2169
- _byte_dump_string (s0 , curlen , 0 ));
2170
- }
2171
- this_flag_bit = UTF8_GOT_PERL_EXTENDED ;
2172
- }
2173
-
2174
- if (flags & ( UTF8_WARN_PERL_EXTENDED
2175
- |UTF8_DISALLOW_PERL_EXTENDED ))
2176
- {
2177
- * errors |= UTF8_GOT_PERL_EXTENDED ;
2178
-
2179
- if (flags & UTF8_DISALLOW_PERL_EXTENDED ) {
2180
- disallowed = TRUE;
2181
- }
2182
- }
2183
- }
2184
-
2185
- if (flags & UTF8_DISALLOW_SUPER ) {
2186
- * errors |= UTF8_GOT_SUPER ;
2187
- disallowed = TRUE;
2188
- }
2189
-
2190
- break ;
2191
-
2192
2072
case UTF8_GOT_NONCHAR :
2193
2073
2194
2074
if (flags & UTF8_WARN_NONCHAR ) {
@@ -2276,11 +2156,131 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
2276
2156
}
2277
2157
2278
2158
break ;
2159
+ case UTF8_GOT_OVERFLOW :
2279
2160
2280
- default :
2281
- Perl_croak (aTHX_ "panic: Unexpected case value in "
2282
- " utf8n_to_uvchr_msgs() %d" , this_problem );
2283
- /* NOTREACHED */
2161
+ /* Overflow means also got a super and are using Perl's
2162
+ * extended UTF-8, but we handle all three cases here */
2163
+ possible_problems &= ~(UTF8_GOT_SUPER |UTF8_GOT_PERL_EXTENDED );
2164
+ * errors |= UTF8_GOT_OVERFLOW ;
2165
+ uv = UNICODE_REPLACEMENT ;
2166
+
2167
+ /* But the API says we flag all errors found */
2168
+ if (flags & (UTF8_WARN_SUPER |UTF8_DISALLOW_SUPER )) {
2169
+ * errors |= UTF8_GOT_SUPER ;
2170
+ }
2171
+ if (flags
2172
+ & (UTF8_WARN_PERL_EXTENDED |UTF8_DISALLOW_PERL_EXTENDED ))
2173
+ {
2174
+ * errors |= UTF8_GOT_PERL_EXTENDED ;
2175
+ }
2176
+
2177
+ /* Disallow if any of the three categories say to */
2178
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW )
2179
+ || (flags & ( UTF8_DISALLOW_SUPER
2180
+ |UTF8_DISALLOW_PERL_EXTENDED )))
2181
+ {
2182
+ disallowed = TRUE;
2183
+ }
2184
+
2185
+ /* Likewise, warn if any say to */
2186
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW )
2187
+ || (flags & (UTF8_WARN_SUPER |UTF8_WARN_PERL_EXTENDED )))
2188
+ {
2189
+
2190
+ /* Overflow is a hybrid. If the word size on this platform
2191
+ * were wide enough for this to not overflow, a non-Unicode
2192
+ * code point would have been generated. If the caller
2193
+ * wanted warnings for such code points, the warning
2194
+ * category would be WARN_NON_UNICODE, On the other hand,
2195
+ * overflow is considered a malformation, which is serious,
2196
+ * and the category would be just WARN_UTF8. We clearly
2197
+ * should warn if either category is enabled, but which
2198
+ * category to use? Historically, we've used 'utf8' if it
2199
+ * is enabled; and that seems like the more severe
2200
+ * category, more befitting a malformation. */
2201
+ pack_warn = NEED_MESSAGE (WARN_UTF8 ,
2202
+ ckWARN_d , WARN_NON_UNICODE );
2203
+ if (pack_warn ) {
2204
+ message = Perl_form (aTHX_ "%s: %s (overflows)" ,
2205
+ malformed_text ,
2206
+ _byte_dump_string (s0 , curlen , 0 ));
2207
+ this_flag_bit = UTF8_GOT_OVERFLOW ;
2208
+ }
2209
+ }
2210
+
2211
+ break ;
2212
+
2213
+ case UTF8_GOT_SUPER :
2214
+
2215
+ if (flags & UTF8_WARN_SUPER ) {
2216
+ * errors |= UTF8_GOT_SUPER ;
2217
+
2218
+ if (NEED_MESSAGE (WARN_NON_UNICODE ,,)) {
2219
+ pack_warn = packWARN (WARN_NON_UNICODE );
2220
+
2221
+ if (orig_problems & UTF8_GOT_TOO_SHORT ) {
2222
+ message = Perl_form (aTHX_
2223
+ "Any UTF-8 sequence that starts with"
2224
+ " \"%s\" is for a non-Unicode code point,"
2225
+ " may not be portable" ,
2226
+ _byte_dump_string (s0 , curlen , 0 ));
2227
+ }
2228
+ else {
2229
+ message = Perl_form (aTHX_ super_cp_format , uv );
2230
+ }
2231
+ this_flag_bit = UTF8_GOT_SUPER ;
2232
+ }
2233
+ }
2234
+
2235
+ /* Test for Perl's extended UTF-8 after the regular SUPER ones,
2236
+ * and before possibly bailing out, so that the more dire
2237
+ * warning will override the regular one. */
2238
+ if (UNLIKELY (UTF8_IS_PERL_EXTENDED (s0 ))) {
2239
+ if ( (flags & (UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER ))
2240
+ && NEED_MESSAGE (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE ))
2241
+ {
2242
+ pack_warn = packWARN2 (WARN_NON_UNICODE , WARN_PORTABLE );
2243
+
2244
+ /* If it is an overlong that evaluates to a code point
2245
+ * that doesn't have to use the Perl extended UTF-8, it
2246
+ * still used it, and so we output a message that
2247
+ * doesn't refer to the code point. The same is true
2248
+ * if there was a SHORT malformation where the code
2249
+ * point is not valid. In that case, 'uv' will have
2250
+ * been set to the REPLACEMENT CHAR, and the message
2251
+ * below without the code point in it will be selected
2252
+ * */
2253
+ if (UNICODE_IS_PERL_EXTENDED (uv )) {
2254
+ message = Perl_form (aTHX_
2255
+ PL_extended_cp_format , uv );
2256
+ }
2257
+ else {
2258
+ message = Perl_form (aTHX_
2259
+ "Any UTF-8 sequence that starts with"
2260
+ " \"%s\" is a Perl extension, and"
2261
+ " so is not portable" ,
2262
+ _byte_dump_string (s0 , curlen , 0 ));
2263
+ }
2264
+ this_flag_bit = UTF8_GOT_PERL_EXTENDED ;
2265
+ }
2266
+
2267
+ if (flags & ( UTF8_WARN_PERL_EXTENDED
2268
+ |UTF8_DISALLOW_PERL_EXTENDED ))
2269
+ {
2270
+ * errors |= UTF8_GOT_PERL_EXTENDED ;
2271
+
2272
+ if (flags & UTF8_DISALLOW_PERL_EXTENDED ) {
2273
+ disallowed = TRUE;
2274
+ }
2275
+ }
2276
+ }
2277
+
2278
+ if (flags & UTF8_DISALLOW_SUPER ) {
2279
+ * errors |= UTF8_GOT_SUPER ;
2280
+ disallowed = TRUE;
2281
+ }
2282
+
2283
+ break ;
2284
2284
2285
2285
} /* End of switch() on the possible problems */
2286
2286
0 commit comments