@@ -1371,12 +1371,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1371
1371
U32 discard_errors ; /* Used to save branches when 'errors' is NULL; this
1372
1372
gets set and discarded */
1373
1373
1374
- /* The below are used only if there is both an overlong malformation and a
1375
- * too short one. Otherwise the first two are set to 's0' and 'send', and
1376
- * the third not used at all */
1377
- U8 * adjusted_s0 ;
1378
- U8 temp_char_buf [UTF8_MAXBYTES + 1 ]; /* Used to avoid a Newx in this
1379
- routine; see [perl #130921] */
1380
1374
dTHX ;
1381
1375
1382
1376
PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER ;
@@ -1419,7 +1413,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1419
1413
expectlen = 0 ;
1420
1414
avail_len = 0 ;
1421
1415
discard_errors = 0 ;
1422
- adjusted_s0 = (U8 * ) s0 ;
1423
1416
1424
1417
if (errors ) {
1425
1418
* errors = 0 ;
@@ -1549,37 +1542,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1549
1542
|| (UNLIKELY (0 < is_utf8_overlong (s0 , s - s0 ))))))
1550
1543
{
1551
1544
possible_problems |= UTF8_GOT_LONG ;
1552
-
1553
- if ( UNLIKELY ( possible_problems & UTF8_GOT_TOO_SHORT )
1554
-
1555
- /* The calculation in the 'true' branch of this 'if'
1556
- * below won't work if overflows, and isn't needed
1557
- * anyway. Further below we handle all overflow
1558
- * cases */
1559
- && LIKELY (! (possible_problems & UTF8_GOT_OVERFLOW )))
1560
- {
1561
- UV min_uv = uv ;
1562
- STRLEN i ;
1563
-
1564
- /* Here, the input is both overlong and is missing some trailing
1565
- * bytes. There is no single code point it could be for, but there
1566
- * may be enough information present to determine if what we have
1567
- * so far is for an unallowed code point, such as for a surrogate.
1568
- * The code further below has the intelligence to determine this,
1569
- * but just for non-overlong UTF-8 sequences. What we do here is
1570
- * calculate the smallest code point the input could represent if
1571
- * there were no too short malformation. Then we compute and save
1572
- * the UTF-8 for that, which is what the code below looks at
1573
- * instead of the raw input. It turns out that the smallest such
1574
- * code point is all we need. */
1575
- for (i = curlen ; i < expectlen ; i ++ ) {
1576
- min_uv = UTF8_ACCUMULATE (min_uv ,
1577
- I8_TO_NATIVE_UTF8 (UTF_MIN_CONTINUATION_BYTE ));
1578
- }
1579
-
1580
- adjusted_s0 = temp_char_buf ;
1581
- (void ) uvoffuni_to_utf8_flags (adjusted_s0 , min_uv , 0 );
1582
- }
1583
1545
}
1584
1546
1585
1547
/* Here, we have found all the possible problems, except for when the input
@@ -1604,38 +1566,58 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1604
1566
/* See if the input has malformations besides possibly overlong */
1605
1567
if (UNLIKELY (possible_problems & ~UTF8_GOT_LONG )) {
1606
1568
1607
- /* Here, there is a malformation other than overlong, we need to
1608
- look at the source UTF-8, possibly adjusted to be non-overlong */
1609
- if ( isUTF8_POSSIBLY_PROBLEMATIC (* adjusted_s0 )
1610
- || UNLIKELY (UTF8_IS_PERL_EXTENDED (s0 )))
1611
- {
1612
- if (UNLIKELY (NATIVE_UTF8_TO_I8 (* adjusted_s0 )
1613
- > UTF_START_BYTE_110000_ ))
1614
- {
1615
- possible_problems |= UTF8_GOT_SUPER ;
1616
- }
1617
- else if (curlen > 1 ) {
1618
- if (UNLIKELY ( NATIVE_UTF8_TO_I8 (* adjusted_s0 )
1619
- == UTF_START_BYTE_110000_
1620
- && NATIVE_UTF8_TO_I8 (* (adjusted_s0 + 1 ))
1621
- >= UTF_FIRST_CONT_BYTE_110000_ ))
1622
- {
1623
- possible_problems |= UTF8_GOT_SUPER ;
1624
- }
1625
- else if (UNLIKELY (is_SURROGATE_utf8 (adjusted_s0 ))) {
1626
- possible_problems |= UTF8_GOT_SURROGATE ;
1627
- }
1628
- }
1569
+ /* Here, the input is malformed in some way besides possibly
1570
+ * overlong, except it doesn't overflow. If you look at the
1571
+ * code above, to get here, it must be a too short string,
1572
+ * possibly overlong besides. */
1573
+ assert (possible_problems & UTF8_GOT_TOO_SHORT );
1629
1574
1630
- /* We need a complete well-formed UTF-8 character to discern
1631
- * non-characters, so can't look for them here */
1575
+ /* There is no single code point it could be for, but there may
1576
+ * be enough information present to determine if what we have
1577
+ * so far would, if filled out completely, be for one of these
1578
+ * problematic code points we are being asked to check for.
1579
+ *
1580
+ * The range of surrogates is
1581
+ * ASCII platforms EBCDIC I8
1582
+ * "\xed\xa0\x80" "\xf1\xb6\xa0\xa0"
1583
+ * to "\xed\xbf\xbf". "\xf1\xb7\xbf\xbf"
1584
+ *
1585
+ * (Continuation byte range):
1586
+ * \x80 to \xbf \xa0 to \xbf
1587
+ *
1588
+ * In both cases, if we have the first two bytes, we can tell
1589
+ * if it is a surrogate or not. If we have only one byte, we
1590
+ * can't tell, so we have to assume it isn't a surrogate.
1591
+ *
1592
+ * It is more complicated for supers due to the possibility of
1593
+ * overlongs. For example, in ASCII, the first non-Unicode code
1594
+ * point is represented by the sequence \xf4\x90\x80\x80, so
1595
+ * \xf8\x80\x80\x81\x81 looks like it is for a much bigger code
1596
+ * point. But it is in fact an overlong representation of the
1597
+ * letter "A".
1598
+ *
1599
+ * So what we do is calculate the smallest code point the input
1600
+ * could represent if there were no too short malformation.
1601
+ * This is done by pretending the input was filled out to its
1602
+ * full length with occurrences of the smallest continuation
1603
+ * byte. For surrogates we could just look at the bytes, but
1604
+ * this single algorithm works for both those and supers.
1605
+ *
1606
+ * To determine if a code point is a non-character, we need all
1607
+ * bytes, so this effort is wasted if the caller is looking for
1608
+ * just those, but that is unlikely; the two official Unicode
1609
+ * restrictions include the other two. */
1610
+ for (unsigned i = curlen ; i < expectlen ; i ++ ) {
1611
+ uv = UTF8_ACCUMULATE (uv ,
1612
+ I8_TO_NATIVE_UTF8 (UTF_MIN_CONTINUATION_BYTE ));
1613
+ }
1632
1614
}
1633
- }
1634
- else
1635
1615
1636
- /* Here there were no malformations, or the only malformation is an
1637
- * overlong, 'uv' is valid, and the 'if' above made sure that it
1638
- * could be problematic */
1616
+ /* Here 'uv' is as valid as it can get. Perhaps it was valid all
1617
+ * along because there were no malformations, or the only
1618
+ * malformation is an overlong (which allows it to be fully
1619
+ * computed). Or it may have been "cured" as best it can be by the
1620
+ * loop just above. */
1639
1621
if (isUNICODE_POSSIBLY_PROBLEMATIC (uv )) {
1640
1622
if (UNLIKELY (UNICODE_IS_SURROGATE (uv ))) {
1641
1623
possible_problems |= UTF8_GOT_SURROGATE ;
0 commit comments