Skip to content

Commit cc8bd1e

Browse files
committed
utf8.c: Combine two blocks
It turns out that the work being done in the first block is only used in the second block. If the second block doesn't get executed, the first block's effort is thrown away. So fold the first block into the second. This allows a bunch of temporaries that were used to communicate between the blocks to be removed. More detailed comments are added.
1 parent f241a5b commit cc8bd1e

File tree

1 file changed

+49
-67
lines changed

1 file changed

+49
-67
lines changed

utf8.c

Lines changed: 49 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1371,12 +1371,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
13711371
U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
13721372
gets set and discarded */
13731373

1374-
/* The below are used only if there is both an overlong malformation and a
1375-
* too short one. Otherwise the first two are set to 's0' and 'send', and
1376-
* the third not used at all */
1377-
U8 * adjusted_s0;
1378-
U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1379-
routine; see [perl #130921] */
13801374
dTHX;
13811375

13821376
PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
@@ -1419,7 +1413,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
14191413
expectlen = 0;
14201414
avail_len = 0;
14211415
discard_errors = 0;
1422-
adjusted_s0 = (U8 *) s0;
14231416

14241417
if (errors) {
14251418
*errors = 0;
@@ -1549,37 +1542,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
15491542
|| (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
15501543
{
15511544
possible_problems |= UTF8_GOT_LONG;
1552-
1553-
if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
1554-
1555-
/* The calculation in the 'true' branch of this 'if'
1556-
* below won't work if the input overflows, and isn't needed
1557-
* anyway. Further below we handle all overflow
1558-
* cases */
1559-
&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1560-
{
1561-
UV min_uv = uv;
1562-
STRLEN i;
1563-
1564-
/* Here, the input is both overlong and is missing some trailing
1565-
* bytes. There is no single code point it could be for, but there
1566-
* may be enough information present to determine if what we have
1567-
* so far is for an unallowed code point, such as for a surrogate.
1568-
* The code further below has the intelligence to determine this,
1569-
* but just for non-overlong UTF-8 sequences. What we do here is
1570-
* calculate the smallest code point the input could represent if
1571-
* there were no too short malformation. Then we compute and save
1572-
* the UTF-8 for that, which is what the code below looks at
1573-
* instead of the raw input. It turns out that the smallest such
1574-
* code point is all we need. */
1575-
for (i = curlen; i < expectlen; i++) {
1576-
min_uv = UTF8_ACCUMULATE(min_uv,
1577-
I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1578-
}
1579-
1580-
adjusted_s0 = temp_char_buf;
1581-
(void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1582-
}
15831545
}
15841546

15851547
/* Here, we have found all the possible problems, except for when the input
@@ -1604,38 +1566,58 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
16041566
/* See if the input has malformations besides possibly overlong */
16051567
if (UNLIKELY(possible_problems & ~UTF8_GOT_LONG)) {
16061568

1607-
/* Here, there is a malformation other than overlong, we need to
1608-
look at the source UTF-8, possibly adjusted to be non-overlong */
1609-
if ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1610-
|| UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))
1611-
{
1612-
if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1613-
> UTF_START_BYTE_110000_))
1614-
{
1615-
possible_problems |= UTF8_GOT_SUPER;
1616-
}
1617-
else if (curlen > 1) {
1618-
if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0)
1619-
== UTF_START_BYTE_110000_
1620-
&& NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1621-
>= UTF_FIRST_CONT_BYTE_110000_))
1622-
{
1623-
possible_problems |= UTF8_GOT_SUPER;
1624-
}
1625-
else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
1626-
possible_problems |= UTF8_GOT_SURROGATE;
1627-
}
1628-
}
1569+
/* Here, the input is malformed in some way besides possibly
1570+
* overlong, except it doesn't overflow. If you look at the
1571+
* code above, to get here, it must be a too short string,
1572+
* possibly overlong besides. */
1573+
assert(possible_problems & UTF8_GOT_TOO_SHORT);
16291574

1630-
/* We need a complete well-formed UTF-8 character to discern
1631-
* non-characters, so can't look for them here */
1575+
/* There is no single code point it could be for, but there may
1576+
* be enough information present to determine if what we have
1577+
* so far would, if filled out completely, be for one of these
1578+
* problematic code points we are being asked to check for.
1579+
*
1580+
* The range of surrogates is
1581+
* ASCII platforms EBCDIC I8
1582+
* "\xed\xa0\x80" "\xf1\xb6\xa0\xa0"
1583+
* to "\xed\xbf\xbf". "\xf1\xb7\xbf\xbf"
1584+
*
1585+
* (Continuation byte range):
1586+
* \x80 to \xbf \xa0 to \xbf
1587+
*
1588+
* In both cases, if we have the first two bytes, we can tell
1589+
* if it is a surrogate or not. If we have only one byte, we
1590+
* can't tell, so we have to assume it isn't a surrogate.
1591+
*
1592+
* It is more complicated for supers due to the possibility of
1593+
* overlongs. For example, in ASCII, the first non-Unicode code
1594+
* point is represented by the sequence \xf4\x90\x80\x80, so
1595+
* \xf8\x80\x80\x81\x81 looks like it is for a much bigger code
1596+
* point. But it is in fact an overlong representation of the
1597+
* letter "A".
1598+
*
1599+
* So what we do is calculate the smallest code point the input
1600+
* could represent if there were no too short malformation.
1601+
* This is done by pretending the input was filled out to its
1602+
* full length with occurrences of the smallest continuation
1603+
* byte. For surrogates we could just look at the bytes, but
1604+
* this single algorithm works for both those and supers.
1605+
*
1606+
* To determine if a code point is a non-character, we need all
1607+
* bytes, so this effort is wasted if the caller is looking for
1608+
* just those, but that is unlikely; the two official Unicode
1609+
* restrictions include the other two. */
1610+
for (unsigned i = curlen; i < expectlen; i++) {
1611+
uv = UTF8_ACCUMULATE(uv,
1612+
I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1613+
}
16321614
}
1633-
}
1634-
else
16351615

1636-
/* Here there were no malformations, or the only malformation is an
1637-
* overlong, 'uv' is valid, and the 'if' above made sure that it
1638-
* could be problematic */
1616+
/* Here 'uv' is as valid as it can get. Perhaps it was valid all
1617+
* along because there were no malformations, or the only
1618+
* malformation is an overlong (which allows it to be fully
1619+
* computed). Or it may have been "cured" as best it can by the
1620+
* loop just above. */
16391621
if (isUNICODE_POSSIBLY_PROBLEMATIC(uv)) {
16401622
if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
16411623
possible_problems |= UTF8_GOT_SURROGATE;

0 commit comments

Comments
 (0)