Skip to content

Commit 354ba92

Browse files
committed
pp_uc: Avoid a conversion from UTF-8 to code point
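This commit makes regen/unicode_constants.pl generate the UTF-8 encoding of U+0345 (COMBINING GREEK YPOGEGRAMMENI) as a string constant, COMBINING_GREEK_YPOGEGRAMMENI_UTF8. When parsing a UTF-8 string, the code can then compare the input bytes directly against that constant, instead of first converting them to a numeric code point.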
1 parent fa8f275 commit 354ba92

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

pp.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4511,7 +4511,6 @@ PP_wrapped(pp_uc, 1, 0)
45114511
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
45124512

45134513
#define GREEK_CAPITAL_LETTER_IOTA 0x0399
4514-
#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
45154514
/* All occurrences of these are to be moved to follow any other marks.
45164515
* This is context-dependent. We may not be passed enough context to
45174516
* move the iota subscript beyond all of them, but we do the best we can
@@ -4552,9 +4551,8 @@ PP_wrapped(pp_uc, 1, 0)
45524551
#else
45534552
uv = _toUPPER_utf8_flags(s, send, tmpbuf, &upper_len, 0);
45544553
#endif
4555-
if ( uv == GREEK_CAPITAL_LETTER_IOTA
4556-
&& utf8_to_uv_or_die(s, send, 0) ==
4557-
COMBINING_GREEK_YPOGEGRAMMENI)
4554+
if ( UNLIKELY(uv == GREEK_CAPITAL_LETTER_IOTA)
4555+
&& memBEGINs(s, this_len, COMBINING_GREEK_YPOGEGRAMMENI_UTF8))
45584556
{
45594557
in_iota_subscript = TRUE;
45604558
}

regen/unicode_constants.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,6 +1020,7 @@ END
10201020
10211021
U+0300 string
10221022
U+0307 string
1023+
U+0345 string
10231024
10241025
U+1E9E string_skip_if_undef
10251026

unicode_constants.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ bytes.
5757

5858
# define COMBINING_GRAVE_ACCENT_UTF8 "\xCC\x80" /* U+0300 */
5959
# define COMBINING_DOT_ABOVE_UTF8 "\xCC\x87" /* U+0307 */
60+
# define COMBINING_GREEK_YPOGEGRAMMENI_UTF8 "\xCD\x85" /* U+0345 */
6061

6162
# define LATIN_CAPITAL_LETTER_SHARP_S_UTF8 "\xE1\xBA\x9E" /* U+1E9E */
6263

@@ -122,6 +123,7 @@ bytes.
122123

123124
# define COMBINING_GRAVE_ACCENT_UTF8 "\xAF\x41" /* U+0300 */
124125
# define COMBINING_DOT_ABOVE_UTF8 "\xAF\x48" /* U+0307 */
126+
# define COMBINING_GREEK_YPOGEGRAMMENI_UTF8 "\xB1\x46" /* U+0345 */
125127

126128
# define LATIN_CAPITAL_LETTER_SHARP_S_UTF8 "\xBF\x63\x72" /* U+1E9E */
127129

@@ -187,6 +189,7 @@ bytes.
187189

188190
# define COMBINING_GRAVE_ACCENT_UTF8 "\xAD\x41" /* U+0300 */
189191
# define COMBINING_DOT_ABOVE_UTF8 "\xAD\x48" /* U+0307 */
192+
# define COMBINING_GREEK_YPOGEGRAMMENI_UTF8 "\xAF\x46" /* U+0345 */
190193

191194
# define LATIN_CAPITAL_LETTER_SHARP_S_UTF8 "\xBF\x62\x71" /* U+1E9E */
192195

0 commit comments

Comments
 (0)