Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 32 additions & 28 deletions pp.c
Original file line number Diff line number Diff line change
Expand Up @@ -4511,7 +4511,6 @@ PP_wrapped(pp_uc, 1, 0)
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];

#define GREEK_CAPITAL_LETTER_IOTA 0x0399
#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
/* All occurrences of these are to be moved to follow any other marks.
* This is context-dependent. We may not be passed enough context to
* move the iota subscript beyond all of them, but we do the best we can
Expand All @@ -4525,11 +4524,11 @@ PP_wrapped(pp_uc, 1, 0)
bool in_iota_subscript = FALSE;

while (s < send) {
STRLEN u;
STRLEN ulen;
STRLEN this_len;
STRLEN upper_len;
UV uv;
if (UNLIKELY(in_iota_subscript)) {
UV cp = utf8_to_uv_or_die(s, send, &u);
UV cp = utf8_to_uv_or_die(s, send, &this_len);

if (! _invlist_contains_cp(PL_utf8_mark, cp)) {

Expand All @@ -4540,25 +4539,27 @@ PP_wrapped(pp_uc, 1, 0)
}
}
else {
u = UTF8SKIP(s);
this_len = UTF8SKIP(s);
}

/* Then handle the current character. Get the changed case value
* and copy it to the output buffer */

#ifdef USE_LOCALE_CTYPE
uv = _toUPPER_utf8_flags(s, send, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE));
uv = _toUPPER_utf8_flags(s, send, tmpbuf, &upper_len,
IN_LC_RUNTIME(LC_CTYPE));
#else
uv = _toUPPER_utf8_flags(s, send, tmpbuf, &ulen, 0);
uv = _toUPPER_utf8_flags(s, send, tmpbuf, &upper_len, 0);
#endif
if ( uv == GREEK_CAPITAL_LETTER_IOTA
&& utf8_to_uv_or_die(s, send, 0) ==
COMBINING_GREEK_YPOGEGRAMMENI)
if ( UNLIKELY(uv == GREEK_CAPITAL_LETTER_IOTA)
&& memBEGINs(s, this_len, COMBINING_GREEK_YPOGEGRAMMENI_UTF8))
{
in_iota_subscript = TRUE;
}
else {
if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
if ( upper_len > this_len
&& (SvLEN(dest) < (min += upper_len - this_len)))
{
/* If the eventually required minimum size outgrows the
* available space, we need to grow. */
const UV o = d - (U8*)SvPVX_const(dest);
Expand All @@ -4571,10 +4572,10 @@ PP_wrapped(pp_uc, 1, 0)
* another option */
d = o + (U8*) SvGROW(dest, min);
}
Copy(tmpbuf, d, ulen, U8);
d += ulen;
Copy(tmpbuf, d, upper_len, U8);
d += upper_len;
}
s += u;
s += this_len;
}
if (in_iota_subscript) {
*d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);
Expand Down Expand Up @@ -4868,12 +4869,13 @@ PP_wrapped(pp_lc, 1, 0)
bool remove_dot_above = FALSE;

while (s < send) {
const STRLEN u = UTF8SKIP(s);
STRLEN ulen;
const STRLEN this_len = UTF8SKIP(s);
STRLEN lower_len;

#ifdef USE_LOCALE_CTYPE

_toLOWER_utf8_flags(s, send, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE));
_toLOWER_utf8_flags(s, send, tmpbuf, &lower_len,
IN_LC_RUNTIME(LC_CTYPE));

/* If we are in a Turkic locale, we have to do more work. As noted
* in the comments for lcfirst, there is a special case if a 'I'
Expand All @@ -4888,9 +4890,10 @@ PP_wrapped(pp_lc, 1, 0)
&& IN_LC_RUNTIME(LC_CTYPE))
{
if ( UNLIKELY(remove_dot_above)
&& memBEGINs(tmpbuf, sizeof(tmpbuf), COMBINING_DOT_ABOVE_UTF8))
&& memBEGINs(tmpbuf, sizeof(tmpbuf),
COMBINING_DOT_ABOVE_UTF8))
{
s += u;
s += this_len;
remove_dot_above = FALSE;
continue;
}
Expand All @@ -4901,15 +4904,16 @@ PP_wrapped(pp_lc, 1, 0)
#else
PERL_UNUSED_VAR(remove_dot_above);

_toLOWER_utf8_flags(s, send, tmpbuf, &ulen, 0);
_toLOWER_utf8_flags(s, send, tmpbuf, &lower_len, 0);
#endif

/* Here is where we would do context-sensitive actions for the
* Greek final sigma. See the commit message for 86510fb15 for why
* there aren't any */

if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {

if ( lower_len > this_len
&& (SvLEN(dest) < (min += lower_len - this_len)))
{
/* If the eventually required minimum size outgrows the
* available space, we need to grow. */
const UV o = d - (U8*)SvPVX_const(dest);
Expand All @@ -4925,9 +4929,9 @@ PP_wrapped(pp_lc, 1, 0)

/* Copy the newly lowercased letter to the output buffer we're
* building */
Copy(tmpbuf, d, ulen, U8);
d += ulen;
s += u;
Copy(tmpbuf, d, lower_len, U8);
d += lower_len;
s += this_len;
} /* End of looping through the source string */
SvUTF8_on(dest);
*d = '\0';
Expand Down Expand Up @@ -5131,19 +5135,19 @@ PP_wrapped(pp_fc, 1, 0)

if (DO_UTF8(source)) { /* UTF-8 flagged string. */
while (s < send) {
const STRLEN u = UTF8SKIP(s);
const STRLEN this_len = UTF8SKIP(s);
STRLEN ulen;

_toFOLD_utf8_flags(s, send, tmpbuf, &ulen, flags);

if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
if (ulen > this_len && (SvLEN(dest) < (min += ulen - this_len))) {
const UV o = d - (U8*)SvPVX_const(dest);
d = o + (U8*) SvGROW(dest, min);
}

Copy(tmpbuf, d, ulen, U8);
d += ulen;
s += u;
s += this_len;
}
SvUTF8_on(dest);
} /* Unflagged string */
Expand Down
1 change: 1 addition & 0 deletions regen/unicode_constants.pl
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,7 @@ END

U+0300 string
U+0307 string
U+0345 string

U+1E9E string_skip_if_undef

Expand Down
3 changes: 3 additions & 0 deletions unicode_constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ bytes.

# define COMBINING_GRAVE_ACCENT_UTF8 "\xCC\x80" /* U+0300 */
# define COMBINING_DOT_ABOVE_UTF8 "\xCC\x87" /* U+0307 */
# define COMBINING_GREEK_YPOGEGRAMMENI_UTF8 "\xCD\x85" /* U+0345 */

# define LATIN_CAPITAL_LETTER_SHARP_S_UTF8 "\xE1\xBA\x9E" /* U+1E9E */

Expand Down Expand Up @@ -122,6 +123,7 @@ bytes.

# define COMBINING_GRAVE_ACCENT_UTF8 "\xAF\x41" /* U+0300 */
# define COMBINING_DOT_ABOVE_UTF8 "\xAF\x48" /* U+0307 */
# define COMBINING_GREEK_YPOGEGRAMMENI_UTF8 "\xB1\x46" /* U+0345 */

# define LATIN_CAPITAL_LETTER_SHARP_S_UTF8 "\xBF\x63\x72" /* U+1E9E */

Expand Down Expand Up @@ -187,6 +189,7 @@ bytes.

# define COMBINING_GRAVE_ACCENT_UTF8 "\xAD\x41" /* U+0300 */
# define COMBINING_DOT_ABOVE_UTF8 "\xAD\x48" /* U+0307 */
# define COMBINING_GREEK_YPOGEGRAMMENI_UTF8 "\xAF\x46" /* U+0345 */

# define LATIN_CAPITAL_LETTER_SHARP_S_UTF8 "\xBF\x62\x71" /* U+1E9E */

Expand Down
Loading