Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpan/CPAN-Meta-YAML/t/11_read_string.t
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ subtest 'invalid UTF-8' => sub {
# get invalid UTF-8 by reading Latin-1 with lax :utf8 layer
my $string = do {
local $SIG{__WARN__} = sub {};
slurp( test_data_file('latin1.yml'), ":utf8" );
slurp( test_data_file('latin1.yml'), ":utf8_lax" );
};
my $obj = eval { CPAN::Meta::YAML->read_string($string); };
is( $obj, undef, "read_string should return undef" );
error_like( qr/invalid UTF-8 string/,
error_like( qr/UTF-8/,
"Got expected error about invalid UTF-8 string"
);
};
Expand Down
2 changes: 1 addition & 1 deletion lib/PerlIO.pm
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package PerlIO;

our $VERSION = '1.12';
our $VERSION = '1.13';

# Map layer name to package that defines it
our %alias;
Expand Down
343 changes: 341 additions & 2 deletions perlio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1492,6 +1492,7 @@ PerlIO_default_layers(pTHX)
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_stdio));
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_crlf));
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8));
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8_lax));
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_remove));
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_byte));
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_scalar));
Expand Down Expand Up @@ -2239,9 +2240,9 @@ PerlIOUtf8_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab)
return -1;
}

PERLIO_FUNCS_DECL(PerlIO_utf8) = {
PERLIO_FUNCS_DECL(PerlIO_utf8_lax) = {
sizeof(PerlIO_funcs),
"utf8",
"utf8_lax",
0,
PERLIO_K_UTF8 | PERLIO_K_MULTIARG,
PerlIOUtf8_pushed,
Expand Down Expand Up @@ -5272,6 +5273,344 @@ PERLIO_FUNCS_DECL(PerlIO_crlf) = {
PerlIOCrlf_set_ptrcnt,
};

static const U8 xs_utf8_sequence_len[0x100] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
};

#define UTF8_MAX_BYTES 4

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a better bitwise math algebra formula for doing this vs the current solution. At first glance, I thought that table was a duplicate of https://github.com/Perl/perl5/blob/blead/utf8.h#L222

EXTCONST unsigned char PL_utf8skip[] = {
/* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */

but on the 2nd examination, the table in this commit isn't identical to PL_utf8skip[]. Since the max value in this table is 0x3 or 0x01 | 0x02, or 0x4 depending how you look at it, in other words, a bit vector with 2 or 3 bits per array element will hold the info, This table is leaving alot of dead bits in this char array. My gut instinct says such an ocean of unused bits is "a way" but the wrong way to do it. Atleast pack the lengths into quads. Ideally these bits

    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */

can be reduced to 256 bits of const literal operands. More detailed, it can be reduced to 4 or 6 x 64bit integer constants, and 4 or 6 branches, something like if(ch <= 32) len = ( 0xU64_BITVEC_MASK >> ch) & 0x3;. I'll guess doing some SO/Google research will reveal fast and better and more clever algos invented by various ppl vs this 256 byte table.

Changing this table to something better is a very low priority, and should be done as the last thing before this draft is committed to blead as prod code. As a draft/WIP/concept, perf and speed arent the highest priority until consensus and future revisions make this PR commitable to blead.

typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags;

static STRLEN skip_sequence(const U8 *cur, const STRLEN len) {
STRLEN i, n = xs_utf8_sequence_len[*cur];

if (n < 1 || len < 2)
return 1;

switch (cur[0]) {
case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break;
case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break;
case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break;
case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */
default: if ((cur[1] & 0xC0) != 0x80) return 1; break;
}

if (n > len)
n = len;
for (i = 2; i < n; i++)
if ((cur[i] & 0xC0) != 0x80)
break;
return i;
}

static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) __attribute__noreturn__;
static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) {
static const char *hex = "0123456789ABCDEF";
const char *fmt;
char seq[UTF8_MAX_BYTES * 3];
char *d = seq;

if (eof)
fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file";
else
fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>";

while (len-- > 0) {
const U8 c = *cur++;
*d++ = hex[c >> 4];
*d++ = hex[c & 15];
if (len)
*d++ = ' ';
}
*d = 0;
Perl_croak(aTHX_ fmt, seq);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use Perl_croak_nocontext() and delete the pTHX_ on the proto. This is an arctic cold panic fn. Its no_return branch should be as small as possible in mach code in its caller frames, since this no_return branch will never execute in reality (prod code env).

}

static void report_noncharacter(pTHX_ UV usv) __attribute__noreturn__;
static void report_noncharacter(pTHX_ UV usv) {
static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf;
Perl_croak(aTHX_ fmt, usv);
}

static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) {
const bool eof = PerlIO_eof(handle);
const U8 *cur = buf;
const U8 *end4 = end - 4;
STRLEN skip = 0;
U32 v;

while (cur < end4) {
while (cur < end4 && *cur < 0x80)
cur++;

check:
switch (xs_utf8_sequence_len[*cur]) {
case 0:
goto illformed;
case 1:
cur += 1;
break;
case 2:
/* 110xxxxx 10xxxxxx */
if ((cur[1] & 0xC0) != 0x80)
goto illformed;
cur += 2;
break;
case 3:
v = ((U32)cur[0] << 16)
| ((U32)cur[1] << 8)
| ((U32)cur[2]);
/* 1110xxxx 10xxxxxx 10xxxxxx */
if ((v & 0x00F0C0C0) != 0x00E08080 ||
/* Non-shortest form */
v < 0x00E0A080)
goto illformed;
/* Surrogates U+D800..U+DFFF */
if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080)
goto illformed;
/* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */
if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE))
goto noncharacter;
cur += 3;
break;
case 4:
v = ((U32)cur[0] << 24)
| ((U32)cur[1] << 16)
| ((U32)cur[2] << 8)
| ((U32)cur[3]);
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use htonl() which GCC optimizes to 1 cpu on Linux since 2014 or 2008. Or use dedicated CPU intrinsics or use prewritten perl.h/handy.h tools to do this. I have a another PR open about fixing a soap opera involving htonl() vs the day 1 1993 fundamental design of the Win32/64 platform inside WinPerl.

perl.h/handy.h/Configure has almost non-existent support for ISO C compliant unaligned memory access. But GCC project devs evangelize that combining static inline new_fn() {} + memcpy(&u32, unalgn_u32, 4) is holy water and a universal C grammar synonym for safe hardware unalign memory read/writes on all possible CPU archs GCC supports.

Note, GCC will always turn memcpy(&u32, unalgn_u32, 4) into an inline intrinsic that is 0.25 CPU instructions to 1 CPU instructions long, unlike WinPerl built with MSVC. 0.25 CPU instructions means changing an operand to an existing CPU op, think of Intels lea or Intel's SIB byte or Intel's SSE's movaps vs movups.

if ((v & 0xF8C0C0C0) != 0xF0808080 ||
/* Non-shortest form */
v < 0xF0908080 ||
/* Greater than U+10FFFF */
v > 0xF48FBFBF)
goto illformed;
/* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE)
goto noncharacter;
cur += 4;
break;
}
}

if (cur < end) {
if (cur + xs_utf8_sequence_len[*cur] <= end)
goto check;
skip = skip_sequence(cur, end - cur);
if (eof || cur + skip < end)
goto illformed;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can if(eof) goto illformed; test be moved upwards, to happen before skip = skip_sequence(cur, end - cur); statement?

}
return cur - buf;

illformed:
if (!skip)
skip = skip_sequence(cur, end - cur);
PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
report_illformed(aTHX_ cur, skip, eof);

noncharacter:
if (v < 0x10000)
v = (v & 0x3F) | (v & 0x1F00) >> 2;
else
v = (v & 0x3F) | (v & 0x1F00) >> 2 | (v & 0x0F0000) >> 4;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't compile this and look at the asm code, but I suspect low IQ MSVC will have 2 branches, both with identical CPUs op doing expression (v & 0x3F) | (v & 0x1F00) >> 2 so factor that out of the if else branch.

PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
report_noncharacter(aTHX_ v);
}

typedef struct {
PerlIOBuf buf;
STDCHAR leftovers[UTF8_MAX_BYTES];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if you seek with something in leftovers?

size_t leftover_length;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

C struct alignment violation. UTF8_MAX_BYTES is 4, size_t is 8 long on 64b cpu. reorder plz.

int flags;
} PerlIOUnicode;

static struct {
const char* name;
size_t length;
utf8_flags value;
} map[] = {
{ STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES },
{ STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS },
{ STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST },
{ STR_WITH_LEN("strict"), STRICT_UTF8 },
{ STR_WITH_LEN("loose"), (utf8_flags)(ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST) },
};

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

structs like this are sloppy and have as many null bytes as liters of water in the Pacific Ocean. A C struct with 3 arrays, sorted by element size, is the correct solution. Size sort order wud be char *, U8, U8. But another problem, a particularly nasty side of ISO C 89-23 grammar.

typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags;

Will waste 3 bytes or more like 3.8 bytes to store this constant. I wish C's enum was real type checking instead of fake type cheacking, and that C's enum had a size selector, since ISO C's bitfields, character literal integers, and enums, features are still stuck in January 1970 evolution wise.

static int lookup_parameter(pTHX_ const char* ptr, size_t len) {
unsigned i;
for (i = 0; i < sizeof map / sizeof *map; ++i) {
if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0)
return map[i].value;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

compute map[i] only once and cache it with map_ptr = &(map[i]), memcmp is a real fn call.

}
Perl_croak(aTHX_ "Unknown argument to :utf8: %*s", (int)len, ptr);
}
static int parse_parameters(pTHX_ SV* param) {
STRLEN len;
const char *begin, *delim;
if (!param || !SvOK(param))
return 0;

begin = SvPV(param, len);
delim = strchr(begin, ',');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no strlen() class fn calls plz, use memchr(). memchr() can use U16/U32/U64/U128/U256 mem read/test/cmp/jmp instructions, strchr() can't advance except 1 byte at a time.

I will pretend that Apple's innovative and controversial valgrind report violating memcmp()/strlen()/etc inside OSX have never been invented or released as gold.

They first compute the next 4096 page boundary, then perform all memory reads, using U32 *s ops inside strlen(), which will cause reading of upto 3 bytes ahead of legally "uninitialized" and legally "undefined" memory, according to all valgrind style dev tools. These std libc functions with "undefined behavior", physically can't ever throw a SEGV doing these over-reads, since 4096 byte units are the smallest granularity available on all Apple and non Apple CPU archs. PS other than SH4/Sega Dreamcast's 1024 byte page size. No other CPUs in my human lifetime with a page smaller than 4096 have ever been made. Its always 4096 or higher, except for obsolete SH4.

SvPV() probably needs the _const variant here.

if(delim) {
int ret = 0;
const char* end = begin + len;
do {
ret |= lookup_parameter(aTHX_ begin, delim - begin);
begin = delim + 1;
delim = strchr(begin, ',');
} while (delim);
if (begin < end)
ret |= lookup_parameter(aTHX_ begin, end - begin);
return ret;
}
else {
return lookup_parameter(aTHX_ begin, len);
}
}

static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) {
int flags = parse_parameters(aTHX_ arg);
if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) {
PerlIOBase(f)->flags |= PERLIO_F_UTF8;
PerlIOSelf(f, PerlIOUnicode)->flags = flags;
return 0;
}
return -1;
}

static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) {
PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode);
PerlIOBuf * const b = &u->buf;
PerlIO *n = PerlIONext(f);
SSize_t avail;
Size_t read_bytes = 0;
STDCHAR *end;
SSize_t fit;

if (PerlIO_flush(f) != 0)
return -1;
if (PerlIOBase(f)->flags & PERLIO_F_TTY)
PerlIOBase_flush_linebuf(aTHX);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

y a 2nd flush() variant? didnt 1st one do it?


if (!b->buf)
PerlIO_get_base(f);

assert(b->buf);

if (u->leftover_length) {
Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR);
b->end = b->buf + u->leftover_length;
read_bytes = u->leftover_length;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

deref u->leftover_length once and cache it to a C auto. struct member u->leftover_length will prob get 2 mem reads even with -O2 here. Since the write to addr b->end, could happen to be the same addr as u->leftover_length in C abstract machine.

u->leftover_length = 0;
}
else {
b->ptr = b->end = b->buf;
}
fit = (SSize_t)b->bufsiz - (b->end - b->buf);

if (!PerlIOValid(n)) {
PerlIOBase(f)->flags |= PERLIO_F_EOF;
return -1;
}

if (PerlIO_fast_gets(n)) {
/*
* Layer below is also buffered. We do _NOT_ want to call its
* ->Read() because that will loop till it gets what we asked for
* which may hang on a pipe etc. Instead take anything it has to
* hand, or ask it to fill _once_.
*/
avail = PerlIO_get_cnt(n);
if (avail <= 0) {
avail = PerlIO_fill(n);
if (avail == 0)
avail = PerlIO_get_cnt(n);
else {
if (!PerlIO_error(n) && PerlIO_eof(n))
avail = 0;
}
}
if (avail > 0) {
STDCHAR *ptr = PerlIO_get_ptr(n);
const SSize_t cnt = avail;
if (avail > fit)
avail = fit;
Copy(ptr, b->end, avail, STDCHAR);
PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail);
read_bytes += avail;
}
}
else {
avail = PerlIO_read(n, b->end, fit);
if (avail > 0)
read_bytes += avail;
}
if (avail <= 0) {
if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) {
PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR;
return -1;
}
}
end = b->buf + read_bytes;
b->end = b->buf;
b->end += validate(aTHX_ (const U8 *)b->end, (const U8 *)end, u->flags, n);
if (b->end < end) {
size_t len = b->buf + read_bytes - b->end;
Copy(b->end, u->leftovers, len, char);
u->leftover_length = len;
}
PerlIOBase(f)->flags |= PERLIO_F_RDBUF;

return 0;
}

PERLIO_FUNCS_DECL(PerlIO_utf8) = {
sizeof(PerlIO_funcs),
"utf8",
sizeof(PerlIOUnicode),
PERLIO_K_BUFFERED|PERLIO_K_UTF8,
PerlIOUnicode_pushed,
PerlIOBuf_popped,
PerlIOBuf_open,
PerlIOBase_binmode,
NULL,
PerlIOBase_fileno,
PerlIOBuf_dup,
PerlIOBuf_read,
PerlIOBuf_unread,
PerlIOBuf_write,
PerlIOBuf_seek,
PerlIOBuf_tell,
PerlIOBuf_close,
PerlIOBuf_flush,
PerlIOUnicode_fill,
PerlIOBase_eof,
PerlIOBase_error,
PerlIOBase_clearerr,
PerlIOBase_setlinebuf,
PerlIOBuf_get_base,
PerlIOBuf_bufsiz,
PerlIOBuf_get_ptr,
PerlIOBuf_get_cnt,
PerlIOBuf_set_ptrcnt,
};

PerlIO *
Perl_PerlIO_stdin(pTHX)
{
Expand Down
1 change: 1 addition & 0 deletions perliol.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ EXTCONST PerlIO_funcs PerlIO_perlio;
EXTCONST PerlIO_funcs PerlIO_stdio;
EXTCONST PerlIO_funcs PerlIO_crlf;
EXTCONST PerlIO_funcs PerlIO_utf8;
EXTCONST PerlIO_funcs PerlIO_utf8_lax;
EXTCONST PerlIO_funcs PerlIO_byte;
EXTCONST PerlIO_funcs PerlIO_raw;
EXTCONST PerlIO_funcs PerlIO_pending;
Expand Down
Loading
Loading