-
Notifications
You must be signed in to change notification settings - Fork 602
Make :utf8 an actually validating layer #23342
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: blead
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1492,6 +1492,7 @@ PerlIO_default_layers(pTHX) | |
| PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_stdio)); | ||
| PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_crlf)); | ||
| PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8)); | ||
| PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8_lax)); | ||
| PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_remove)); | ||
| PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_byte)); | ||
| PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_scalar)); | ||
|
|
@@ -2239,9 +2240,9 @@ PerlIOUtf8_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab) | |
| return -1; | ||
| } | ||
|
|
||
| PERLIO_FUNCS_DECL(PerlIO_utf8) = { | ||
| PERLIO_FUNCS_DECL(PerlIO_utf8_lax) = { | ||
| sizeof(PerlIO_funcs), | ||
| "utf8", | ||
| "utf8_lax", | ||
| 0, | ||
| PERLIO_K_UTF8 | PERLIO_K_MULTIARG, | ||
| PerlIOUtf8_pushed, | ||
|
|
@@ -5272,6 +5273,344 @@ PERLIO_FUNCS_DECL(PerlIO_crlf) = { | |
| PerlIOCrlf_set_ptrcnt, | ||
| }; | ||
|
|
||
| static const U8 xs_utf8_sequence_len[0x100] = { | ||
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */ | ||
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */ | ||
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */ | ||
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */ | ||
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */ | ||
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */ | ||
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */ | ||
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */ | ||
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */ | ||
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */ | ||
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */ | ||
| 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */ | ||
| 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */ | ||
| 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */ | ||
| 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */ | ||
| 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */ | ||
| }; | ||
|
|
||
| #define UTF8_MAX_BYTES 4 | ||
|
|
||
| typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags; | ||
|
|
||
| static STRLEN skip_sequence(const U8 *cur, const STRLEN len) { | ||
| STRLEN i, n = xs_utf8_sequence_len[*cur]; | ||
|
|
||
| if (n < 1 || len < 2) | ||
| return 1; | ||
|
|
||
| switch (cur[0]) { | ||
| case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break; | ||
| case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break; | ||
| case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break; | ||
| case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */ | ||
| default: if ((cur[1] & 0xC0) != 0x80) return 1; break; | ||
| } | ||
|
|
||
| if (n > len) | ||
| n = len; | ||
| for (i = 2; i < n; i++) | ||
| if ((cur[i] & 0xC0) != 0x80) | ||
| break; | ||
| return i; | ||
| } | ||
|
|
||
| static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) __attribute__noreturn__; | ||
| static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) { | ||
| static const char *hex = "0123456789ABCDEF"; | ||
| const char *fmt; | ||
| char seq[UTF8_MAX_BYTES * 3]; | ||
| char *d = seq; | ||
|
|
||
| if (eof) | ||
| fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file"; | ||
| else | ||
| fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>"; | ||
|
|
||
| while (len-- > 0) { | ||
| const U8 c = *cur++; | ||
| *d++ = hex[c >> 4]; | ||
| *d++ = hex[c & 15]; | ||
| if (len) | ||
| *d++ = ' '; | ||
| } | ||
| *d = 0; | ||
| Perl_croak(aTHX_ fmt, seq); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use |
||
| } | ||
|
|
||
| static void report_noncharacter(pTHX_ UV usv) __attribute__noreturn__; | ||
| static void report_noncharacter(pTHX_ UV usv) { | ||
| static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf; | ||
| Perl_croak(aTHX_ fmt, usv); | ||
| } | ||
|
|
||
| static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) { | ||
| const bool eof = PerlIO_eof(handle); | ||
| const U8 *cur = buf; | ||
| const U8 *end4 = end - 4; | ||
| STRLEN skip = 0; | ||
| U32 v; | ||
|
|
||
| while (cur < end4) { | ||
| while (cur < end4 && *cur < 0x80) | ||
| cur++; | ||
|
|
||
| check: | ||
| switch (xs_utf8_sequence_len[*cur]) { | ||
| case 0: | ||
| goto illformed; | ||
| case 1: | ||
| cur += 1; | ||
| break; | ||
| case 2: | ||
| /* 110xxxxx 10xxxxxx */ | ||
| if ((cur[1] & 0xC0) != 0x80) | ||
| goto illformed; | ||
| cur += 2; | ||
| break; | ||
| case 3: | ||
| v = ((U32)cur[0] << 16) | ||
| | ((U32)cur[1] << 8) | ||
| | ((U32)cur[2]); | ||
| /* 1110xxxx 10xxxxxx 10xxxxxx */ | ||
| if ((v & 0x00F0C0C0) != 0x00E08080 || | ||
| /* Non-shortest form */ | ||
| v < 0x00E0A080) | ||
| goto illformed; | ||
| /* Surrogates U+D800..U+DFFF */ | ||
| if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080) | ||
| goto illformed; | ||
| /* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */ | ||
| if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE)) | ||
| goto noncharacter; | ||
| cur += 3; | ||
| break; | ||
| case 4: | ||
| v = ((U32)cur[0] << 24) | ||
| | ((U32)cur[1] << 16) | ||
| | ((U32)cur[2] << 8) | ||
| | ((U32)cur[3]); | ||
| /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use perl.h/handy.h/Configure has almost non-existent support for ISO C compliant unaligned memory access. But GCC project devs evangelize that combining Note, GCC will always turn |
||
| if ((v & 0xF8C0C0C0) != 0xF0808080 || | ||
| /* Non-shortest form */ | ||
| v < 0xF0908080 || | ||
| /* Greater than U+10FFFF */ | ||
| v > 0xF48FBFBF) | ||
| goto illformed; | ||
| /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */ | ||
| if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE) | ||
| goto noncharacter; | ||
| cur += 4; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| if (cur < end) { | ||
| if (cur + xs_utf8_sequence_len[*cur] <= end) | ||
| goto check; | ||
| skip = skip_sequence(cur, end - cur); | ||
| if (eof || cur + skip < end) | ||
| goto illformed; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can |
||
| } | ||
| return cur - buf; | ||
|
|
||
| illformed: | ||
| if (!skip) | ||
| skip = skip_sequence(cur, end - cur); | ||
| PerlIOBase(handle)->flags |= PERLIO_F_ERROR; | ||
| report_illformed(aTHX_ cur, skip, eof); | ||
|
|
||
| noncharacter: | ||
| if (v < 0x10000) | ||
| v = (v & 0x3F) | (v & 0x1F00) >> 2; | ||
| else | ||
| v = (v & 0x3F) | (v & 0x1F00) >> 2 | (v & 0x0F0000) >> 4; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't compile this and look at the asm code, but I suspect low IQ MSVC will have 2 branches, both with identical CPUs op doing expression |
||
| PerlIOBase(handle)->flags |= PERLIO_F_ERROR; | ||
| report_noncharacter(aTHX_ v); | ||
| } | ||
|
|
||
| typedef struct { | ||
| PerlIOBuf buf; | ||
| STDCHAR leftovers[UTF8_MAX_BYTES]; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What happens if you seek with something in leftovers? |
||
| size_t leftover_length; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. C struct alignment violation. |
||
| int flags; | ||
| } PerlIOUnicode; | ||
|
|
||
| static struct { | ||
| const char* name; | ||
| size_t length; | ||
| utf8_flags value; | ||
| } map[] = { | ||
| { STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES }, | ||
| { STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS }, | ||
| { STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST }, | ||
| { STR_WITH_LEN("strict"), STRICT_UTF8 }, | ||
| { STR_WITH_LEN("loose"), (utf8_flags)(ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST) }, | ||
| }; | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. structs like this are sloppy and have as many null bytes as liters of water in the Pacific Ocean. A C struct with 3 arrays, sorted by element size, is the correct solution. Size sort order wud be Will waste |
||
| static int lookup_parameter(pTHX_ const char* ptr, size_t len) { | ||
| unsigned i; | ||
| for (i = 0; i < sizeof map / sizeof *map; ++i) { | ||
| if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0) | ||
| return map[i].value; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. compute |
||
| } | ||
| Perl_croak(aTHX_ "Unknown argument to :utf8: %*s", (int)len, ptr); | ||
| } | ||
| static int parse_parameters(pTHX_ SV* param) { | ||
| STRLEN len; | ||
| const char *begin, *delim; | ||
| if (!param || !SvOK(param)) | ||
| return 0; | ||
|
|
||
| begin = SvPV(param, len); | ||
| delim = strchr(begin, ','); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no I will pretend that Apple's innovative and controversial valgrind report violating They first compute the next 4096 page boundary, then perform all memory reads, using U32 *s ops inside
|
||
| if(delim) { | ||
| int ret = 0; | ||
| const char* end = begin + len; | ||
| do { | ||
| ret |= lookup_parameter(aTHX_ begin, delim - begin); | ||
| begin = delim + 1; | ||
| delim = strchr(begin, ','); | ||
| } while (delim); | ||
| if (begin < end) | ||
| ret |= lookup_parameter(aTHX_ begin, end - begin); | ||
| return ret; | ||
| } | ||
| else { | ||
| return lookup_parameter(aTHX_ begin, len); | ||
| } | ||
| } | ||
|
|
||
| static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) { | ||
| int flags = parse_parameters(aTHX_ arg); | ||
| if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) { | ||
| PerlIOBase(f)->flags |= PERLIO_F_UTF8; | ||
| PerlIOSelf(f, PerlIOUnicode)->flags = flags; | ||
| return 0; | ||
| } | ||
| return -1; | ||
| } | ||
|
|
||
| static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) { | ||
| PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode); | ||
| PerlIOBuf * const b = &u->buf; | ||
| PerlIO *n = PerlIONext(f); | ||
| SSize_t avail; | ||
| Size_t read_bytes = 0; | ||
| STDCHAR *end; | ||
| SSize_t fit; | ||
|
|
||
| if (PerlIO_flush(f) != 0) | ||
| return -1; | ||
| if (PerlIOBase(f)->flags & PERLIO_F_TTY) | ||
| PerlIOBase_flush_linebuf(aTHX); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. y a 2nd flush() variant? didnt 1st one do it? |
||
|
|
||
| if (!b->buf) | ||
| PerlIO_get_base(f); | ||
|
|
||
| assert(b->buf); | ||
|
|
||
| if (u->leftover_length) { | ||
| Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR); | ||
| b->end = b->buf + u->leftover_length; | ||
| read_bytes = u->leftover_length; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. deref |
||
| u->leftover_length = 0; | ||
| } | ||
| else { | ||
| b->ptr = b->end = b->buf; | ||
| } | ||
| fit = (SSize_t)b->bufsiz - (b->end - b->buf); | ||
|
|
||
| if (!PerlIOValid(n)) { | ||
| PerlIOBase(f)->flags |= PERLIO_F_EOF; | ||
| return -1; | ||
| } | ||
|
|
||
| if (PerlIO_fast_gets(n)) { | ||
| /* | ||
| * Layer below is also buffered. We do _NOT_ want to call its | ||
| * ->Read() because that will loop till it gets what we asked for | ||
| * which may hang on a pipe etc. Instead take anything it has to | ||
| * hand, or ask it to fill _once_. | ||
| */ | ||
| avail = PerlIO_get_cnt(n); | ||
| if (avail <= 0) { | ||
| avail = PerlIO_fill(n); | ||
| if (avail == 0) | ||
| avail = PerlIO_get_cnt(n); | ||
| else { | ||
| if (!PerlIO_error(n) && PerlIO_eof(n)) | ||
| avail = 0; | ||
| } | ||
| } | ||
| if (avail > 0) { | ||
| STDCHAR *ptr = PerlIO_get_ptr(n); | ||
| const SSize_t cnt = avail; | ||
| if (avail > fit) | ||
| avail = fit; | ||
| Copy(ptr, b->end, avail, STDCHAR); | ||
| PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail); | ||
| read_bytes += avail; | ||
| } | ||
| } | ||
| else { | ||
| avail = PerlIO_read(n, b->end, fit); | ||
| if (avail > 0) | ||
| read_bytes += avail; | ||
| } | ||
| if (avail <= 0) { | ||
| if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) { | ||
| PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR; | ||
| return -1; | ||
| } | ||
| } | ||
| end = b->buf + read_bytes; | ||
| b->end = b->buf; | ||
| b->end += validate(aTHX_ (const U8 *)b->end, (const U8 *)end, u->flags, n); | ||
| if (b->end < end) { | ||
| size_t len = b->buf + read_bytes - b->end; | ||
| Copy(b->end, u->leftovers, len, char); | ||
| u->leftover_length = len; | ||
| } | ||
| PerlIOBase(f)->flags |= PERLIO_F_RDBUF; | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
| PERLIO_FUNCS_DECL(PerlIO_utf8) = { | ||
| sizeof(PerlIO_funcs), | ||
| "utf8", | ||
| sizeof(PerlIOUnicode), | ||
| PERLIO_K_BUFFERED|PERLIO_K_UTF8, | ||
| PerlIOUnicode_pushed, | ||
| PerlIOBuf_popped, | ||
| PerlIOBuf_open, | ||
| PerlIOBase_binmode, | ||
| NULL, | ||
| PerlIOBase_fileno, | ||
| PerlIOBuf_dup, | ||
| PerlIOBuf_read, | ||
| PerlIOBuf_unread, | ||
| PerlIOBuf_write, | ||
| PerlIOBuf_seek, | ||
| PerlIOBuf_tell, | ||
| PerlIOBuf_close, | ||
| PerlIOBuf_flush, | ||
| PerlIOUnicode_fill, | ||
| PerlIOBase_eof, | ||
| PerlIOBase_error, | ||
| PerlIOBase_clearerr, | ||
| PerlIOBase_setlinebuf, | ||
| PerlIOBuf_get_base, | ||
| PerlIOBuf_bufsiz, | ||
| PerlIOBuf_get_ptr, | ||
| PerlIOBuf_get_cnt, | ||
| PerlIOBuf_set_ptrcnt, | ||
| }; | ||
|
|
||
| PerlIO * | ||
| Perl_PerlIO_stdin(pTHX) | ||
| { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a better bitwise math algebra formula for doing this vs the current solution. At first glance, I thought that table was a duplicate of https://github.com/Perl/perl5/blob/blead/utf8.h#L222
but on the 2nd examination, the table in this commit isn't identical to
PL_utf8skip[]. Since the max value in this table is0x3or0x01 | 0x02, or0x4depending how you look at it, in other words, a bit vector with 2 or 3 bits per array element will hold the info, This table is leaving alot of dead bits in this char array. My gut instinct says such an ocean of unused bits is "a way" but the wrong way to do it. Atleast pack the lengths into quads. Ideally these bitscan be reduced to 256 bits of const literal operands. More detailed, it can be reduced to 4 or 6 x 64bit integer constants, and 4 or 6 branches, something like
if(ch <= 32) len = ( 0xU64_BITVEC_MASK >> ch) & 0x3;. I'll guess doing some SO/Google research will reveal fast and better and more clever algos invented by various ppl vs this 256 byte table.Changing this table to something better is a very low priority, and should be done as the last thing before this draft is committed to blead as prod code. As a draft/WIP/concept, perf and speed arent the highest priority until consensus and future revisions make this PR commitable to blead.