Skip to content

Commit 841de31

Browse files
committed
Made :utf8 an actual layer
It will check the input for validity, by default strict validity though less strict forms are provided. This also means PerlIO::get_layers doesn't return a "utf8" pseudo-layer anymore, which can break some code making that assumption.
1 parent ed4d129 commit 841de31

File tree

13 files changed

+381
-71
lines changed

13 files changed

+381
-71
lines changed

cpan/CPAN-Meta-YAML/t/11_read_string.t

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@ subtest 'invalid UTF-8' => sub {
4040
# get invalid UTF-8 by reading Latin-1 with lax :utf8 layer
4141
my $string = do {
4242
local $SIG{__WARN__} = sub {};
43-
slurp( test_data_file('latin1.yml'), ":utf8" );
43+
slurp( test_data_file('latin1.yml'), ":utf8_lax" );
4444
};
4545
my $obj = eval { CPAN::Meta::YAML->read_string($string); };
4646
is( $obj, undef, "read_string should return undef" );
47-
error_like( qr/invalid UTF-8 string/,
47+
error_like( qr/UTF-8/,
4848
"Got expected error about invalid UTF-8 string"
4949
);
5050
};

lib/PerlIO.pm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package PerlIO;
22

3-
our $VERSION = '1.12';
3+
our $VERSION = '1.13';
44

55
# Map layer name to package that defines it
66
our %alias;

perlio.c

Lines changed: 341 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,6 +1492,7 @@ PerlIO_default_layers(pTHX)
14921492
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_stdio));
14931493
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_crlf));
14941494
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8));
1495+
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_utf8_lax));
14951496
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_remove));
14961497
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_byte));
14971498
PerlIO_define_layer(aTHX_ PERLIO_FUNCS_CAST(&PerlIO_scalar));
@@ -2239,9 +2240,9 @@ PerlIOUtf8_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab)
22392240
return -1;
22402241
}
22412242

2242-
PERLIO_FUNCS_DECL(PerlIO_utf8) = {
2243+
PERLIO_FUNCS_DECL(PerlIO_utf8_lax) = {
22432244
sizeof(PerlIO_funcs),
2244-
"utf8",
2245+
"utf8_lax",
22452246
0,
22462247
PERLIO_K_UTF8 | PERLIO_K_MULTIARG,
22472248
PerlIOUtf8_pushed,
@@ -5272,6 +5273,344 @@ PERLIO_FUNCS_DECL(PerlIO_crlf) = {
52725273
PerlIOCrlf_set_ptrcnt,
52735274
};
52745275

5276+
static const U8 xs_utf8_sequence_len[0x100] = {
5277+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
5278+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
5279+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
5280+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
5281+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
5282+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
5283+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
5284+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
5285+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
5286+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
5287+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
5288+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
5289+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
5290+
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
5291+
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
5292+
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
5293+
};
5294+
5295+
#define UTF8_MAX_BYTES 4
5296+
5297+
typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags;
5298+
5299+
static STRLEN skip_sequence(const U8 *cur, const STRLEN len) {
5300+
STRLEN i, n = xs_utf8_sequence_len[*cur];
5301+
5302+
if (n < 1 || len < 2)
5303+
return 1;
5304+
5305+
switch (cur[0]) {
5306+
case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break;
5307+
case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break;
5308+
case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break;
5309+
case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */
5310+
default: if ((cur[1] & 0xC0) != 0x80) return 1; break;
5311+
}
5312+
5313+
if (n > len)
5314+
n = len;
5315+
for (i = 2; i < n; i++)
5316+
if ((cur[i] & 0xC0) != 0x80)
5317+
break;
5318+
return i;
5319+
}
5320+
5321+
static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) __attribute__noreturn__;
5322+
static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) {
5323+
static const char *hex = "0123456789ABCDEF";
5324+
const char *fmt;
5325+
char seq[UTF8_MAX_BYTES * 3];
5326+
char *d = seq;
5327+
5328+
if (eof)
5329+
fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file";
5330+
else
5331+
fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>";
5332+
5333+
while (len-- > 0) {
5334+
const U8 c = *cur++;
5335+
*d++ = hex[c >> 4];
5336+
*d++ = hex[c & 15];
5337+
if (len)
5338+
*d++ = ' ';
5339+
}
5340+
*d = 0;
5341+
Perl_croak(aTHX_ fmt, seq);
5342+
}
5343+
5344+
static void report_noncharacter(pTHX_ UV usv) __attribute__noreturn__;
5345+
static void report_noncharacter(pTHX_ UV usv) {
5346+
static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf;
5347+
Perl_croak(aTHX_ fmt, usv);
5348+
}
5349+
5350+
static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) {
5351+
const bool eof = PerlIO_eof(handle);
5352+
const U8 *cur = buf;
5353+
const U8 *end4 = end - 4;
5354+
STRLEN skip = 0;
5355+
U32 v;
5356+
5357+
while (cur < end4) {
5358+
while (cur < end4 && *cur < 0x80)
5359+
cur++;
5360+
5361+
check:
5362+
switch (xs_utf8_sequence_len[*cur]) {
5363+
case 0:
5364+
goto illformed;
5365+
case 1:
5366+
cur += 1;
5367+
break;
5368+
case 2:
5369+
/* 110xxxxx 10xxxxxx */
5370+
if ((cur[1] & 0xC0) != 0x80)
5371+
goto illformed;
5372+
cur += 2;
5373+
break;
5374+
case 3:
5375+
v = ((U32)cur[0] << 16)
5376+
| ((U32)cur[1] << 8)
5377+
| ((U32)cur[2]);
5378+
/* 1110xxxx 10xxxxxx 10xxxxxx */
5379+
if ((v & 0x00F0C0C0) != 0x00E08080 ||
5380+
/* Non-shortest form */
5381+
v < 0x00E0A080)
5382+
goto illformed;
5383+
/* Surrogates U+D800..U+DFFF */
5384+
if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080)
5385+
goto illformed;
5386+
/* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */
5387+
if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE))
5388+
goto noncharacter;
5389+
cur += 3;
5390+
break;
5391+
case 4:
5392+
v = ((U32)cur[0] << 24)
5393+
| ((U32)cur[1] << 16)
5394+
| ((U32)cur[2] << 8)
5395+
| ((U32)cur[3]);
5396+
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
5397+
if ((v & 0xF8C0C0C0) != 0xF0808080 ||
5398+
/* Non-shortest form */
5399+
v < 0xF0908080 ||
5400+
/* Greater than U+10FFFF */
5401+
v > 0xF48FBFBF)
5402+
goto illformed;
5403+
/* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
5404+
if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE)
5405+
goto noncharacter;
5406+
cur += 4;
5407+
break;
5408+
}
5409+
}
5410+
5411+
if (cur < end) {
5412+
if (cur + xs_utf8_sequence_len[*cur] <= end)
5413+
goto check;
5414+
skip = skip_sequence(cur, end - cur);
5415+
if (eof || cur + skip < end)
5416+
goto illformed;
5417+
}
5418+
return cur - buf;
5419+
5420+
illformed:
5421+
if (!skip)
5422+
skip = skip_sequence(cur, end - cur);
5423+
PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
5424+
report_illformed(aTHX_ cur, skip, eof);
5425+
5426+
noncharacter:
5427+
if (v < 0x10000)
5428+
v = (v & 0x3F) | (v & 0x1F00) >> 2;
5429+
else
5430+
v = (v & 0x3F) | (v & 0x1F00) >> 2 | (v & 0x0F0000) >> 4;
5431+
PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
5432+
report_noncharacter(aTHX_ v);
5433+
}
5434+
5435+
typedef struct {
5436+
PerlIOBuf buf;
5437+
STDCHAR leftovers[UTF8_MAX_BYTES];
5438+
size_t leftover_length;
5439+
int flags;
5440+
} PerlIOUnicode;
5441+
5442+
static struct {
5443+
const char* name;
5444+
size_t length;
5445+
utf8_flags value;
5446+
} map[] = {
5447+
{ STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES },
5448+
{ STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS },
5449+
{ STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST },
5450+
{ STR_WITH_LEN("strict"), STRICT_UTF8 },
5451+
{ STR_WITH_LEN("loose"), (utf8_flags)(ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST) },
5452+
};
5453+
5454+
static int lookup_parameter(pTHX_ const char* ptr, size_t len) {
5455+
unsigned i;
5456+
for (i = 0; i < sizeof map / sizeof *map; ++i) {
5457+
if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0)
5458+
return map[i].value;
5459+
}
5460+
Perl_croak(aTHX_ "Unknown argument to :utf8: %*s", (int)len, ptr);
5461+
}
5462+
static int parse_parameters(pTHX_ SV* param) {
5463+
STRLEN len;
5464+
const char *begin, *delim;
5465+
if (!param || !SvOK(param))
5466+
return 0;
5467+
5468+
begin = SvPV(param, len);
5469+
delim = strchr(begin, ',');
5470+
if(delim) {
5471+
int ret = 0;
5472+
const char* end = begin + len;
5473+
do {
5474+
ret |= lookup_parameter(aTHX_ begin, delim - begin);
5475+
begin = delim + 1;
5476+
delim = strchr(begin, ',');
5477+
} while (delim);
5478+
if (begin < end)
5479+
ret |= lookup_parameter(aTHX_ begin, end - begin);
5480+
return ret;
5481+
}
5482+
else {
5483+
return lookup_parameter(aTHX_ begin, len);
5484+
}
5485+
}
5486+
5487+
static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) {
5488+
int flags = parse_parameters(aTHX_ arg);
5489+
if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) {
5490+
PerlIOBase(f)->flags |= PERLIO_F_UTF8;
5491+
PerlIOSelf(f, PerlIOUnicode)->flags = flags;
5492+
return 0;
5493+
}
5494+
return -1;
5495+
}
5496+
5497+
static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) {
5498+
PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode);
5499+
PerlIOBuf * const b = &u->buf;
5500+
PerlIO *n = PerlIONext(f);
5501+
SSize_t avail;
5502+
Size_t read_bytes = 0;
5503+
STDCHAR *end;
5504+
SSize_t fit;
5505+
5506+
if (PerlIO_flush(f) != 0)
5507+
return -1;
5508+
if (PerlIOBase(f)->flags & PERLIO_F_TTY)
5509+
PerlIOBase_flush_linebuf(aTHX);
5510+
5511+
if (!b->buf)
5512+
PerlIO_get_base(f);
5513+
5514+
assert(b->buf);
5515+
5516+
if (u->leftover_length) {
5517+
Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR);
5518+
b->end = b->buf + u->leftover_length;
5519+
read_bytes = u->leftover_length;
5520+
u->leftover_length = 0;
5521+
}
5522+
else {
5523+
b->ptr = b->end = b->buf;
5524+
}
5525+
fit = (SSize_t)b->bufsiz - (b->end - b->buf);
5526+
5527+
if (!PerlIOValid(n)) {
5528+
PerlIOBase(f)->flags |= PERLIO_F_EOF;
5529+
return -1;
5530+
}
5531+
5532+
if (PerlIO_fast_gets(n)) {
5533+
/*
5534+
* Layer below is also buffered. We do _NOT_ want to call its
5535+
* ->Read() because that will loop till it gets what we asked for
5536+
* which may hang on a pipe etc. Instead take anything it has to
5537+
* hand, or ask it to fill _once_.
5538+
*/
5539+
avail = PerlIO_get_cnt(n);
5540+
if (avail <= 0) {
5541+
avail = PerlIO_fill(n);
5542+
if (avail == 0)
5543+
avail = PerlIO_get_cnt(n);
5544+
else {
5545+
if (!PerlIO_error(n) && PerlIO_eof(n))
5546+
avail = 0;
5547+
}
5548+
}
5549+
if (avail > 0) {
5550+
STDCHAR *ptr = PerlIO_get_ptr(n);
5551+
const SSize_t cnt = avail;
5552+
if (avail > fit)
5553+
avail = fit;
5554+
Copy(ptr, b->end, avail, STDCHAR);
5555+
PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail);
5556+
read_bytes += avail;
5557+
}
5558+
}
5559+
else {
5560+
avail = PerlIO_read(n, b->end, fit);
5561+
if (avail > 0)
5562+
read_bytes += avail;
5563+
}
5564+
if (avail <= 0) {
5565+
if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) {
5566+
PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR;
5567+
return -1;
5568+
}
5569+
}
5570+
end = b->buf + read_bytes;
5571+
b->end = b->buf;
5572+
b->end += validate(aTHX_ (const U8 *)b->end, (const U8 *)end, u->flags, n);
5573+
if (b->end < end) {
5574+
size_t len = b->buf + read_bytes - b->end;
5575+
Copy(b->end, u->leftovers, len, char);
5576+
u->leftover_length = len;
5577+
}
5578+
PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
5579+
5580+
return 0;
5581+
}
5582+
5583+
PERLIO_FUNCS_DECL(PerlIO_utf8) = {
5584+
sizeof(PerlIO_funcs),
5585+
"utf8",
5586+
sizeof(PerlIOUnicode),
5587+
PERLIO_K_BUFFERED|PERLIO_K_UTF8,
5588+
PerlIOUnicode_pushed,
5589+
PerlIOBuf_popped,
5590+
PerlIOBuf_open,
5591+
PerlIOBase_binmode,
5592+
NULL,
5593+
PerlIOBase_fileno,
5594+
PerlIOBuf_dup,
5595+
PerlIOBuf_read,
5596+
PerlIOBuf_unread,
5597+
PerlIOBuf_write,
5598+
PerlIOBuf_seek,
5599+
PerlIOBuf_tell,
5600+
PerlIOBuf_close,
5601+
PerlIOBuf_flush,
5602+
PerlIOUnicode_fill,
5603+
PerlIOBase_eof,
5604+
PerlIOBase_error,
5605+
PerlIOBase_clearerr,
5606+
PerlIOBase_setlinebuf,
5607+
PerlIOBuf_get_base,
5608+
PerlIOBuf_bufsiz,
5609+
PerlIOBuf_get_ptr,
5610+
PerlIOBuf_get_cnt,
5611+
PerlIOBuf_set_ptrcnt,
5612+
};
5613+
52755614
PerlIO *
52765615
Perl_PerlIO_stdin(pTHX)
52775616
{

perliol.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ EXTCONST PerlIO_funcs PerlIO_perlio;
111111
EXTCONST PerlIO_funcs PerlIO_stdio;
112112
EXTCONST PerlIO_funcs PerlIO_crlf;
113113
EXTCONST PerlIO_funcs PerlIO_utf8;
114+
EXTCONST PerlIO_funcs PerlIO_utf8_lax;
114115
EXTCONST PerlIO_funcs PerlIO_byte;
115116
EXTCONST PerlIO_funcs PerlIO_raw;
116117
EXTCONST PerlIO_funcs PerlIO_pending;

0 commit comments

Comments
 (0)