@@ -1492,6 +1492,7 @@ PerlIO_default_layers(pTHX)
14921492 PerlIO_define_layer (aTHX_ PERLIO_FUNCS_CAST (& PerlIO_stdio ));
14931493 PerlIO_define_layer (aTHX_ PERLIO_FUNCS_CAST (& PerlIO_crlf ));
14941494 PerlIO_define_layer (aTHX_ PERLIO_FUNCS_CAST (& PerlIO_utf8 ));
1495+ PerlIO_define_layer (aTHX_ PERLIO_FUNCS_CAST (& PerlIO_utf8_lax ));
14951496 PerlIO_define_layer (aTHX_ PERLIO_FUNCS_CAST (& PerlIO_remove ));
14961497 PerlIO_define_layer (aTHX_ PERLIO_FUNCS_CAST (& PerlIO_byte ));
14971498 PerlIO_define_layer (aTHX_ PERLIO_FUNCS_CAST (& PerlIO_scalar ));
@@ -2239,9 +2240,9 @@ PerlIOUtf8_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab)
22392240 return -1 ;
22402241}
22412242
2242- PERLIO_FUNCS_DECL (PerlIO_utf8 ) = {
2243+ PERLIO_FUNCS_DECL (PerlIO_utf8_lax ) = {
22432244 sizeof (PerlIO_funcs ),
2244- "utf8 " ,
2245+ "utf8_lax " ,
22452246 0 ,
22462247 PERLIO_K_UTF8 | PERLIO_K_MULTIARG ,
22472248 PerlIOUtf8_pushed ,
@@ -5272,6 +5273,344 @@ PERLIO_FUNCS_DECL(PerlIO_crlf) = {
52725273 PerlIOCrlf_set_ptrcnt ,
52735274};
52745275
5276+ static const U8 xs_utf8_sequence_len [0x100 ] = {
5277+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* 0x00-0x0F */
5278+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* 0x10-0x1F */
5279+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* 0x20-0x2F */
5280+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* 0x30-0x3F */
5281+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* 0x40-0x4F */
5282+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* 0x50-0x5F */
5283+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* 0x60-0x6F */
5284+ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* 0x70-0x7F */
5285+ 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , /* 0x80-0x8F */
5286+ 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , /* 0x90-0x9F */
5287+ 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , /* 0xA0-0xAF */
5288+ 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , /* 0xB0-0xBF */
5289+ 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 , /* 0xC0-0xCF */
5290+ 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 , /* 0xD0-0xDF */
5291+ 3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 , /* 0xE0-0xEF */
5292+ 4 ,4 ,4 ,4 ,4 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , /* 0xF0-0xFF */
5293+ };
5294+
5295+ #define UTF8_MAX_BYTES 4
5296+
5297+ typedef enum { STRICT_UTF8 = 0 , ALLOW_SURROGATES = 1 , ALLOW_NONCHARACTERS = 2 , ALLOW_NONSHORTEST = 4 } utf8_flags ;
5298+
5299+ static STRLEN skip_sequence (const U8 * cur , const STRLEN len ) {
5300+ STRLEN i , n = xs_utf8_sequence_len [* cur ];
5301+
5302+ if (n < 1 || len < 2 )
5303+ return 1 ;
5304+
5305+ switch (cur [0 ]) {
5306+ case 0xE0 : if ((cur [1 ] & 0xE0 ) != 0xA0 ) return 1 ; break ;
5307+ case 0xED : if ((cur [1 ] & 0xE0 ) != 0x80 ) return 1 ; break ;
5308+ case 0xF4 : if ((cur [1 ] & 0xF0 ) != 0x80 ) return 1 ; break ;
5309+ case 0xF0 : if ((cur [1 ] & 0xF0 ) == 0x80 ) return 1 ; /* FALLTROUGH */
5310+ default : if ((cur [1 ] & 0xC0 ) != 0x80 ) return 1 ; break ;
5311+ }
5312+
5313+ if (n > len )
5314+ n = len ;
5315+ for (i = 2 ; i < n ; i ++ )
5316+ if ((cur [i ] & 0xC0 ) != 0x80 )
5317+ break ;
5318+ return i ;
5319+ }
5320+
5321+ static void report_illformed (pTHX_ const U8 * cur , STRLEN len , bool eof ) __attribute__noreturn__ ;
5322+ static void report_illformed (pTHX_ const U8 * cur , STRLEN len , bool eof ) {
5323+ static const char * hex = "0123456789ABCDEF" ;
5324+ const char * fmt ;
5325+ char seq [UTF8_MAX_BYTES * 3 ];
5326+ char * d = seq ;
5327+
5328+ if (eof )
5329+ fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file" ;
5330+ else
5331+ fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>" ;
5332+
5333+ while (len -- > 0 ) {
5334+ const U8 c = * cur ++ ;
5335+ * d ++ = hex [c >> 4 ];
5336+ * d ++ = hex [c & 15 ];
5337+ if (len )
5338+ * d ++ = ' ' ;
5339+ }
5340+ * d = 0 ;
5341+ Perl_croak (aTHX_ fmt , seq );
5342+ }
5343+
5344+ static void report_noncharacter (pTHX_ UV usv ) __attribute__noreturn__ ;
5345+ static void report_noncharacter (pTHX_ UV usv ) {
5346+ static const char * fmt = "Can't interchange noncharacter code point U+%" UVXf ;
5347+ Perl_croak (aTHX_ fmt , usv );
5348+ }
5349+
5350+ static STRLEN validate (pTHX_ const U8 * buf , const U8 * end , const int flags , PerlIO * handle ) {
5351+ const bool eof = PerlIO_eof (handle );
5352+ const U8 * cur = buf ;
5353+ const U8 * end4 = end - 4 ;
5354+ STRLEN skip = 0 ;
5355+ U32 v ;
5356+
5357+ while (cur < end4 ) {
5358+ while (cur < end4 && * cur < 0x80 )
5359+ cur ++ ;
5360+
5361+ check :
5362+ switch (xs_utf8_sequence_len [* cur ]) {
5363+ case 0 :
5364+ goto illformed ;
5365+ case 1 :
5366+ cur += 1 ;
5367+ break ;
5368+ case 2 :
5369+ /* 110xxxxx 10xxxxxx */
5370+ if ((cur [1 ] & 0xC0 ) != 0x80 )
5371+ goto illformed ;
5372+ cur += 2 ;
5373+ break ;
5374+ case 3 :
5375+ v = ((U32 )cur [0 ] << 16 )
5376+ | ((U32 )cur [1 ] << 8 )
5377+ | ((U32 )cur [2 ]);
5378+ /* 1110xxxx 10xxxxxx 10xxxxxx */
5379+ if ((v & 0x00F0C0C0 ) != 0x00E08080 ||
5380+ /* Non-shortest form */
5381+ v < 0x00E0A080 )
5382+ goto illformed ;
5383+ /* Surrogates U+D800..U+DFFF */
5384+ if (!(flags & ALLOW_SURROGATES ) && (v & 0x00EFA080 ) == 0x00EDA080 )
5385+ goto illformed ;
5386+ /* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */
5387+ if (!(flags & ALLOW_NONCHARACTERS ) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE ))
5388+ goto noncharacter ;
5389+ cur += 3 ;
5390+ break ;
5391+ case 4 :
5392+ v = ((U32 )cur [0 ] << 24 )
5393+ | ((U32 )cur [1 ] << 16 )
5394+ | ((U32 )cur [2 ] << 8 )
5395+ | ((U32 )cur [3 ]);
5396+ /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
5397+ if ((v & 0xF8C0C0C0 ) != 0xF0808080 ||
5398+ /* Non-shortest form */
5399+ v < 0xF0908080 ||
5400+ /* Greater than U+10FFFF */
5401+ v > 0xF48FBFBF )
5402+ goto illformed ;
5403+ /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
5404+ if (!(flags & ALLOW_NONCHARACTERS ) && (v & 0x000FBFBE ) == 0x000FBFBE )
5405+ goto noncharacter ;
5406+ cur += 4 ;
5407+ break ;
5408+ }
5409+ }
5410+
5411+ if (cur < end ) {
5412+ if (cur + xs_utf8_sequence_len [* cur ] <= end )
5413+ goto check ;
5414+ skip = skip_sequence (cur , end - cur );
5415+ if (eof || cur + skip < end )
5416+ goto illformed ;
5417+ }
5418+ return cur - buf ;
5419+
5420+ illformed :
5421+ if (!skip )
5422+ skip = skip_sequence (cur , end - cur );
5423+ PerlIOBase (handle )-> flags |= PERLIO_F_ERROR ;
5424+ report_illformed (aTHX_ cur , skip , eof );
5425+
5426+ noncharacter :
5427+ if (v < 0x10000 )
5428+ v = (v & 0x3F ) | (v & 0x1F00 ) >> 2 ;
5429+ else
5430+ v = (v & 0x3F ) | (v & 0x1F00 ) >> 2 | (v & 0x0F0000 ) >> 4 ;
5431+ PerlIOBase (handle )-> flags |= PERLIO_F_ERROR ;
5432+ report_noncharacter (aTHX_ v );
5433+ }
5434+
5435+ typedef struct {
5436+ PerlIOBuf buf ;
5437+ STDCHAR leftovers [UTF8_MAX_BYTES ];
5438+ size_t leftover_length ;
5439+ int flags ;
5440+ } PerlIOUnicode ;
5441+
5442+ static struct {
5443+ const char * name ;
5444+ size_t length ;
5445+ utf8_flags value ;
5446+ } map [] = {
5447+ { STR_WITH_LEN ("allow_surrogates" ), ALLOW_SURROGATES },
5448+ { STR_WITH_LEN ("allow_noncharacters" ), ALLOW_NONCHARACTERS },
5449+ { STR_WITH_LEN ("allow_nonshortest" ), ALLOW_NONSHORTEST },
5450+ { STR_WITH_LEN ("strict" ), STRICT_UTF8 },
5451+ { STR_WITH_LEN ("loose" ), (utf8_flags )(ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST ) },
5452+ };
5453+
5454+ static int lookup_parameter (pTHX_ const char * ptr , size_t len ) {
5455+ unsigned i ;
5456+ for (i = 0 ; i < sizeof map / sizeof * map ; ++ i ) {
5457+ if (map [i ].length == len && memcmp (ptr , map [i ].name , len ) == 0 )
5458+ return map [i ].value ;
5459+ }
5460+ Perl_croak (aTHX_ "Unknown argument to :utf8: %*s" , (int )len , ptr );
5461+ }
5462+ static int parse_parameters (pTHX_ SV * param ) {
5463+ STRLEN len ;
5464+ const char * begin , * delim ;
5465+ if (!param || !SvOK (param ))
5466+ return 0 ;
5467+
5468+ begin = SvPV (param , len );
5469+ delim = strchr (begin , ',' );
5470+ if (delim ) {
5471+ int ret = 0 ;
5472+ const char * end = begin + len ;
5473+ do {
5474+ ret |= lookup_parameter (aTHX_ begin , delim - begin );
5475+ begin = delim + 1 ;
5476+ delim = strchr (begin , ',' );
5477+ } while (delim );
5478+ if (begin < end )
5479+ ret |= lookup_parameter (aTHX_ begin , end - begin );
5480+ return ret ;
5481+ }
5482+ else {
5483+ return lookup_parameter (aTHX_ begin , len );
5484+ }
5485+ }
5486+
5487+ static IV PerlIOUnicode_pushed (pTHX_ PerlIO * f , const char * mode , SV * arg , PerlIO_funcs * tab ) {
5488+ int flags = parse_parameters (aTHX_ arg );
5489+ if (PerlIOBuf_pushed (aTHX_ f , mode , arg , tab ) == 0 ) {
5490+ PerlIOBase (f )-> flags |= PERLIO_F_UTF8 ;
5491+ PerlIOSelf (f , PerlIOUnicode )-> flags = flags ;
5492+ return 0 ;
5493+ }
5494+ return -1 ;
5495+ }
5496+
5497+ static IV PerlIOUnicode_fill (pTHX_ PerlIO * f ) {
5498+ PerlIOUnicode * const u = PerlIOSelf (f , PerlIOUnicode );
5499+ PerlIOBuf * const b = & u -> buf ;
5500+ PerlIO * n = PerlIONext (f );
5501+ SSize_t avail ;
5502+ Size_t read_bytes = 0 ;
5503+ STDCHAR * end ;
5504+ SSize_t fit ;
5505+
5506+ if (PerlIO_flush (f ) != 0 )
5507+ return -1 ;
5508+ if (PerlIOBase (f )-> flags & PERLIO_F_TTY )
5509+ PerlIOBase_flush_linebuf (aTHX );
5510+
5511+ if (!b -> buf )
5512+ PerlIO_get_base (f );
5513+
5514+ assert (b -> buf );
5515+
5516+ if (u -> leftover_length ) {
5517+ Copy (u -> leftovers , b -> buf , u -> leftover_length , STDCHAR );
5518+ b -> end = b -> buf + u -> leftover_length ;
5519+ read_bytes = u -> leftover_length ;
5520+ u -> leftover_length = 0 ;
5521+ }
5522+ else {
5523+ b -> ptr = b -> end = b -> buf ;
5524+ }
5525+ fit = (SSize_t )b -> bufsiz - (b -> end - b -> buf );
5526+
5527+ if (!PerlIOValid (n )) {
5528+ PerlIOBase (f )-> flags |= PERLIO_F_EOF ;
5529+ return -1 ;
5530+ }
5531+
5532+ if (PerlIO_fast_gets (n )) {
5533+ /*
5534+ * Layer below is also buffered. We do _NOT_ want to call its
5535+ * ->Read() because that will loop till it gets what we asked for
5536+ * which may hang on a pipe etc. Instead take anything it has to
5537+ * hand, or ask it to fill _once_.
5538+ */
5539+ avail = PerlIO_get_cnt (n );
5540+ if (avail <= 0 ) {
5541+ avail = PerlIO_fill (n );
5542+ if (avail == 0 )
5543+ avail = PerlIO_get_cnt (n );
5544+ else {
5545+ if (!PerlIO_error (n ) && PerlIO_eof (n ))
5546+ avail = 0 ;
5547+ }
5548+ }
5549+ if (avail > 0 ) {
5550+ STDCHAR * ptr = PerlIO_get_ptr (n );
5551+ const SSize_t cnt = avail ;
5552+ if (avail > fit )
5553+ avail = fit ;
5554+ Copy (ptr , b -> end , avail , STDCHAR );
5555+ PerlIO_set_ptrcnt (n , ptr + avail , cnt - avail );
5556+ read_bytes += avail ;
5557+ }
5558+ }
5559+ else {
5560+ avail = PerlIO_read (n , b -> end , fit );
5561+ if (avail > 0 )
5562+ read_bytes += avail ;
5563+ }
5564+ if (avail <= 0 ) {
5565+ if (avail < 0 || (read_bytes == 0 && PerlIO_eof (n ))) {
5566+ PerlIOBase (f )-> flags |= (avail == 0 ) ? PERLIO_F_EOF : PERLIO_F_ERROR ;
5567+ return -1 ;
5568+ }
5569+ }
5570+ end = b -> buf + read_bytes ;
5571+ b -> end = b -> buf ;
5572+ b -> end += validate (aTHX_ (const U8 * )b -> end , (const U8 * )end , u -> flags , n );
5573+ if (b -> end < end ) {
5574+ size_t len = b -> buf + read_bytes - b -> end ;
5575+ Copy (b -> end , u -> leftovers , len , char );
5576+ u -> leftover_length = len ;
5577+ }
5578+ PerlIOBase (f )-> flags |= PERLIO_F_RDBUF ;
5579+
5580+ return 0 ;
5581+ }
5582+
5583+ PERLIO_FUNCS_DECL (PerlIO_utf8 ) = {
5584+ sizeof (PerlIO_funcs ),
5585+ "utf8" ,
5586+ sizeof (PerlIOUnicode ),
5587+ PERLIO_K_BUFFERED |PERLIO_K_UTF8 ,
5588+ PerlIOUnicode_pushed ,
5589+ PerlIOBuf_popped ,
5590+ PerlIOBuf_open ,
5591+ PerlIOBase_binmode ,
5592+ NULL ,
5593+ PerlIOBase_fileno ,
5594+ PerlIOBuf_dup ,
5595+ PerlIOBuf_read ,
5596+ PerlIOBuf_unread ,
5597+ PerlIOBuf_write ,
5598+ PerlIOBuf_seek ,
5599+ PerlIOBuf_tell ,
5600+ PerlIOBuf_close ,
5601+ PerlIOBuf_flush ,
5602+ PerlIOUnicode_fill ,
5603+ PerlIOBase_eof ,
5604+ PerlIOBase_error ,
5605+ PerlIOBase_clearerr ,
5606+ PerlIOBase_setlinebuf ,
5607+ PerlIOBuf_get_base ,
5608+ PerlIOBuf_bufsiz ,
5609+ PerlIOBuf_get_ptr ,
5610+ PerlIOBuf_get_cnt ,
5611+ PerlIOBuf_set_ptrcnt ,
5612+ };
5613+
52755614PerlIO *
52765615Perl_PerlIO_stdin (pTHX )
52775616{
0 commit comments