@@ -11088,7 +11088,7 @@ static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, b
1108811088 continue ;
1108911089 } else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max ) {
1109011090 if (w == 0x1F9 ) {
11091- s = 0xA8Bf ;
11091+ s = 0xA8BF ;
1109211092 } else {
1109311093 s = ucs_a1_cp936_table [w - ucs_a1_cp936_table_min ];
1109411094 }
@@ -11560,6 +11560,319 @@ static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, boo
1156011560 MB_CONVERT_BUF_STORE (buf , out , limit );
1156111561}
1156211562
/* Decoding overrides for the two-byte GB18030 codes 0xFE50..0xFEA0.
 *
 * The table is indexed by (two-byte code - 0xFE50) and holds 81 entries.
 * A non-zero entry is the codepoint the GB18030-2022 decoder emits for that
 * code; every slot not listed below is zero, which tells the decoder to fall
 * through to its ordinary two-byte lookup. */
static const unsigned short gb18030_2022_pua_tbl3[] = {
	[0xFE50 - 0xFE50] = 0x0000, /* first slot, no override */
	[0xFE51 - 0xFE50] = 0xE816,
	[0xFE52 - 0xFE50] = 0xE817,
	[0xFE53 - 0xFE50] = 0xE818,
	[0xFE59 - 0xFE50] = 0x9FB4,
	[0xFE61 - 0xFE50] = 0x9FB5,
	[0xFE66 - 0xFE50] = 0x9FB6,
	[0xFE67 - 0xFE50] = 0x9FB7,
	[0xFE6C - 0xFE50] = 0xE831,
	[0xFE6D - 0xFE50] = 0x9FB8,
	[0xFE76 - 0xFE50] = 0xE83B,
	[0xFE7E - 0xFE50] = 0x9FB9,
	[0xFE90 - 0xFE50] = 0x9FBA,
	[0xFE91 - 0xFE50] = 0xE855,
	[0xFEA0 - 0xFE50] = 0x9FBB  /* last slot; fixes the array length at 81 */
};
11578+
11579+ static size_t mb_gb18030_2022_to_wchar (unsigned char * * in , size_t * in_len , uint32_t * buf , size_t bufsize , unsigned int * state )
11580+ {
11581+ unsigned char * p = * in , * e = p + * in_len ;
11582+ uint32_t * out = buf , * limit = buf + bufsize ;
11583+
11584+ while (p < e && out < limit ) {
11585+ unsigned char c = * p ++ ;
11586+
11587+ if (c < 0x80 ) {
11588+ * out ++ = c ;
11589+ } else if (c == 0x80 || c == 0xFF ) {
11590+ * out ++ = MBFL_BAD_INPUT ;
11591+ } else {
11592+ if (p == e ) {
11593+ * out ++ = MBFL_BAD_INPUT ;
11594+ break ;
11595+ }
11596+ unsigned char c2 = * p ++ ;
11597+
11598+ if (((c >= 0x81 && c <= 0x84 ) || (c >= 0x90 && c <= 0xE3 )) && c2 >= 0x30 && c2 <= 0x39 ) {
11599+ if (p >= e ) {
11600+ * out ++ = MBFL_BAD_INPUT ;
11601+ break ;
11602+ }
11603+ unsigned char c3 = * p ++ ;
11604+
11605+ if (c3 >= 0x81 && c3 <= 0xFE && p < e ) {
11606+ unsigned char c4 = * p ++ ;
11607+
11608+ if (c4 >= 0x30 && c4 <= 0x39 ) {
11609+ if (c >= 0x90 && c <= 0xE3 ) {
11610+ unsigned int w = ((((c - 0x90 )* 10 + (c2 - 0x30 ))* 126 + (c3 - 0x81 )))* 10 + (c4 - 0x30 ) + 0x10000 ;
11611+ * out ++ = (w > 0x10FFFF ) ? MBFL_BAD_INPUT : w ;
11612+ } else {
11613+ /* Unicode BMP */
11614+ unsigned int w = (((c - 0x81 )* 10 + (c2 - 0x30 ))* 126 + (c3 - 0x81 ))* 10 + (c4 - 0x30 );
11615+ if (w == 0x98A4 ) {
11616+ * out ++ = 0xE78D ;
11617+ } else if (w == 0x98A6 ) {
11618+ * out ++ = 0xE78E ;
11619+ } else if (w == 0x98A5 ) {
11620+ * out ++ = 0xE78F ;
11621+ } else if (w >= 0x98A7 && w <= 0x98AD ) {
11622+ * out ++ = w + (0xE790 - 0x98A7 );
11623+ } else if (w == 0x1D21 ) {
11624+ * out ++ = 0xE7C7 ;
11625+ } else if (w == 0x4A71 ) {
11626+ * out ++ = 0xE81E ;
11627+ } else if (w == 0x4A72 ) {
11628+ * out ++ = 0xE826 ;
11629+ } else if (w >= 0x4A73 && w <= 0x4A74 ) {
11630+ * out ++ = w + (0xE82B - 0x4A73 );
11631+ } else if (w == 0x4A75 ) {
11632+ * out ++ = 0xE832 ;
11633+ } else if (w == 0x4A76 ) {
11634+ * out ++ = 0xE843 ;
11635+ } else if (w == 0x4A77 ) {
11636+ * out ++ = 0xE854 ;
11637+ } else if (w == 0x4A78 ) {
11638+ * out ++ = 0xE864 ;
11639+ } else if (w <= 0x99FB ) {
11640+ * out ++ = w + mbfl_gb_uni_ofst [mbfl_bisec_srch (w , mbfl_gb2uni_tbl , mbfl_gb_uni_max )];
11641+ } else {
11642+ * out ++ = MBFL_BAD_INPUT ;
11643+ }
11644+ }
11645+ } else {
11646+ * out ++ = MBFL_BAD_INPUT ;
11647+ }
11648+ } else {
11649+ * out ++ = MBFL_BAD_INPUT ;
11650+ }
11651+ } else if (((c >= 0xAA && c <= 0xAF ) || (c >= 0xF8 && c <= 0xFE )) && (c2 >= 0xA1 && c2 <= 0xFE )) {
11652+ /* UDA part 1, 2: U+E000-U+E4C5 */
11653+ * out ++ = 94 * (c >= 0xF8 ? c - 0xF2 : c - 0xAA ) + (c2 - 0xA1 ) + 0xE000 ;
11654+ } else if (c >= 0xA1 && c <= 0xA7 && c2 >= 0x40 && c2 < 0xA1 && c2 != 0x7F ) {
11655+ /* UDA part 3: U+E4C6-U+E765 */
11656+ * out ++ = 96 * (c - 0xA1 ) + c2 - (c2 >= 0x80 ? 0x41 : 0x40 ) + 0xE4C6 ;
11657+ } else if (c2 >= 0x40 && c2 != 0x7F && c2 != 0xFF ) {
11658+ unsigned int w = (c - 0x81 )* 192 + c2 - 0x40 ;
11659+
11660+ if (w >= 0x192B ) {
11661+ if (w <= 0x1EBE ) {
11662+ if (w != 0x1963 && w != 0x1DBF && (w < 0x1E49 || w > 0x1E55 ) && w != 0x1E7F ) {
11663+ * out ++ = gb18030_2022_pua_tbl1 [w - 0x192B ];
11664+ continue ;
11665+ }
11666+ } else if (w >= 0x413A ) {
11667+ if (w <= 0x413E ) {
11668+ * out ++ = cp936_pua_tbl2 [w - 0x413A ];
11669+ continue ;
11670+ } else if (w >= 0x5DD0 && w <= 0x5E20 ) {
11671+ unsigned int c = gb18030_2022_pua_tbl3 [w - 0x5DD0 ];
11672+ if (c ) {
11673+ * out ++ = c ;
11674+ continue ;
11675+ }
11676+ }
11677+ }
11678+ }
11679+
11680+ if ((c >= 0x81 && c <= 0xA9 ) || (c >= 0xB0 && c <= 0xF7 && c2 >= 0xA1 ) || (c >= 0xAA && c <= 0xFE && c2 <= 0xA0 )) {
11681+ ZEND_ASSERT (w < cp936_ucs_table_size );
11682+ * out ++ = cp936_ucs_table [w ];
11683+ } else {
11684+ * out ++ = MBFL_BAD_INPUT ;
11685+ }
11686+ } else {
11687+ * out ++ = MBFL_BAD_INPUT ;
11688+ }
11689+ }
11690+ }
11691+
11692+ * in_len = e - p ;
11693+ * in = p ;
11694+ return out - buf ;
11695+ }
11696+
11697+ static void mb_wchar_to_gb18030_2022 (uint32_t * in , size_t len , mb_convert_buf * buf , bool end )
11698+ {
11699+ unsigned char * out , * limit ;
11700+ MB_CONVERT_BUF_LOAD (buf , out , limit );
11701+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len );
11702+
11703+ while (len -- ) {
11704+ uint32_t w = * in ++ ;
11705+ unsigned int s = 0 ;
11706+
11707+ if (w == 0 ) {
11708+ out = mb_convert_buf_add (out , 0 );
11709+ continue ;
11710+ } else if (w >= ucs_a1_cp936_table_min && w < ucs_a1_cp936_table_max ) {
11711+ if (w == 0x1F9 ) {
11712+ s = 0xA8BF ;
11713+ } else {
11714+ s = ucs_a1_cp936_table [w - ucs_a1_cp936_table_min ];
11715+ }
11716+ } else if (w >= ucs_a2_cp936_table_min && w < ucs_a2_cp936_table_max ) {
11717+ if (w == 0x20AC ) { /* Euro sign */
11718+ s = 0xA2E3 ;
11719+ } else {
11720+ s = ucs_a2_cp936_table [w - ucs_a2_cp936_table_min ];
11721+ }
11722+ } else if (w >= ucs_a3_cp936_table_min && w < ucs_a3_cp936_table_max ) {
11723+ s = ucs_a3_cp936_table [w - ucs_a3_cp936_table_min ];
11724+ } else if (w >= 0x9FB4 && w <= 0x9FBB ) {
11725+ /* Newly mapped in GB18030-2022 */
11726+ if (w == 0x9FB4 ) {
11727+ s = 0xFE59 ;
11728+ } else if (w == 0x9FB5 ) {
11729+ s = 0xFE61 ;
11730+ } else if (w == 0x9FB6 ) {
11731+ s = 0xFE66 ;
11732+ } else if (w == 0x9FB7 ) {
11733+ s = 0xFE67 ;
11734+ } else if (w == 0x9FB8 ) {
11735+ s = 0xFE6D ;
11736+ } else if (w == 0x9FB9 ) {
11737+ s = 0xFE7E ;
11738+ } else if (w == 0x9FBA ) {
11739+ s = 0xFE90 ;
11740+ } else {
11741+ s = 0xFEA0 ;
11742+ }
11743+ } else if (w >= ucs_i_cp936_table_min && w < ucs_i_cp936_table_max ) {
11744+ s = ucs_i_cp936_table [w - ucs_i_cp936_table_min ];
11745+ } else if (w >= ucs_ci_cp936_table_min && w < ucs_ci_cp936_table_max ) {
11746+ /* U+F900-U+FA2F CJK Compatibility Ideographs */
11747+ if (w == 0xF92C ) {
11748+ s = 0xFD9C ;
11749+ } else if (w == 0xF979 ) {
11750+ s = 0xFD9D ;
11751+ } else if (w == 0xF995 ) {
11752+ s = 0xFD9E ;
11753+ } else if (w == 0xF9E7 ) {
11754+ s = 0xFD9F ;
11755+ } else if (w == 0xF9F1 ) {
11756+ s = 0xFDA0 ;
11757+ } else if (w >= 0xFA0C && w <= 0xFA29 ) {
11758+ s = ucs_ci_s_cp936_table [w - 0xFA0C ];
11759+ }
11760+ } else if (w >= ucs_cf_cp936_table_min && w < ucs_cf_cp936_table_max ) {
11761+ /* CJK Compatibility Forms */
11762+ s = ucs_cf_cp936_table [w - ucs_cf_cp936_table_min ];
11763+ } else if (w >= ucs_sfv_cp936_table_min && w < ucs_sfv_cp936_table_max ) {
11764+ /* U+FE50-U+FE6F Small Form Variants */
11765+ s = ucs_sfv_cp936_table [w - ucs_sfv_cp936_table_min ];
11766+ } else if (w >= ucs_hff_cp936_table_min && w < ucs_hff_cp936_table_max ) {
11767+ /* U+FF00-U+FFFF HW/FW Forms */
11768+ if (w == 0xFF04 ) {
11769+ s = 0xA1E7 ;
11770+ } else if (w == 0xFF5E ) {
11771+ s = 0xA1AB ;
11772+ } else if (w >= 0xFF01 && w <= 0xFF5D ) {
11773+ s = w - 0xFF01 + 0xA3A1 ;
11774+ } else if (w >= 0xFFE0 && w <= 0xFFE5 ) {
11775+ s = ucs_hff_s_cp936_table [w - 0xFFE0 ];
11776+ }
11777+ } else if (w >= 0xE000 && w <= 0xE864 ) {
11778+ /* PUA */
11779+ if (w < 0xE766 ) {
11780+ if (w < 0xE4C6 ) {
11781+ unsigned int c1 = w - 0xE000 ;
11782+ s = (c1 % 94 ) + 0xA1 ;
11783+ c1 /= 94 ;
11784+ s |= (c1 + (c1 < 0x06 ? 0xAA : 0xF2 )) << 8 ;
11785+ } else {
11786+ unsigned int c1 = w - 0xE4C6 ;
11787+ s = ((c1 / 96 ) + 0xA1 ) << 8 ;
11788+ c1 %= 96 ;
11789+ s |= c1 + (c1 >= 0x3F ? 0x41 : 0x40 );
11790+ }
11791+ } else {
11792+ /* U+E766-U+E864 */
11793+ unsigned int k1 = 0 , k2 = mbfl_gb18030_2022_pua_tbl_max ;
11794+ while (k1 < k2 ) {
11795+ unsigned int k = (k1 + k2 ) >> 1 ;
11796+ if (w < mbfl_gb18030_2022_pua_tbl [k ][0 ]) {
11797+ k2 = k ;
11798+ } else if (w > mbfl_gb18030_2022_pua_tbl [k ][1 ]) {
11799+ k1 = k + 1 ;
11800+ } else {
11801+ s = w - mbfl_gb18030_2022_pua_tbl [k ][0 ] + mbfl_gb18030_2022_pua_tbl [k ][2 ];
11802+ break ;
11803+ }
11804+ }
11805+ }
11806+ } else if (w >= 0xFE10 && w <= 0xFE19 ) {
11807+ /* Newly mapped codepoints in GB18030-2022 */
11808+ if (w == 0xFE11 ) {
11809+ s = 0xA6DB ;
11810+ } else if (w == 0xFE12 ) {
11811+ s = 0xA6DA ;
11812+ } else if (w <= 0xFE16 ) {
11813+ s = w - (0xFE10 - 0xA6D9 );
11814+ } else if (w <= 0xFE18 ) {
11815+ s = w - (0xFE17 - 0xA6EC );
11816+ } else {
11817+ s = 0xA6F3 ;
11818+ }
11819+ } else if (w == 0x1E3F ) {
11820+ /* Newly mapped codepoint in GB18030-2022 */
11821+ s = 0xA8BC ;
11822+ }
11823+
11824+ /* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
11825+ * do a binary search in a table of differing codepoints to see if we have one */
11826+ if (!s && w >= mbfl_gb18030_c_tbl_key [0 ] && w <= mbfl_gb18030_c_tbl_key [mbfl_gb18030_c_tbl_max - 1 ]) {
11827+ int i = mbfl_bisec_srch2 (w , mbfl_gb18030_c_tbl_key , mbfl_gb18030_c_tbl_max );
11828+ if (i >= 0 ) {
11829+ s = mbfl_gb18030_c_tbl_val [i ];
11830+ }
11831+ }
11832+
11833+ /* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
11834+ if (!s && w >= 0x80 && w <= 0xFFFF ) {
11835+ /* BMP */
11836+ int i = mbfl_bisec_srch (w , mbfl_uni2gb2022_tbl , mbfl_gb2022_uni_max );
11837+ if (i >= 0 ) {
11838+ unsigned int c1 = w - mbfl_gb2022_uni_ofst [i ];
11839+ s = (c1 % 10 ) + 0x30 ;
11840+ c1 /= 10 ;
11841+ s |= ((c1 % 126 ) + 0x81 ) << 8 ;
11842+ c1 /= 126 ;
11843+ s |= ((c1 % 10 ) + 0x30 ) << 16 ;
11844+ c1 /= 10 ;
11845+ s |= (c1 + 0x81 ) << 24 ;
11846+ }
11847+ } else if (w >= 0x10000 && w <= 0x10FFFF ) {
11848+ /* Code set 3: Unicode U+10000-U+10FFFF */
11849+ unsigned int c1 = w - 0x10000 ;
11850+ s = (c1 % 10 ) + 0x30 ;
11851+ c1 /= 10 ;
11852+ s |= ((c1 % 126 ) + 0x81 ) << 8 ;
11853+ c1 /= 126 ;
11854+ s |= ((c1 % 10 ) + 0x30 ) << 16 ;
11855+ c1 /= 10 ;
11856+ s |= (c1 + 0x90 ) << 24 ;
11857+ }
11858+
11859+ if (!s ) {
11860+ MB_CONVERT_ERROR (buf , out , limit , w , mb_wchar_to_gb18030 );
11861+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len );
11862+ } else if (s < 0x80 ) {
11863+ out = mb_convert_buf_add (out , s );
11864+ } else if (s > 0xFFFFFF ) {
11865+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len + 4 );
11866+ out = mb_convert_buf_add4 (out , (s >> 24 ) & 0xFF , (s >> 16 ) & 0xFF , (s >> 8 ) & 0xFF , s & 0xFF );
11867+ } else {
11868+ MB_CONVERT_BUF_ENSURE (buf , out , limit , len + 2 );
11869+ out = mb_convert_buf_add2 (out , (s >> 8 ) & 0xFF , s & 0xFF );
11870+ }
11871+ }
11872+
11873+ MB_CONVERT_BUF_STORE (buf , out , limit );
11874+ }
11875+
1156311876/* Step through a GB18030 string one character at a time. Find the last position at or
1156411877 * before `limit` which falls directly after the end of a (single or multi-byte) character */
1156511878static zend_always_inline unsigned char * step_through_gb18030_str (unsigned char * p , unsigned char * limit )
@@ -11673,6 +11986,21 @@ const mbfl_encoding mbfl_encoding_cp936 = {
1167311986 NULL ,
1167411987};
1167511988
/* Descriptor object for the GB18030-2022 encoding.
 *
 * The initializer is positional, so the meaning of each slot is fixed by the
 * field order of `mbfl_encoding`, which is declared elsewhere. The per-slot
 * notes below are therefore assumptions to be confirmed against that
 * declaration. What is visible here: both name strings are "GB18030-2022",
 * the decoder/encoder pair mb_gb18030_2022_to_wchar and
 * mb_wchar_to_gb18030_2022 from this file is plugged in, and the final slot
 * uses mb_cut_gb18030. */
11989+ const mbfl_encoding mbfl_encoding_gb18030_2022 = {
11990+ mbfl_no_encoding_gb18030_2022 , /* encoding identifier constant */
11991+ "GB18030-2022" , /* presumably the canonical name -- TODO confirm */
11992+ "GB18030-2022" , /* presumably the MIME name -- TODO confirm */
11993+ NULL ,
11994+ NULL ,
11995+ MBFL_ENCTYPE_GL_UNSAFE , /* flag value; semantics defined with the MBFL_ENCTYPE_* constants */
11996+ NULL ,
11997+ NULL ,
11998+ mb_gb18030_2022_to_wchar , /* bytes -> codepoints */
11999+ mb_wchar_to_gb18030_2022 , /* codepoints -> bytes */
12000+ NULL ,
12001+ mb_cut_gb18030 , /* NOTE(review): looks shared with the other GB18030 descriptor -- confirm */
12002+ };
12003+
1167612004/*
1167712005 * BIG5/CP950
1167812006 */
0 commit comments