Skip to content

Commit 5e5243a

Browse files
committed
CP5022{0,1,2}: convert Unicode codepoints in 'user' area (0xE000-E757) correctly
Unicode has a range of 'private' codepoints which individual applications can use for their own purposes. When they were inventing CP932, Microsoft mapped these 'private' or 'user' codepoints to twenty new rows added to the JIS X 0208 character table. (JIS X 0208 is based on a 94x94 table; MS used rows 95-114 for private characters, which is 20 rows x 94 cells = 1880 codepoints, i.e. exactly 0xE000-0xE757.)

`mbfl_filt_conv_wchar_jis_ms` converted these private codepoints to rows 85-94 rather than 95-114. The code included a link to a document on the Open Group web site, dating back to 1996 [1], which proposed mapping private codepoints to these rows. However, that is not consistent with what mbstring does when converting CP5022x to Unicode.

There seems to be a dearth of information on CP5022x on the web. However, I did find one (Japanese-language) page on CP50221, which states that it maps kuten codes 0x7F21-0x927E to the 'private' Unicode codepoints [2].

As a side note, using rows higher than 95 does seem to defeat one purpose of using an ISO-2022-JP variant: ISO-2022-JP was specifically designed to be "7-bit clean", but once you go beyond row 95, the ku codes are 0x80 and up, so 8 bits are needed.

[1] https://web.archive.org/web/20000229180004/http://www.opengroup.or.jp/jvc/cde/ucs-conv.html
[2] https://www.wdic.org/w/WDIC/Microsoft%20Windows%20Codepage%20%3A%2050221
1 parent 6e9c838 commit 5e5243a

File tree

1 file changed

+28
-31
lines changed

1 file changed

+28
-31
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c

Lines changed: 28 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ mbfl_filt_conv_wchar_jis_ms(int c, mbfl_convert_filter *filter)
389389
/* PUE => Microsoft extended (pseudo 95ku - 114ku) */
390390
/* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
391391
s = c - 0xe000;
392-
s = (s / 94 + 0x75) << 8 | (s % 94 + 0x21);
392+
s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
393393
} else if (c >= (0xe000 + 10 * 94) && c <= (0xe000 + 20 * 94)) {
394394
/* PUE => JISX0212 user-defined (G3 85ku - 94ku) */
395395
/* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
@@ -489,8 +489,8 @@ mbfl_filt_conv_wchar_jis_ms(int c, mbfl_convert_filter *filter)
489489
CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
490490
}
491491
filter->status = 0x200;
492-
CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
493-
CK((*filter->output_function)(s & 0x7f, filter->data));
492+
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
493+
CK((*filter->output_function)(s & 0xff, filter->data));
494494
} else if (s < 0x10000) { /* X 0212 */
495495
if ((filter->status & 0xff00) != 0x300) {
496496
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
@@ -600,16 +600,10 @@ mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
600600
s = ucs_i_jis_table[c - ucs_i_jis_table_min];
601601
} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
602602
s = ucs_r_jis_table[c - ucs_r_jis_table_min];
603-
} else if (c >= 0xe000 && c < (0xe000 + 10 * 94)) {
604-
/* PUE => Microsoft extended */
605-
/* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
606-
s = c - 0xe000;
607-
s = (s / 94 + 0x75) << 8 | (s % 94 + 0x21);
608-
} else if (c >= (0xe000 + 10 * 94) && c <= (0xe000 + 20 * 94)) {
609-
/* PUE => JISX0212 user-defined (G3 85ku - 94ku) */
610-
/* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
611-
s = c - (0xe000 + 10 * 94);
612-
s = (s / 94 + 0xf5) << 8 | (s % 94 + 0xa1);
603+
} else if (c >= 0xE000 && c <= 0xE757) {
604+
/* 'private'/'user' codepoints */
605+
s = c - 0xE000;
606+
s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
613607
}
614608

615609
if (s <= 0) {
@@ -631,7 +625,16 @@ mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
631625
s = 0x224c;
632626
}
633627
}
634-
if (s <= 0 || (s >= 0x8080 && s < 0x10000)) {
628+
629+
/* Above, we do a series of lookups in `ucs_*_jis_table` to find a
630+
* corresponding kuten code for this Unicode codepoint
631+
* If we get zero, that means the codepoint is not in JIS X 0208
632+
* On the other hand, if we get a result with the high bits set on both
633+
* upper and lower bytes, that is not a code in JIS X 0208 but rather
634+
* in JIS X 0213
635+
* In either case, check if this codepoint is one of the extensions added
636+
* to JIS X 0208 by MicroSoft (to make CP932) */
637+
if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
635638
int i;
636639
s = -1;
637640

@@ -697,15 +700,15 @@ mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter)
697700
filter->status = 0x500;
698701
}
699702
CK((*filter->output_function)(s - 0x80, filter->data));
700-
} else if (s < 0x8080) { /* X 0208 */
703+
} else if (s <= 0x927E) { /* X 0208 + extensions */
701704
if ((filter->status & 0xff00) != 0x200) {
702705
CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
703706
CK((*filter->output_function)(0x24, filter->data)); /* '$' */
704707
CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
705708
filter->status = 0x200;
706709
}
707-
CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
708-
CK((*filter->output_function)(s & 0x7f, filter->data));
710+
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
711+
CK((*filter->output_function)(s & 0xff, filter->data));
709712
} else if (s < 0x10000) { /* X0212 */
710713
CK(mbfl_filt_conv_illegal_output(c, filter));
711714
} else { /* X 0201 latin */
@@ -742,16 +745,10 @@ mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
742745
s = ucs_i_jis_table[c - ucs_i_jis_table_min];
743746
} else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
744747
s = ucs_r_jis_table[c - ucs_r_jis_table_min];
745-
} else if (c >= 0xe000 && c < (0xe000 + 10 * 94)) {
746-
/* PUE => Microsoft extended */
747-
/* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
748-
s = c - 0xe000;
749-
s = (s / 94 + 0x75) << 8 | (s % 94 + 0x21);
750-
} else if (c >= (0xe000 + 10 * 94) && c <= (0xe000 + 20 * 94)) {
751-
/* PUE => JISX0212 user-defined (G3 85ku - 94ku) */
752-
/* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
753-
s = c - (0xe000 + 10 * 94);
754-
s = (s / 94 + 0xf5) << 8 | (s % 94 + 0xa1);
748+
} else if (c >= 0xE000 && c <= 0xE757) {
749+
/* 'private'/'user' codepoints */
750+
s = c - 0xE000;
751+
s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
755752
}
756753

757754
if (s <= 0) {
@@ -773,7 +770,7 @@ mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
773770
s = 0x224c;
774771
}
775772
}
776-
if (s <= 0 || (s >= 0x8080 && s < 0x10000)) {
773+
if (s == 0 || ((s & 0x8000) && (s & 0x80))) {
777774
int i;
778775
s = -1;
779776

@@ -839,7 +836,7 @@ mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
839836
filter->status = 0x500;
840837
}
841838
CK((*filter->output_function)(s - 0x80, filter->data));
842-
} else if (s < 0x8080) { /* X 0208 */
839+
} else if (s <= 0x927E) { /* X 0208 */
843840
if ((filter->status & 0xff00) == 0x500) {
844841
CK((*filter->output_function)(0x0f, filter->data)); /* SO */
845842
filter->status = 0;
@@ -850,8 +847,8 @@ mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter)
850847
CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
851848
filter->status = 0x200;
852849
}
853-
CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
854-
CK((*filter->output_function)(s & 0x7f, filter->data));
850+
CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
851+
CK((*filter->output_function)(s & 0xff, filter->data));
855852
} else if (s < 0x10000) { /* X0212 */
856853
CK(mbfl_filt_conv_illegal_output(c, filter));
857854
} else { /* X 0201 latin */

0 commit comments

Comments
 (0)