Skip to content

Commit 4f3bd2e

Browse files
committed
Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants
Converting U+203E to 0x7E was especially wrong for CP932, where 0x7E represents a tilde. For vanilla Shift-JIS and Shift-JIS-2004, converting to 0x7E is acceptable, since 0x7E does represent an overline/macron in those encodings. CP51932 is closely related to CP932, so apply the same fix there: convert U+203E to FULLWIDTH MACRON instead of 0x7E.
1 parent 0d0029d commit 4f3bd2e

File tree

9 files changed

+14
-9
lines changed

9 files changed

+14
-9
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp51932.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,6 @@ mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter)
216216
if (s1 <= 0) {
217217
if (c == 0xa5) { /* YEN SIGN */
218218
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
219-
} else if (c == 0x203e) { /* OVER LINE */
220-
s1 = 0x007e; /* FULLWIDTH MACRON */
221219
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
222220
s1 = 0x2140;
223221
} else if (c == 0xff5e) { /* FULLWIDTH TILDE */

ext/mbstring/libmbfl/filters/mbfilter_cp932.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -253,8 +253,6 @@ mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
253253
if (s1 <= 0) {
254254
if (c == 0xa5) { /* YEN SIGN */
255255
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
256-
} else if (c == 0x203e) { /* OVER LINE */
257-
s1 = 0x007e; /* FULLWIDTH MACRON */
258256
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
259257
s1 = 0x2140;
260258
} else if (c == 0xff5e) { /* FULLWIDTH TILDE */

ext/mbstring/libmbfl/filters/mbfilter_sjis.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,8 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
211211
/* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or
212212
* macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */
213213
s1 = 0x2141;
214+
} else if (c == 0x203E) { /* U+203E is OVERLINE */
215+
s1 = 0x7E; /* Halfwidth overline/macron */
214216
} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
215217
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
216218
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
@@ -223,8 +225,6 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
223225
if (s1 <= 0) {
224226
if (c == 0xA5) { /* YEN SIGN */
225227
s1 = 0x5C;
226-
} else if (c == 0x203E) { /* OVER LINE */
227-
s1 = 0x7E;
228228
} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
229229
s1 = 0x2140;
230230
} else if (c == 0xFF5E) { /* FULLWIDTH TILDE */

ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -749,8 +749,6 @@ int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter)
749749
if (s1 <= 0) {
750750
if (c == 0xA5) { /* YEN SIGN */
751751
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
752-
} else if (c == 0x203E) { /* OVER LINE */
753-
s1 = 0x2131; /* FULLWIDTH MACRON */
754752
} else if (c == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */
755753
s1 = 0x2140;
756754
} else if (c == 0xFF5E) { /* FULLWIDTH TILDE */

ext/mbstring/libmbfl/filters/unicode_table_jis.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2444,7 +2444,7 @@ const unsigned short ucs_a2_jis_table[] = {
24442444
0x2277,0x2278,0x0000,0x0000,0x0000,0x2145,0x2144,0x0000,
24452445
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
24462446
0x2273,0x0000,0x216C,0x216D,0x0000,0x0000,0x0000,0x0000,
2447-
0x0000,0x0000,0x0000,0x2228,0x0000,0x0000,0x0000,0x0000,
2447+
0x0000,0x0000,0x0000,0x2228,0x0000,0x0000,0x2131,0x0000,
24482448
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
24492449
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
24502450
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,

ext/mbstring/tests/cp51932_encoding.phpt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ for ($i = 0; $i <= 0x7F; $i++)
8686

8787
/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */
8888
$fromUnicode["\x00\xA5"] = "\xA1\xEF";
89+
/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
90+
$fromUnicode["\x20\x3E"] = "\xA1\xB1";
8991

9092
testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
9193
testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);

ext/mbstring/tests/cp932_encoding.phpt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ $fromUnicode["\x20\x16"] = "\x81\x61";
4747
* but when converting Unicode to CP932, we also accept U+00AC (NOT SIGN) */
4848
$fromUnicode["\x00\xAC"] = "\x81\xCA";
4949

50+
/* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */
51+
$fromUnicode["\x20\x3E"] = "\x81\x50";
52+
5053
findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));
5154

5255
findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));

ext/mbstring/tests/eucjp_encoding.phpt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ $fromUnicode["\x00\x00\x00\x7E"] = "\x7E";
4343
/* Likewise with 0x005C */
4444
$fromUnicode["\x00\x00\x00\x5C"] = "\x5C";
4545

46+
/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
47+
$fromUnicode["\x00\x00\x20\x3E"] = "\xA1\xB1";
48+
4649
findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA1, 0xFE), 2) + array(0x8E => 2, 0x8F => 3));
4750

4851
/* In the JIS X 0212 character set, kuten code 0x2237 (EUC-JP 0x8FA2B7)

ext/mbstring/tests/sjismac_encoding.phpt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ $fromUnicode["\x00\x7F"] = "\x7F";
6262
* and U+2015 */
6363
$fromUnicode["\x20\x15"] = "\x81\x5C";
6464

65+
/* Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) */
66+
$fromUnicode["\x20\x3E"] = "\x81\x50";
67+
6568
testAllValidChars($validChars, 'SJIS-mac', 'UTF-32BE');
6669
echo "MacJapanese verification and conversion works on all valid characters\n";
6770

0 commit comments

Comments
 (0)