Skip to content

Commit 2759874

Browse files
committed
Enhance handling of CP932 text encoding
- Don't allow control characters to appear in the middle of a multi-byte character. (This was a strange feature of mbstring; it doesn't make much sense, and iconv doesn't allow it.)
- Treat truncated multi-byte characters as an error.
1 parent b489c1b commit 2759874

File tree

3 files changed

+8116
-9
lines changed

3 files changed

+8116
-9
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cp932.c

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
#include "unicode_table_cp932_ext.h"
3434
#include "unicode_table_jis.h"
3535

36+
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter);
37+
3638
static const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
3739
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3840
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -71,7 +73,7 @@ const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
7173
mbfl_filt_conv_common_ctor,
7274
NULL,
7375
mbfl_filt_conv_cp932_wchar,
74-
mbfl_filt_conv_common_flush,
76+
mbfl_filt_conv_cp932_wchar_flush,
7577
NULL,
7678
};
7779

@@ -193,17 +195,11 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
193195
}
194196
}
195197
if (w <= 0) {
196-
w = (s1 << 8) | s2;
197-
w &= MBFL_WCSPLANE_MASK;
198-
w |= MBFL_WCSPLANE_WINCP932;
198+
w = (s1 << 8) | s2 | MBFL_WCSPLANE_WINCP932;
199199
}
200200
CK((*filter->output_function)(w, filter->data));
201-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
202-
CK((*filter->output_function)(c, filter->data));
203201
} else {
204-
w = (c1 << 8) | c;
205-
w &= MBFL_WCSGROUP_MASK;
206-
w |= MBFL_WCSGROUP_THROUGH;
202+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
207203
CK((*filter->output_function)(w, filter->data));
208204
}
209205
break;
@@ -216,6 +212,19 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
216212
return c;
217213
}
218214

215+
/*
 * Flush handler for the CP932 => wchar conversion filter.
 *
 * If filter->status is non-zero when the input ends, a multi-byte character
 * was left unfinished (presumably only its lead byte, held in filter->cache,
 * was seen -- confirm against the conversion function). That truncated
 * character is reported downstream as an error by emitting the cached value
 * tagged with MBFL_WCSGROUP_THROUGH, then the next stage is flushed.
 *
 * filter: conversion filter being flushed.
 * Returns 0 unconditionally; results of the downstream calls are not
 * propagated.
 */
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		int pending = filter->cache;

		/* Clear the pending state before emitting so nothing stale is left
		 * behind once the filter has been flushed. */
		filter->status = 0;

		/* Hand the marker straight to the next stage. It must not go back
		 * through filter->filter_function: that is this filter's per-byte
		 * entry point and would interpret the marker as an input byte. */
		(*filter->output_function)(pending | MBFL_WCSGROUP_THROUGH, filter->data);
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
227+
219228
/*
220229
* wchar => SJIS-win
221230
*/
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
--TEST--
2+
Exhaustive test of CP932 encoding verification and conversion
3+
--SKIPIF--
4+
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
5+
--FILE--
6+
<?php
7+
srand(4321); /* Make results consistent */
8+
include('encoding_tests.inc');
9+
mb_substitute_character(0x25); // '%'
10+
11+
/* Read in the table of all characters in CP932 */
12+
readConversionTable(__DIR__ . '/data/CP932.txt', $validChars, $fromUnicode);
13+
14+
/* Aside from the characters in that table, we also support a 'user' area
15+
* from 0xF040-0xF9FC, which maps to Unicode 'private' codepoints 0xE000-0xE757 */
16+
$codepoint = 0xE000;
17+
for ($i = 0xF0; $i <= 0xF9; $i++) {
18+
for ($j = 0x40; $j <= 0xFC; $j++) {
19+
if ($j == 0x7F)
20+
continue;
21+
$utf16 = pack('n', $codepoint);
22+
$cp932 = chr($i) . chr($j);
23+
$validChars[$cp932] = $utf16;
24+
$fromUnicode[$utf16] = $cp932;
25+
$codepoint++;
26+
}
27+
}
28+
29+
/* U+00A2 is CENT SIGN; convert to FULLWIDTH CENT SIGN */
30+
$fromUnicode["\x00\xA2"] = "\x81\x91";
31+
/* U+00A3 is POUND SIGN; convert to FULLWIDTH POUND SIGN */
32+
$fromUnicode["\x00\xA3"] = "\x81\x92";
33+
34+
/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE)
35+
* But when converting Unicode to CP932, we also accept U+301C (WAVE DASH) */
36+
$fromUnicode["\x30\x1C"] = "\x81\x60";
37+
/* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS SIGN),
38+
* but when converting Unicode to CP932, we also accept U+2212 (MINUS SIGN) */
39+
$fromUnicode["\x22\x12"] = "\x81\x7C";
40+
/* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),
41+
* but when converting Unicode to CP932, we also accept U+2016
42+
* (DOUBLE VERTICAL LINE) */
43+
$fromUnicode["\x20\x16"] = "\x81\x61";
44+
/* We map the JIS X 0208 NOT SIGN to U+FFE2 (FULLWIDTH NOT SIGN),
45+
* but when converting Unicode to CP932, we also accept U+00AC (NOT SIGN) */
46+
$fromUnicode["\x00\xAC"] = "\x81\xCA";
47+
48+
findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));
49+
50+
findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
51+
52+
/* There are 396 Unicode codepoints which are non-invertible in CP932
53+
* (multiple CP932 byte sequences map to the same codepoint)
54+
* Some of these are 3-way pile-ups. I wonder what the fine folks at MS
55+
* were thinking when they designed this text encoding. */
56+
57+
/* Everything from 0xED00-0xEEFF falls in this unfortunate category
58+
* (Other sequences in 0xFA00-0xFBFF map to the same codepoints, and when
59+
* converting from Unicode back to CP932, we favor the F's rather than the E's) */
60+
$nonInvertible = array();
61+
for ($i = 0xED00; $i <= 0xEEFF; $i++) {
62+
$bytes = pack('n', $i);
63+
if (isset($validChars[$bytes])) {
64+
unset($fromUnicode[$validChars[$bytes]]);
65+
$nonInvertible[$bytes] = $validChars[$bytes];
66+
unset($validChars[$bytes]); // will test these separately
67+
}
68+
}
69+
70+
/* There are 23 other collisions between 2-byte sequences which variously
71+
* start with 0x81, 0x87, or 0xFA
72+
* We _love_ 0x81 and use it when possible. 0x87 is a second favorite */
73+
for ($i = 0xFA4A; $i <= 0xFA53; $i++) {
74+
$bytes = pack('n', $i);
75+
unset($fromUnicode[$validChars[$bytes]]);
76+
$nonInvertible[$bytes] = $validChars[$bytes];
77+
unset($validChars[$bytes]); // will test these separately
78+
}
79+
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xFA54, 0xFA58, 0xFA59, 0xFA5A, 0xFA5B] as $i) {
80+
$bytes = pack('n', $i);
81+
unset($fromUnicode[$validChars[$bytes]]);
82+
$nonInvertible[$bytes] = $validChars[$bytes];
83+
unset($validChars[$bytes]); // will test these separately
84+
}
85+
86+
testAllValidChars($validChars, 'CP932', 'UTF-16BE');
87+
foreach ($nonInvertible as $cp932 => $unicode)
88+
testValidString($cp932, $unicode, 'CP932', 'UTF-16BE', false);
89+
echo "CP932 verification and conversion works on all valid characters\n";
90+
91+
testAllInvalidChars($invalidChars, $validChars, 'CP932', 'UTF-16BE', "\x00%");
92+
echo "CP932 verification and conversion works on all invalid characters\n";
93+
94+
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
95+
echo "Unicode -> CP932 conversion works on all invalid codepoints\n";
96+
?>
97+
--EXPECT--
98+
CP932 verification and conversion works on all valid characters
99+
CP932 verification and conversion works on all invalid characters
100+
Unicode -> CP932 conversion works on all invalid codepoints

0 commit comments

Comments
 (0)