Enhance handling of CP932 text encoding

alexdowad · alexdowad · commit 2759874a4250 · 2020-11-25T19:52:19.000+02:00
- Don't allow control characters to appear in the middle of a multi-byte
  character. (This was a strange feature of mbstring; it doesn't make much
  sense, and iconv doesn't allow it.)
- Treat truncated multi-byte characters as an error.
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c
@@ -33,6 +33,8 @@
 #include "unicode_table_cp932_ext.h"
 #include "unicode_table_jis.h"
 
+static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter);
+
 static const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -71,7 +73,7 @@ const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
 	mbfl_filt_conv_common_ctor,
 	NULL,
 	mbfl_filt_conv_cp932_wchar,
-	mbfl_filt_conv_common_flush,
+	mbfl_filt_conv_cp932_wchar_flush,
 	NULL,
 };
 
@@ -193,17 +195,11 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
 				}
 			}
 			if (w <= 0) {
-				w = (s1 << 8) | s2;
-				w &= MBFL_WCSPLANE_MASK;
-				w |= MBFL_WCSPLANE_WINCP932;
+				w = (s1 << 8) | s2 | MBFL_WCSPLANE_WINCP932;
 			}
 			CK((*filter->output_function)(w, filter->data));
-		} else if ((c >= 0 && c < 0x21) || c == 0x7f) {		/* CTLs */
-			CK((*filter->output_function)(c, filter->data));
 		} else {
-			w = (c1 << 8) | c;
-			w &= MBFL_WCSGROUP_MASK;
-			w |= MBFL_WCSGROUP_THROUGH;
+			w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
 			CK((*filter->output_function)(w, filter->data));
 		}
 		break;
@@ -216,6 +212,19 @@ mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
 	return c;
 }
 
+static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
+{
+	/* End of input for CP932 => wchar: report a truncated character, then flush downstream. */
+	if (filter->status) {
+		/* Non-zero status: input stopped mid-character. Emit filter->cache tagged as an error on the output side. */
+		(*filter->output_function)(filter->cache | MBFL_WCSGROUP_THROUGH, filter->data);
+	}
+	/* The next stage's flush hook is optional; it receives the same filter->data as output_function. */
+	if (filter->flush_function) {
+		(*filter->flush_function)(filter->data);
+	}
+	return 0;
+}
+
 /*
  * wchar => SJIS-win
  */
diff --git a/ext/mbstring/tests/cp932_encoding.phpt b/ext/mbstring/tests/cp932_encoding.phpt
@@ -0,0 +1,100 @@
+--TEST--
+Exhaustive test of CP932 encoding verification and conversion
+--SKIPIF--
+<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
+--FILE--
+<?php
+srand(4321); /* Fixed seed so that results are reproducible from run to run */
+include('encoding_tests.inc');
+mb_substitute_character(0x25); // 0x25 is '%'; used as the substitute character
+
+/* Read in the table of all characters in CP932 */
+readConversionTable(__DIR__ . '/data/CP932.txt', $validChars, $fromUnicode);
+
+/* Aside from the characters in that table, we also support a 'user' area
+ * from 0xF040-0xF9FC, which maps to Unicode 'private' codepoints 0xE000-0xE757 */
+$codepoint = 0xE000;
+for ($i = 0xF0; $i <= 0xF9; $i++) {
+	for ($j = 0x40; $j <= 0xFC; $j++) {
+		if ($j == 0x7F) // second byte 0x7F is skipped and consumes no codepoint
+			continue;
+		$utf16 = pack('n', $codepoint); // 16-bit big-endian, i.e. one UTF-16BE code unit
+		$cp932 = chr($i) . chr($j);
+		$validChars[$cp932] = $utf16;
+		$fromUnicode[$utf16] = $cp932;
+		$codepoint++;
+	}
+}
+
+/* U+00A2 is CENT SIGN; convert to FULLWIDTH CENT SIGN */
+$fromUnicode["\x00\xA2"] = "\x81\x91";
+/* U+00A3 is POUND SIGN; convert to FULLWIDTH POUND SIGN */
+$fromUnicode["\x00\xA3"] = "\x81\x92";
+
+/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE)
+ * But when converting Unicode to CP932, we also accept U+301C (WAVE DASH) */
+$fromUnicode["\x30\x1C"] = "\x81\x60";
+/* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS SIGN),
+ * but when converting Unicode to CP932, we also accept U+2212 (MINUS SIGN) */
+$fromUnicode["\x22\x12"] = "\x81\x7C";
+/* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),
+ * but when converting Unicode to CP932, we also accept U+2016
+ * (DOUBLE VERTICAL LINE) */
+$fromUnicode["\x20\x16"] = "\x81\x61";
+/* We map the JIS X 0208 NOT SIGN to U+FFE2 (FULLWIDTH NOT SIGN),
+ * but when converting Unicode to CP932, we also accept U+00AC (NOT SIGN) */
+$fromUnicode["\x00\xAC"] = "\x81\xCA";
+/* Last argument is keyed by first byte with value 2; presumably the sequence length (see encoding_tests.inc) */
+findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));
+
+findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
+
+/* There are 396 Unicode codepoints which are non-invertible in CP932
+ * (multiple CP932 byte sequences map to the same codepoint)
+ * Some of these are 3-way pile-ups. I wonder what the fine folks at MS
+ * were thinking when they designed this text encoding. */
+
+/* Everything from 0xED00-0xEEFF falls in this unfortunate category
+ * (Other sequences in 0xFA00-0xFBFF map to the same codepoints, and when
+ * converting from Unicode back to CP932, we favor the F's rather than the E's) */
+$nonInvertible = array();
+for ($i = 0xED00; $i <= 0xEEFF; $i++) {
+	$bytes = pack('n', $i);
+	if (isset($validChars[$bytes])) { // not every value in this range is in the table
+		unset($fromUnicode[$validChars[$bytes]]);
+		$nonInvertible[$bytes] = $validChars[$bytes];
+		unset($validChars[$bytes]); // will test these separately
+	}
+}
+
+/* There are 23 other collisions between 2-byte sequences which variously
+ * start with 0x81, 0x87, or 0xFA
+ * We _love_ 0x81 and use it when possible. 0x87 is a second favorite */
+for ($i = 0xFA4A; $i <= 0xFA53; $i++) {
+	$bytes = pack('n', $i);
+	unset($fromUnicode[$validChars[$bytes]]);
+	$nonInvertible[$bytes] = $validChars[$bytes];
+	unset($validChars[$bytes]); // will test these separately
+}
+foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xFA54, 0xFA58, 0xFA59, 0xFA5A, 0xFA5B] as $i) {
+	$bytes = pack('n', $i);
+	unset($fromUnicode[$validChars[$bytes]]);
+	$nonInvertible[$bytes] = $validChars[$bytes];
+	unset($validChars[$bytes]); // will test these separately
+}
+
+testAllValidChars($validChars, 'CP932', 'UTF-16BE');
+foreach ($nonInvertible as $cp932 => $unicode)
+	testValidString($cp932, $unicode, 'CP932', 'UTF-16BE', false); // NOTE(review): false presumably skips the reverse check; confirm
+echo "CP932 verification and conversion works on all valid characters\n";
+
+testAllInvalidChars($invalidChars, $validChars, 'CP932', 'UTF-16BE', "\x00%"); // "\x00%" is '%' (U+0025) as UTF-16BE
+echo "CP932 verification and conversion works on all invalid characters\n";
+
+convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
+echo "Unicode -> CP932 conversion works on all invalid codepoints\n";
+?>
+--EXPECT--
+CP932 verification and conversion works on all valid characters
+CP932 verification and conversion works on all invalid characters
+Unicode -> CP932 conversion works on all invalid codepoints
diff --git a/ext/mbstring/tests/data/CP932.txt b/ext/mbstring/tests/data/CP932.txt