|
--TEST--
Exhaustive test of CP932 encoding verification and conversion
--SKIPIF--
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
srand(4321); /* Fixed seed, so that every run gives the same results */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Load the table of all characters in CP932 */
readConversionTable(__DIR__ . '/data/CP932.txt', $validChars, $fromUnicode);

/* On top of the characters in that table, a 'user' area at 0xF040-0xF9FC is
 * also supported; it maps onto the Unicode 'private' codepoints 0xE000-0xE757.
 * Trail byte 0x7F is skipped and does not consume a codepoint. */
$codepoint = 0xE000;
foreach (range(0xF0, 0xF9) as $leadByte) {
    foreach (range(0x40, 0xFC) as $trailByte) {
        if ($trailByte !== 0x7F) {
            $utf16 = pack('n', $codepoint++);
            $cp932 = chr($leadByte) . chr($trailByte);
            $validChars[$cp932] = $utf16;
            $fromUnicode[$utf16] = $cp932;
        }
    }
}

/* Extra mappings which are only registered for the Unicode -> CP932 direction.
 * Keys are UTF-16BE bytes, values are CP932 bytes. */
$extraFromUnicode = [
    /* U+00A2 is CENT SIGN; convert to FULLWIDTH CENT SIGN */
    "\x00\xA2" => "\x81\x91",
    /* U+00A3 is POUND SIGN; convert to FULLWIDTH POUND SIGN */
    "\x00\xA3" => "\x81\x92",
    /* The JIS X 0208 FULLWIDTH TILDE is mapped to U+FF5E (FULLWIDTH TILDE),
     * but U+301C (WAVE DASH) is accepted as well when converting to CP932 */
    "\x30\x1C" => "\x81\x60",
    /* The JIS X 0208 MINUS SIGN is mapped to U+FF0D (FULLWIDTH HYPHEN-MINUS),
     * but U+2212 (MINUS SIGN) is accepted as well when converting to CP932 */
    "\x22\x12" => "\x81\x7C",
    /* The JIS X 0208 PARALLEL TO symbol is mapped to U+2225 (PARALLEL TO),
     * but U+2016 (DOUBLE VERTICAL LINE) is accepted as well when converting
     * to CP932 */
    "\x20\x16" => "\x81\x61",
    /* The JIS X 0208 NOT SIGN is mapped to U+FFE2 (FULLWIDTH NOT SIGN),
     * but U+00AC (NOT SIGN) is accepted as well when converting to CP932 */
    "\x00\xAC" => "\x81\xCA",
];
foreach ($extraFromUnicode as $unicodeBytes => $cp932Bytes) {
    $fromUnicode[$unicodeBytes] = $cp932Bytes;
}

/* Keys are the first bytes 0x81-0x9F and 0xE0-0xFC, each mapped to 2
 * (presumably the length of the sequence which that byte starts; confirm
 * against findInvalidChars() in encoding_tests.inc) */
$cp932LeadBytes = array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2);
findInvalidChars($validChars, $invalidChars, $truncated, $cp932LeadBytes);

/* Same for the UTF-16BE side: every possible first byte is mapped to 2 */
$utf16LeadBytes = array_fill_keys(range(0, 0xFF), 2);
findInvalidChars($fromUnicode, $invalidCodepoints, $unused, $utf16LeadBytes);

/* There are 396 Unicode codepoints which are non-invertible in CP932
 * (multiple CP932 byte sequences map to the same codepoint)
 * Some of these are 3-way pile-ups. One wonders what the designers of this
 * text encoding at MS were thinking. */
$nonInvertible = [];

/* Take one CP932 sequence out of the round-trip tables: drop the reverse
 * mapping for its codepoint, remember the pair in $nonInvertible, and remove
 * it from $validChars so that it can be tested separately further down */
$setAsideNonInvertible = function (string $bytes) use (&$validChars, &$fromUnicode, &$nonInvertible): void {
    unset($fromUnicode[$validChars[$bytes]]);
    $nonInvertible[$bytes] = $validChars[$bytes];
    unset($validChars[$bytes]); // will test these separately
};

/* Everything from 0xED00-0xEEFF falls in this unfortunate category
 * (other sequences in 0xFA00-0xFBFF map to the same codepoints, and when
 * converting from Unicode back to CP932, the 0xFAxx/0xFBxx forms are
 * preferred over the 0xEDxx/0xEExx ones).
 * Not every value in the range is a valid character, hence the isset(). */
foreach (range(0xED00, 0xEEFF) as $code) {
    $bytes = pack('n', $code);
    if (isset($validChars[$bytes])) {
        $setAsideNonInvertible($bytes);
    }
}

/* There are 23 other collisions between 2-byte sequences which variously
 * start with 0x81, 0x87, or 0xFA
 * 0x81 is used whenever possible; 0x87 is the second choice */
$otherCollisions = array_merge(
    range(0xFA4A, 0xFA53),
    [0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xFA54, 0xFA58, 0xFA59, 0xFA5A, 0xFA5B]
);
foreach ($otherCollisions as $code) {
    $setAsideNonInvertible(pack('n', $code));
}

testAllValidChars($validChars, 'CP932', 'UTF-16BE');
/* The final `false` presumably turns off the reverse (Unicode -> CP932)
 * check for these sequences; confirm in encoding_tests.inc */
foreach ($nonInvertible as $cp932 => $unicode) {
    testValidString($cp932, $unicode, 'CP932', 'UTF-16BE', false);
}
echo "CP932 verification and conversion works on all valid characters\n";

/* "\x00%" is the substitute character '%' encoded as UTF-16BE */
testAllInvalidChars($invalidChars, $validChars, 'CP932', 'UTF-16BE', "\x00%");
echo "CP932 verification and conversion works on all invalid characters\n";

convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
echo "Unicode -> CP932 conversion works on all invalid codepoints\n";
?>
--EXPECT--
CP932 verification and conversion works on all valid characters
CP932 verification and conversion works on all invalid characters
Unicode -> CP932 conversion works on all invalid codepoints
0 commit comments