diff --git a/webencodings/__init__.py b/webencodings/__init__.py
index 828126d..11212b9 100644
--- a/webencodings/__init__.py
+++ b/webencodings/__init__.py
@@ -22,12 +22,18 @@
 VERSION = '0.6-dev'
 
 
-# Some names in Encoding are not valid Python aliases. Remap these.
 PYTHON_NAMES = {
-    'iso-8859-8-i': 'iso-8859-8',
+    # Some names in Encoding are not valid Python aliases. Remap these:
+    'iso-8859-8-i': 'iso8859-8',
     'x-mac-cyrillic': 'mac-cyrillic',
     'macintosh': 'mac-roman',
-    'windows-874': 'cp874'}
+    'windows-874': 'cp874',
+    # Some WHATWG-defined names conflict with a Python alias for an
+    # incompatible codec. These should be remapped to the correct one:
+    'shift_jis': 'cp932',
+    'big5': 'big5hkscs',
+    'euc-kr': 'cp949',
+}
 
 CACHE = {}
 
diff --git a/webencodings/tests.py b/webencodings/tests.py
index e12c10d..a35bec3 100644
--- a/webencodings/tests.py
+++ b/webencodings/tests.py
@@ -14,7 +14,7 @@
 from __future__ import unicode_literals
 
 from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
-               IncrementalDecoder, IncrementalEncoder, UTF8)
+               IncrementalDecoder, IncrementalEncoder, UTF8, PYTHON_NAMES)
 
 
 def assert_raises(exception, function, *args, **kwargs):
@@ -45,6 +45,21 @@ def test_labels():
     assert lookup('LATİN1') is None  # ASCII-only case insensitivity.
 
 
+def test_remapping():
+    def codec_name(name):
+        encoding = lookup(name)
+        assert encoding is not None
+        return encoding.codec_info.name
+
+    assert codec_name('iso-8859-8-i') == 'iso8859-8'
+    assert codec_name('x-mac-cyrillic') == 'mac-cyrillic'
+    assert codec_name('macintosh') == 'mac-roman'
+    assert codec_name('windows-874') == 'cp874'
+    assert codec_name('shift_jis') == 'cp932'
+    assert codec_name('big5') == 'big5hkscs'
+    assert codec_name('euc-kr') == 'cp949'
+
+
 def test_all_labels():
     for label in LABELS:
         assert decode(b'', label) == ('', lookup(label))
@@ -95,6 +110,16 @@ def test_decode():
     assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
 
 
+def test_decode_legacy_cjk():
+    assert decode(b'\x87\x82\x87@ \xedB', "windows-31j") == (
+        "№① 鍈", lookup("shift-jis"))
+    assert decode(b'\xc7g\xc6\xf1\xc6\xfd\xc7g\xc6\xf1\xc6\xfd', "big5-hkscs") == (
+        "むかしむかし", lookup("big5"))
+    assert decode(b'\x8cc\xb9\xe6\xb0\xa2\xc7\xcf', "windows-949") == (
+        "똠방각하", lookup("euc-kr"))
+    assert decode(b'\x92w', 'big5') == ('㐵', lookup('big5'))
+
+
 def test_encode():
     assert encode('é', 'latin1') == b'\xe9'
     assert encode('é', 'utf8') == b'\xc3\xa9'