|
50 | 50 | import java.nio.charset.CharacterCodingException;
|
51 | 51 | import java.nio.charset.Charset;
|
52 | 52 | import java.nio.charset.CodingErrorAction;
|
| 53 | +import java.nio.charset.StandardCharsets; |
53 | 54 | import java.util.Arrays;
|
54 | 55 | import java.util.HashMap;
|
55 | 56 | import java.util.List;
|
|
83 | 84 | public class CodecsModuleBuiltins extends PythonBuiltins {
|
84 | 85 | public static final String DEFAULT_ENCODING = "utf-8";
|
85 | 86 |
|
| 87 | + private static final Charset UTF32 = Charset.forName("utf-32"); |
| 88 | + |
86 | 89 | // python to java codecs mapping
|
87 |
| - private static final Map<String, String> PY_CODECS_ALIASES = new HashMap<>(); |
| 90 | + private static final Map<String, Charset> CHARSET_MAP = new HashMap<>(); |
88 | 91 | static {
|
89 | 92 | // ascii
|
90 |
| - PY_CODECS_ALIASES.put("us-ascii", "us-ascii"); |
91 |
| - PY_CODECS_ALIASES.put("ascii", "us-ascii"); |
92 |
| - PY_CODECS_ALIASES.put("646", "us-ascii"); |
| 93 | + CHARSET_MAP.put("us-ascii", StandardCharsets.US_ASCII); |
| 94 | + CHARSET_MAP.put("ascii", StandardCharsets.US_ASCII); |
| 95 | + CHARSET_MAP.put("646", StandardCharsets.US_ASCII); |
93 | 96 |
|
94 | 97 | // latin 1
|
95 |
| - PY_CODECS_ALIASES.put("iso-8859-1", "iso-8859-1"); |
96 |
| - PY_CODECS_ALIASES.put("latin-1", "iso-8859-1"); |
97 |
| - PY_CODECS_ALIASES.put("latin_1", "iso-8859-1"); |
98 |
| - PY_CODECS_ALIASES.put("iso-8859-1", "iso-8859-1"); |
99 |
| - PY_CODECS_ALIASES.put("iso8859-1", "iso-8859-1"); |
100 |
| - PY_CODECS_ALIASES.put("8859", "iso-8859-1"); |
101 |
| - PY_CODECS_ALIASES.put("cp819", "iso-8859-1"); |
102 |
| - PY_CODECS_ALIASES.put("latin", "iso-8859-1"); |
103 |
| - PY_CODECS_ALIASES.put("latin1", "iso-8859-1"); |
104 |
| - PY_CODECS_ALIASES.put("L1", "iso-8859-1"); |
| 98 | + CHARSET_MAP.put("iso-8859-1", StandardCharsets.ISO_8859_1); |
| 99 | + CHARSET_MAP.put("latin-1", StandardCharsets.ISO_8859_1); |
| 100 | + CHARSET_MAP.put("latin_1", StandardCharsets.ISO_8859_1); |
| 101 | + CHARSET_MAP.put("iso8859-1", StandardCharsets.ISO_8859_1); |
| 102 | + CHARSET_MAP.put("8859", StandardCharsets.ISO_8859_1); |
| 103 | + CHARSET_MAP.put("cp819", StandardCharsets.ISO_8859_1); |
| 104 | + CHARSET_MAP.put("latin", StandardCharsets.ISO_8859_1); |
| 105 | + CHARSET_MAP.put("latin1", StandardCharsets.ISO_8859_1); |
| 106 | + CHARSET_MAP.put("L1", StandardCharsets.ISO_8859_1); |
105 | 107 |
|
106 | 108 | // utf-8
|
107 |
| - PY_CODECS_ALIASES.put("utf-8", "utf-8"); |
108 |
| - PY_CODECS_ALIASES.put("utf_8", "utf-8"); |
109 |
| - PY_CODECS_ALIASES.put("U8", "utf-8"); |
110 |
| - PY_CODECS_ALIASES.put("UTF", "utf-8"); |
111 |
| - PY_CODECS_ALIASES.put("utf8", "utf-8"); |
| 109 | + CHARSET_MAP.put("utf-8", StandardCharsets.UTF_8); |
| 110 | + CHARSET_MAP.put("utf_8", StandardCharsets.UTF_8); |
| 111 | + CHARSET_MAP.put("U8", StandardCharsets.UTF_8); |
| 112 | + CHARSET_MAP.put("UTF", StandardCharsets.UTF_8); |
| 113 | + CHARSET_MAP.put("utf8", StandardCharsets.UTF_8); |
112 | 114 |
|
113 | 115 | // utf-16
|
114 |
| - PY_CODECS_ALIASES.put("utf-16", "utf-16"); |
115 |
| - PY_CODECS_ALIASES.put("utf_16", "utf-16"); |
116 |
| - PY_CODECS_ALIASES.put("U16", "utf-16"); |
117 |
| - PY_CODECS_ALIASES.put("utf16", "utf-16"); |
| 116 | + CHARSET_MAP.put("utf-16", StandardCharsets.UTF_16); |
| 117 | + CHARSET_MAP.put("utf_16", StandardCharsets.UTF_16); |
| 118 | + CHARSET_MAP.put("U16", StandardCharsets.UTF_16); |
| 119 | + CHARSET_MAP.put("utf16", StandardCharsets.UTF_16); |
118 | 120 | // TODO BMP only
|
119 |
| - PY_CODECS_ALIASES.put("utf_16_be", "utf-16be"); |
120 |
| - PY_CODECS_ALIASES.put("utf_16_le", "utf-16le"); |
| 121 | + CHARSET_MAP.put("utf_16_be", StandardCharsets.UTF_16BE); |
| 122 | + CHARSET_MAP.put("utf_16_le", StandardCharsets.UTF_16LE); |
121 | 123 |
|
122 | 124 | // utf-32
|
123 |
| - PY_CODECS_ALIASES.put("utf-32", "utf-32"); |
124 |
| - PY_CODECS_ALIASES.put("utf_32", "utf-32"); |
125 |
| - PY_CODECS_ALIASES.put("U32", "utf-32"); |
126 |
| - PY_CODECS_ALIASES.put("utf_32_be", "utf-32be"); |
127 |
| - PY_CODECS_ALIASES.put("utf_32_le", "utf-32le"); |
128 |
| - PY_CODECS_ALIASES.put("utf32", "utf-32"); |
| 125 | + final Charset utf32be = Charset.forName("utf-32be"); |
| 126 | + final Charset utf32le = Charset.forName("utf-32le"); |
| 127 | + final Charset ibm437 = Charset.forName("IBM437"); |
| 128 | + |
| 129 | + CHARSET_MAP.put("utf-32", UTF32); |
| 130 | + CHARSET_MAP.put("utf_32", UTF32); |
| 131 | + CHARSET_MAP.put("U32", UTF32); |
| 132 | + CHARSET_MAP.put("utf-32be", utf32be); |
| 133 | + CHARSET_MAP.put("utf_32_be", utf32be); |
| 134 | + CHARSET_MAP.put("utf-32le", utf32le); |
| 135 | + CHARSET_MAP.put("utf_32_le", utf32le); |
| 136 | + CHARSET_MAP.put("utf32", UTF32); |
129 | 137 | // big5 big5-tw, csbig5 Traditional Chinese
|
130 | 138 | // big5hkscs big5-hkscs, hkscs Traditional Chinese
|
131 | 139 | // cp037 IBM037, IBM039 English
|
132 | 140 | // cp424 EBCDIC-CP-HE, IBM424 Hebrew
|
133 | 141 | // cp437 437, IBM437 English
|
134 |
| - PY_CODECS_ALIASES.put("IBM437", "IBM437"); |
135 |
| - PY_CODECS_ALIASES.put("IBM437 English", "IBM437"); |
136 |
| - PY_CODECS_ALIASES.put("437", "IBM437"); |
137 |
| - PY_CODECS_ALIASES.put("cp437", "IBM437"); |
| 142 | + CHARSET_MAP.put("IBM437", ibm437); |
| 143 | + CHARSET_MAP.put("IBM437 English", ibm437); |
| 144 | + CHARSET_MAP.put("437", ibm437); |
| 145 | + CHARSET_MAP.put("cp437", ibm437); |
138 | 146 | // cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe
|
139 | 147 | // cp720 Arabic
|
140 | 148 | // cp737 Greek
|
@@ -220,15 +228,7 @@ public class CodecsModuleBuiltins extends PythonBuiltins {
|
220 | 228 |
|
221 | 229 | @TruffleBoundary
|
222 | 230 | static Charset getCharset(String encoding) {
|
223 |
| - if (encoding == null) { |
224 |
| - return Charset.forName(DEFAULT_ENCODING); |
225 |
| - } else { |
226 |
| - String val = PY_CODECS_ALIASES.get(encoding); |
227 |
| - if (val != null) { |
228 |
| - return Charset.forName(val); |
229 |
| - } |
230 |
| - return Charset.forName(encoding); |
231 |
| - } |
| 231 | + return CHARSET_MAP.get(encoding); |
232 | 232 | }
|
233 | 233 |
|
234 | 234 | @Override
|
@@ -453,8 +453,7 @@ private Object[] encodeString(String self, String errors) {
|
453 | 453 | CodingErrorAction errorAction = convertCodingErrorAction(errors);
|
454 | 454 |
|
455 | 455 | try {
|
456 |
| - Charset charset = getCharset("utf-32"); |
457 |
| - ByteBuffer encoded = charset.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self)); |
| 456 | + ByteBuffer encoded = UTF32.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self)); |
458 | 457 | int n = encoded.remaining();
|
459 | 458 | ByteBuffer buf = ByteBuffer.allocate(n);
|
460 | 459 | assert n % Integer.BYTES == 0;
|
@@ -606,8 +605,7 @@ String decodeBytes(ByteBuffer bytes, String errors) {
|
606 | 605 | buf.putInt(val);
|
607 | 606 | }
|
608 | 607 | buf.flip();
|
609 |
| - Charset charset = getCharset("utf-32"); |
610 |
| - CharBuffer decoded = charset.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf); |
| 608 | + CharBuffer decoded = UTF32.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf); |
611 | 609 | return String.valueOf(decoded);
|
612 | 610 | } catch (CharacterCodingException e) {
|
613 | 611 | throw raise(UnicodeDecodeError, e);
|
|
0 commit comments