|
53 | 53 | import java.nio.charset.CoderResult;
|
54 | 54 | import java.nio.charset.CodingErrorAction;
|
55 | 55 | import java.nio.charset.StandardCharsets;
|
56 |
| -import java.util.HashMap; |
57 | 56 | import java.util.List;
|
58 |
| -import java.util.Map; |
59 | 57 |
|
60 | 58 | import com.oracle.graal.python.builtins.Builtin;
|
61 | 59 | import com.oracle.graal.python.builtins.CoreFunctions;
|
|
82 | 80 | import com.oracle.graal.python.nodes.util.CastToJavaStringNode;
|
83 | 81 | import com.oracle.graal.python.nodes.util.CastToJavaStringNodeGen;
|
84 | 82 | import com.oracle.graal.python.runtime.PythonCore;
|
| 83 | +import com.oracle.graal.python.util.CharsetMapping; |
85 | 84 | import com.oracle.truffle.api.CompilerDirectives;
|
86 | 85 | import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
|
87 | 86 | import com.oracle.truffle.api.dsl.Cached;
|
|
98 | 97 | public class CodecsModuleBuiltins extends PythonBuiltins {
|
99 | 98 | private static final Charset UTF32 = Charset.forName("utf-32");
|
100 | 99 |
|
101 |
| - // python to java codecs mapping |
102 |
| - private static final Map<String, Charset> CHARSET_MAP = new HashMap<>(); |
103 |
| - static { |
104 |
| - // ascii |
105 |
| - CHARSET_MAP.put("us-ascii", StandardCharsets.US_ASCII); |
106 |
| - CHARSET_MAP.put("ascii", StandardCharsets.US_ASCII); |
107 |
| - CHARSET_MAP.put("646", StandardCharsets.US_ASCII); |
108 |
| - |
109 |
| - // latin 1 |
110 |
| - CHARSET_MAP.put("iso-8859-1", StandardCharsets.ISO_8859_1); |
111 |
| - CHARSET_MAP.put("latin-1", StandardCharsets.ISO_8859_1); |
112 |
| - CHARSET_MAP.put("latin_1", StandardCharsets.ISO_8859_1); |
113 |
| - CHARSET_MAP.put("iso8859-1", StandardCharsets.ISO_8859_1); |
114 |
| - CHARSET_MAP.put("8859", StandardCharsets.ISO_8859_1); |
115 |
| - CHARSET_MAP.put("cp819", StandardCharsets.ISO_8859_1); |
116 |
| - CHARSET_MAP.put("latin", StandardCharsets.ISO_8859_1); |
117 |
| - CHARSET_MAP.put("latin1", StandardCharsets.ISO_8859_1); |
118 |
| - CHARSET_MAP.put("L1", StandardCharsets.ISO_8859_1); |
119 |
| - |
120 |
| - // utf-8 |
121 |
| - CHARSET_MAP.put("UTF-8", StandardCharsets.UTF_8); |
122 |
| - CHARSET_MAP.put("utf-8", StandardCharsets.UTF_8); |
123 |
| - CHARSET_MAP.put("utf_8", StandardCharsets.UTF_8); |
124 |
| - CHARSET_MAP.put("U8", StandardCharsets.UTF_8); |
125 |
| - CHARSET_MAP.put("UTF", StandardCharsets.UTF_8); |
126 |
| - CHARSET_MAP.put("utf8", StandardCharsets.UTF_8); |
127 |
| - |
128 |
| - // utf-16 |
129 |
| - CHARSET_MAP.put("utf-16", StandardCharsets.UTF_16); |
130 |
| - CHARSET_MAP.put("utf_16", StandardCharsets.UTF_16); |
131 |
| - CHARSET_MAP.put("U16", StandardCharsets.UTF_16); |
132 |
| - CHARSET_MAP.put("utf16", StandardCharsets.UTF_16); |
133 |
| - // TODO BMP only |
134 |
| - CHARSET_MAP.put("utf_16_be", StandardCharsets.UTF_16BE); |
135 |
| - CHARSET_MAP.put("utf_16_le", StandardCharsets.UTF_16LE); |
136 |
| - |
137 |
| - // utf-32 |
138 |
| - final Charset utf32be = Charset.forName("utf-32be"); |
139 |
| - final Charset utf32le = Charset.forName("utf-32le"); |
140 |
| - final Charset ibm437 = Charset.forName("IBM437"); |
141 |
| - |
142 |
| - CHARSET_MAP.put("utf-32", UTF32); |
143 |
| - CHARSET_MAP.put("utf_32", UTF32); |
144 |
| - CHARSET_MAP.put("U32", UTF32); |
145 |
| - CHARSET_MAP.put("utf-32be", utf32be); |
146 |
| - CHARSET_MAP.put("utf_32_be", utf32be); |
147 |
| - CHARSET_MAP.put("utf-32le", utf32le); |
148 |
| - CHARSET_MAP.put("utf_32_le", utf32le); |
149 |
| - CHARSET_MAP.put("utf32", UTF32); |
150 |
| - // big5 big5-tw, csbig5 Traditional Chinese |
151 |
| - // big5hkscs big5-hkscs, hkscs Traditional Chinese |
152 |
| - // cp037 IBM037, IBM039 English |
153 |
| - // cp424 EBCDIC-CP-HE, IBM424 Hebrew |
154 |
| - // cp437 437, IBM437 English |
155 |
| - CHARSET_MAP.put("IBM437", ibm437); |
156 |
| - CHARSET_MAP.put("IBM437 English", ibm437); |
157 |
| - CHARSET_MAP.put("437", ibm437); |
158 |
| - CHARSET_MAP.put("cp437", ibm437); |
159 |
| - // cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe |
160 |
| - // cp720 Arabic |
161 |
| - // cp737 Greek |
162 |
| - // cp775 IBM775 Baltic languages |
163 |
| - // cp850 850, IBM850 Western Europe |
164 |
| - // cp852 852, IBM852 Central and Eastern Europe |
165 |
| - // cp855 855, IBM855 Bulgarian, Byelorussian, Macedonian, Russian, Serbian |
166 |
| - // cp856 Hebrew |
167 |
| - // cp857 857, IBM857 Turkish |
168 |
| - // cp858 858, IBM858 Western Europe |
169 |
| - // cp860 860, IBM860 Portuguese |
170 |
| - // cp861 861, CP-IS, IBM861 Icelandic |
171 |
| - // cp862 862, IBM862 Hebrew |
172 |
| - // cp863 863, IBM863 Canadian |
173 |
| - // cp864 IBM864 Arabic |
174 |
| - // cp865 865, IBM865 Danish, Norwegian |
175 |
| - // cp866 866, IBM866 Russian |
176 |
| - // cp869 869, CP-GR, IBM869 Greek |
177 |
| - // cp874 Thai |
178 |
| - // cp875 Greek |
179 |
| - // cp932 932, ms932, mskanji, ms-kanji Japanese |
180 |
| - // cp949 949, ms949, uhc Korean |
181 |
| - // cp950 950, ms950 Traditional Chinese |
182 |
| - // cp1006 Urdu |
183 |
| - // cp1026 ibm1026 Turkish |
184 |
| - // cp1140 ibm1140 Western Europe |
185 |
| - // cp1250 windows-1250 Central and Eastern Europe |
186 |
| - // cp1251 windows-1251 Bulgarian, Byelorussian, Macedonian, Russian, Serbian |
187 |
| - // cp1252 windows-1252 Western Europe |
188 |
| - // cp1253 windows-1253 Greek |
189 |
| - // cp1254 windows-1254 Turkish |
190 |
| - // cp1255 windows-1255 Hebrew |
191 |
| - // cp1256 windows-1256 Arabic |
192 |
| - // cp1257 windows-1257 Baltic languages |
193 |
| - // cp1258 windows-1258 Vietnamese |
194 |
| - // euc_jp eucjp, ujis, u-jis Japanese |
195 |
| - // euc_jis_2004 jisx0213, eucjis2004 Japanese |
196 |
| - // euc_jisx0213 eucjisx0213 Japanese |
197 |
| - // euc_kr euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001 Korean |
198 |
| - // gb2312 chinese, csiso58gb231280, euc- cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80, |
199 |
| - // iso- ir-58 Simplified Chinese |
200 |
| - // gbk 936, cp936, ms936 Unified Chinese |
201 |
| - // gb18030 gb18030-2000 Unified Chinese |
202 |
| - // hz hzgb, hz-gb, hz-gb-2312 Simplified Chinese |
203 |
| - // iso2022_jp csiso2022jp, iso2022jp, iso-2022-jp Japanese |
204 |
| - // iso2022_jp_1 iso2022jp-1, iso-2022-jp-1 Japanese |
205 |
| - // iso2022_jp_2 iso2022jp-2, iso-2022-jp-2 Japanese, Korean, Simplified Chinese, Western |
206 |
| - // Europe, Greek |
207 |
| - // iso2022_jp_2004 iso2022jp-2004, iso-2022-jp-2004 Japanese |
208 |
| - // iso2022_jp_3 iso2022jp-3, iso-2022-jp-3 Japanese |
209 |
| - // iso2022_jp_ext iso2022jp-ext, iso-2022-jp-ext Japanese |
210 |
| - // iso2022_kr csiso2022kr, iso2022kr, iso-2022-kr Korean |
211 |
| - // iso8859_2 iso-8859-2, latin2, L2 Central and Eastern Europe |
212 |
| - // iso8859_3 iso-8859-3, latin3, L3 Esperanto, Maltese |
213 |
| - // iso8859_4 iso-8859-4, latin4, L4 Baltic languages |
214 |
| - // iso8859_5 iso-8859-5, cyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian |
215 |
| - // iso8859_6 iso-8859-6, arabic Arabic |
216 |
| - // iso8859_7 iso-8859-7, greek, greek8 Greek |
217 |
| - // iso8859_8 iso-8859-8, hebrew Hebrew |
218 |
| - // iso8859_9 iso-8859-9, latin5, L5 Turkish |
219 |
| - // iso8859_10 iso-8859-10, latin6, L6 Nordic languages |
220 |
| - // iso8859_11 iso-8859-11, thai Thai languages |
221 |
| - // iso8859_13 iso-8859-13, latin7, L7 Baltic languages |
222 |
| - // iso8859_14 iso-8859-14, latin8, L8 Celtic languages |
223 |
| - // iso8859_15 iso-8859-15, latin9, L9 Western Europe |
224 |
| - // iso8859_16 iso-8859-16, latin10, L10 South-Eastern Europe |
225 |
| - // johab cp1361, ms1361 Korean |
226 |
| - // koi8_r Russian |
227 |
| - // koi8_u Ukrainian |
228 |
| - // mac_cyrillic maccyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian |
229 |
| - // mac_greek macgreek Greek |
230 |
| - // mac_iceland maciceland Icelandic |
231 |
| - // mac_latin2 maclatin2, maccentraleurope Central and Eastern Europe |
232 |
| - // mac_roman macroman Western Europe |
233 |
| - // mac_turkish macturkish Turkish |
234 |
| - // ptcp154 csptcp154, pt154, cp154, cyrillic-asian Kazakh |
235 |
| - // shift_jis csshiftjis, shiftjis, sjis, s_jis Japanese |
236 |
| - // shift_jis_2004 shiftjis2004, sjis_2004, sjis2004 Japanese |
237 |
| - // shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213 Japanese |
238 |
| - // utf_7 U7, unicode-1-1-utf-7 all languages |
239 |
| - // utf_8_sig |
240 |
| - } |
241 |
| - |
242 |
| - @TruffleBoundary |
243 |
| - static Charset getCharset(String encoding) { |
244 |
| - return CHARSET_MAP.get(encoding); |
245 |
| - } |
246 |
| - |
247 | 100 | @Override
|
248 | 101 | protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFactories() {
|
249 | 102 | return CodecsModuleBuiltinsFactory.getFactories();
|
@@ -379,7 +232,7 @@ Object encode(Object str, @SuppressWarnings("unused") Object encoding, @Suppress
|
379 | 232 | @TruffleBoundary
|
380 | 233 | private PBytes encodeString(String self, String encoding, String errors) {
|
381 | 234 | CodingErrorAction errorAction = convertCodingErrorAction(errors);
|
382 |
| - Charset charset = getCharset(encoding); |
| 235 | + Charset charset = CharsetMapping.getCharset(encoding); |
383 | 236 | if (charset == null) {
|
384 | 237 | throw raise(LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding);
|
385 | 238 | }
|
@@ -503,7 +356,7 @@ private static ByteBuffer wrap(byte[] bytes) {
|
503 | 356 | @TruffleBoundary
|
504 | 357 | String decodeBytes(ByteBuffer byteBuffer, String encoding, String errors, boolean finalData) {
|
505 | 358 | CodingErrorAction errorAction = convertCodingErrorAction(errors);
|
506 |
| - Charset charset = getCharset(encoding); |
| 359 | + Charset charset = CharsetMapping.getCharset(encoding); |
507 | 360 | if (charset == null) {
|
508 | 361 | throw raise(LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding);
|
509 | 362 | }
|
@@ -625,7 +478,7 @@ abstract static class CodecsLookupNode extends PythonBuiltinNode {
|
625 | 478 | // This is replaced in the core _codecs.py with the full functionality
|
626 | 479 | @Specialization
|
627 | 480 | Object lookup(String encoding) {
|
628 |
| - if (getCharset(encoding) != null) { |
| 481 | + if (CharsetMapping.getCharset(encoding) != null) { |
629 | 482 | return true;
|
630 | 483 | } else {
|
631 | 484 | return PNone.NONE;
|
|
0 commit comments