Skip to content

Commit 46aa200

Browse files
committed
Extract charset mapping to a utility class
1 parent 0ca1527 commit 46aa200

File tree

2 files changed

+164
-151
lines changed

2 files changed

+164
-151
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 4 additions & 151 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,7 @@
5353
import java.nio.charset.CoderResult;
5454
import java.nio.charset.CodingErrorAction;
5555
import java.nio.charset.StandardCharsets;
56-
import java.util.HashMap;
5756
import java.util.List;
58-
import java.util.Map;
5957

6058
import com.oracle.graal.python.builtins.Builtin;
6159
import com.oracle.graal.python.builtins.CoreFunctions;
@@ -82,6 +80,7 @@
8280
import com.oracle.graal.python.nodes.util.CastToJavaStringNode;
8381
import com.oracle.graal.python.nodes.util.CastToJavaStringNodeGen;
8482
import com.oracle.graal.python.runtime.PythonCore;
83+
import com.oracle.graal.python.util.CharsetMapping;
8584
import com.oracle.truffle.api.CompilerDirectives;
8685
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
8786
import com.oracle.truffle.api.dsl.Cached;
@@ -98,152 +97,6 @@
9897
public class CodecsModuleBuiltins extends PythonBuiltins {
9998
private static final Charset UTF32 = Charset.forName("utf-32");
10099

101-
// python to java codecs mapping
102-
private static final Map<String, Charset> CHARSET_MAP = new HashMap<>();
103-
static {
104-
// ascii
105-
CHARSET_MAP.put("us-ascii", StandardCharsets.US_ASCII);
106-
CHARSET_MAP.put("ascii", StandardCharsets.US_ASCII);
107-
CHARSET_MAP.put("646", StandardCharsets.US_ASCII);
108-
109-
// latin 1
110-
CHARSET_MAP.put("iso-8859-1", StandardCharsets.ISO_8859_1);
111-
CHARSET_MAP.put("latin-1", StandardCharsets.ISO_8859_1);
112-
CHARSET_MAP.put("latin_1", StandardCharsets.ISO_8859_1);
113-
CHARSET_MAP.put("iso8859-1", StandardCharsets.ISO_8859_1);
114-
CHARSET_MAP.put("8859", StandardCharsets.ISO_8859_1);
115-
CHARSET_MAP.put("cp819", StandardCharsets.ISO_8859_1);
116-
CHARSET_MAP.put("latin", StandardCharsets.ISO_8859_1);
117-
CHARSET_MAP.put("latin1", StandardCharsets.ISO_8859_1);
118-
CHARSET_MAP.put("L1", StandardCharsets.ISO_8859_1);
119-
120-
// utf-8
121-
CHARSET_MAP.put("UTF-8", StandardCharsets.UTF_8);
122-
CHARSET_MAP.put("utf-8", StandardCharsets.UTF_8);
123-
CHARSET_MAP.put("utf_8", StandardCharsets.UTF_8);
124-
CHARSET_MAP.put("U8", StandardCharsets.UTF_8);
125-
CHARSET_MAP.put("UTF", StandardCharsets.UTF_8);
126-
CHARSET_MAP.put("utf8", StandardCharsets.UTF_8);
127-
128-
// utf-16
129-
CHARSET_MAP.put("utf-16", StandardCharsets.UTF_16);
130-
CHARSET_MAP.put("utf_16", StandardCharsets.UTF_16);
131-
CHARSET_MAP.put("U16", StandardCharsets.UTF_16);
132-
CHARSET_MAP.put("utf16", StandardCharsets.UTF_16);
133-
// TODO BMP only
134-
CHARSET_MAP.put("utf_16_be", StandardCharsets.UTF_16BE);
135-
CHARSET_MAP.put("utf_16_le", StandardCharsets.UTF_16LE);
136-
137-
// utf-32
138-
final Charset utf32be = Charset.forName("utf-32be");
139-
final Charset utf32le = Charset.forName("utf-32le");
140-
final Charset ibm437 = Charset.forName("IBM437");
141-
142-
CHARSET_MAP.put("utf-32", UTF32);
143-
CHARSET_MAP.put("utf_32", UTF32);
144-
CHARSET_MAP.put("U32", UTF32);
145-
CHARSET_MAP.put("utf-32be", utf32be);
146-
CHARSET_MAP.put("utf_32_be", utf32be);
147-
CHARSET_MAP.put("utf-32le", utf32le);
148-
CHARSET_MAP.put("utf_32_le", utf32le);
149-
CHARSET_MAP.put("utf32", UTF32);
150-
// big5 big5-tw, csbig5 Traditional Chinese
151-
// big5hkscs big5-hkscs, hkscs Traditional Chinese
152-
// cp037 IBM037, IBM039 English
153-
// cp424 EBCDIC-CP-HE, IBM424 Hebrew
154-
// cp437 437, IBM437 English
155-
CHARSET_MAP.put("IBM437", ibm437);
156-
CHARSET_MAP.put("IBM437 English", ibm437);
157-
CHARSET_MAP.put("437", ibm437);
158-
CHARSET_MAP.put("cp437", ibm437);
159-
// cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe
160-
// cp720 Arabic
161-
// cp737 Greek
162-
// cp775 IBM775 Baltic languages
163-
// cp850 850, IBM850 Western Europe
164-
// cp852 852, IBM852 Central and Eastern Europe
165-
// cp855 855, IBM855 Bulgarian, Byelorussian, Macedonian, Russian, Serbian
166-
// cp856 Hebrew
167-
// cp857 857, IBM857 Turkish
168-
// cp858 858, IBM858 Western Europe
169-
// cp860 860, IBM860 Portuguese
170-
// cp861 861, CP-IS, IBM861 Icelandic
171-
// cp862 862, IBM862 Hebrew
172-
// cp863 863, IBM863 Canadian
173-
// cp864 IBM864 Arabic
174-
// cp865 865, IBM865 Danish, Norwegian
175-
// cp866 866, IBM866 Russian
176-
// cp869 869, CP-GR, IBM869 Greek
177-
// cp874 Thai
178-
// cp875 Greek
179-
// cp932 932, ms932, mskanji, ms-kanji Japanese
180-
// cp949 949, ms949, uhc Korean
181-
// cp950 950, ms950 Traditional Chinese
182-
// cp1006 Urdu
183-
// cp1026 ibm1026 Turkish
184-
// cp1140 ibm1140 Western Europe
185-
// cp1250 windows-1250 Central and Eastern Europe
186-
// cp1251 windows-1251 Bulgarian, Byelorussian, Macedonian, Russian, Serbian
187-
// cp1252 windows-1252 Western Europe
188-
// cp1253 windows-1253 Greek
189-
// cp1254 windows-1254 Turkish
190-
// cp1255 windows-1255 Hebrew
191-
// cp1256 windows-1256 Arabic
192-
// cp1257 windows-1257 Baltic languages
193-
// cp1258 windows-1258 Vietnamese
194-
// euc_jp eucjp, ujis, u-jis Japanese
195-
// euc_jis_2004 jisx0213, eucjis2004 Japanese
196-
// euc_jisx0213 eucjisx0213 Japanese
197-
// euc_kr euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001 Korean
198-
// gb2312 chinese, csiso58gb231280, euc- cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80,
199-
// iso- ir-58 Simplified Chinese
200-
// gbk 936, cp936, ms936 Unified Chinese
201-
// gb18030 gb18030-2000 Unified Chinese
202-
// hz hzgb, hz-gb, hz-gb-2312 Simplified Chinese
203-
// iso2022_jp csiso2022jp, iso2022jp, iso-2022-jp Japanese
204-
// iso2022_jp_1 iso2022jp-1, iso-2022-jp-1 Japanese
205-
// iso2022_jp_2 iso2022jp-2, iso-2022-jp-2 Japanese, Korean, Simplified Chinese, Western
206-
// Europe, Greek
207-
// iso2022_jp_2004 iso2022jp-2004, iso-2022-jp-2004 Japanese
208-
// iso2022_jp_3 iso2022jp-3, iso-2022-jp-3 Japanese
209-
// iso2022_jp_ext iso2022jp-ext, iso-2022-jp-ext Japanese
210-
// iso2022_kr csiso2022kr, iso2022kr, iso-2022-kr Korean
211-
// iso8859_2 iso-8859-2, latin2, L2 Central and Eastern Europe
212-
// iso8859_3 iso-8859-3, latin3, L3 Esperanto, Maltese
213-
// iso8859_4 iso-8859-4, latin4, L4 Baltic languages
214-
// iso8859_5 iso-8859-5, cyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian
215-
// iso8859_6 iso-8859-6, arabic Arabic
216-
// iso8859_7 iso-8859-7, greek, greek8 Greek
217-
// iso8859_8 iso-8859-8, hebrew Hebrew
218-
// iso8859_9 iso-8859-9, latin5, L5 Turkish
219-
// iso8859_10 iso-8859-10, latin6, L6 Nordic languages
220-
// iso8859_11 iso-8859-11, thai Thai languages
221-
// iso8859_13 iso-8859-13, latin7, L7 Baltic languages
222-
// iso8859_14 iso-8859-14, latin8, L8 Celtic languages
223-
// iso8859_15 iso-8859-15, latin9, L9 Western Europe
224-
// iso8859_16 iso-8859-16, latin10, L10 South-Eastern Europe
225-
// johab cp1361, ms1361 Korean
226-
// koi8_r Russian
227-
// koi8_u Ukrainian
228-
// mac_cyrillic maccyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian
229-
// mac_greek macgreek Greek
230-
// mac_iceland maciceland Icelandic
231-
// mac_latin2 maclatin2, maccentraleurope Central and Eastern Europe
232-
// mac_roman macroman Western Europe
233-
// mac_turkish macturkish Turkish
234-
// ptcp154 csptcp154, pt154, cp154, cyrillic-asian Kazakh
235-
// shift_jis csshiftjis, shiftjis, sjis, s_jis Japanese
236-
// shift_jis_2004 shiftjis2004, sjis_2004, sjis2004 Japanese
237-
// shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213 Japanese
238-
// utf_7 U7, unicode-1-1-utf-7 all languages
239-
// utf_8_sig
240-
}
241-
242-
@TruffleBoundary
243-
static Charset getCharset(String encoding) {
244-
return CHARSET_MAP.get(encoding);
245-
}
246-
247100
@Override
248101
protected List<? extends NodeFactory<? extends PythonBuiltinBaseNode>> getNodeFactories() {
249102
return CodecsModuleBuiltinsFactory.getFactories();
@@ -379,7 +232,7 @@ Object encode(Object str, @SuppressWarnings("unused") Object encoding, @Suppress
379232
@TruffleBoundary
380233
private PBytes encodeString(String self, String encoding, String errors) {
381234
CodingErrorAction errorAction = convertCodingErrorAction(errors);
382-
Charset charset = getCharset(encoding);
235+
Charset charset = CharsetMapping.getCharset(encoding);
383236
if (charset == null) {
384237
throw raise(LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding);
385238
}
@@ -503,7 +356,7 @@ private static ByteBuffer wrap(byte[] bytes) {
503356
@TruffleBoundary
504357
String decodeBytes(ByteBuffer byteBuffer, String encoding, String errors, boolean finalData) {
505358
CodingErrorAction errorAction = convertCodingErrorAction(errors);
506-
Charset charset = getCharset(encoding);
359+
Charset charset = CharsetMapping.getCharset(encoding);
507360
if (charset == null) {
508361
throw raise(LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding);
509362
}
@@ -625,7 +478,7 @@ abstract static class CodecsLookupNode extends PythonBuiltinNode {
625478
// This is replaced in the core _codecs.py with the full functionality
626479
@Specialization
627480
Object lookup(String encoding) {
628-
if (getCharset(encoding) != null) {
481+
if (CharsetMapping.getCharset(encoding) != null) {
629482
return true;
630483
} else {
631484
return PNone.NONE;
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
package com.oracle.graal.python.util;
2+
3+
import java.nio.charset.Charset;
4+
import java.nio.charset.StandardCharsets;
5+
import java.util.HashMap;
6+
import java.util.Map;
7+
8+
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
9+
10+
/**
11+
* Utility class for mapping Python encodings to Java charsets
12+
*/
13+
public class CharsetMapping {
14+
private static final Map<String, Charset> CHARSET_MAP = new HashMap<>();
15+
16+
@TruffleBoundary
17+
public static Charset getCharset(String encoding) {
18+
return CHARSET_MAP.get(encoding);
19+
}
20+
21+
static {
22+
// ascii
23+
CHARSET_MAP.put("us-ascii", StandardCharsets.US_ASCII);
24+
CHARSET_MAP.put("ascii", StandardCharsets.US_ASCII);
25+
CHARSET_MAP.put("646", StandardCharsets.US_ASCII);
26+
27+
// latin 1
28+
CHARSET_MAP.put("iso-8859-1", StandardCharsets.ISO_8859_1);
29+
CHARSET_MAP.put("latin-1", StandardCharsets.ISO_8859_1);
30+
CHARSET_MAP.put("latin_1", StandardCharsets.ISO_8859_1);
31+
CHARSET_MAP.put("iso8859-1", StandardCharsets.ISO_8859_1);
32+
CHARSET_MAP.put("8859", StandardCharsets.ISO_8859_1);
33+
CHARSET_MAP.put("cp819", StandardCharsets.ISO_8859_1);
34+
CHARSET_MAP.put("latin", StandardCharsets.ISO_8859_1);
35+
CHARSET_MAP.put("latin1", StandardCharsets.ISO_8859_1);
36+
CHARSET_MAP.put("L1", StandardCharsets.ISO_8859_1);
37+
38+
// utf-8
39+
CHARSET_MAP.put("UTF-8", StandardCharsets.UTF_8);
40+
CHARSET_MAP.put("utf-8", StandardCharsets.UTF_8);
41+
CHARSET_MAP.put("utf_8", StandardCharsets.UTF_8);
42+
CHARSET_MAP.put("U8", StandardCharsets.UTF_8);
43+
CHARSET_MAP.put("UTF", StandardCharsets.UTF_8);
44+
CHARSET_MAP.put("utf8", StandardCharsets.UTF_8);
45+
46+
// utf-16
47+
CHARSET_MAP.put("utf-16", StandardCharsets.UTF_16);
48+
CHARSET_MAP.put("utf_16", StandardCharsets.UTF_16);
49+
CHARSET_MAP.put("U16", StandardCharsets.UTF_16);
50+
CHARSET_MAP.put("utf16", StandardCharsets.UTF_16);
51+
// TODO BMP only
52+
CHARSET_MAP.put("utf_16_be", StandardCharsets.UTF_16BE);
53+
CHARSET_MAP.put("utf_16_le", StandardCharsets.UTF_16LE);
54+
55+
// utf-32
56+
final Charset utf32 = Charset.forName("utf-32");
57+
final Charset utf32be = Charset.forName("utf-32be");
58+
final Charset utf32le = Charset.forName("utf-32le");
59+
final Charset ibm437 = Charset.forName("IBM437");
60+
61+
CHARSET_MAP.put("utf-32", utf32);
62+
CHARSET_MAP.put("utf_32", utf32);
63+
CHARSET_MAP.put("U32", utf32);
64+
CHARSET_MAP.put("utf-32be", utf32be);
65+
CHARSET_MAP.put("utf_32_be", utf32be);
66+
CHARSET_MAP.put("utf-32le", utf32le);
67+
CHARSET_MAP.put("utf_32_le", utf32le);
68+
CHARSET_MAP.put("utf32", utf32);
69+
// big5 big5-tw, csbig5 Traditional Chinese
70+
// big5hkscs big5-hkscs, hkscs Traditional Chinese
71+
// cp037 IBM037, IBM039 English
72+
// cp424 EBCDIC-CP-HE, IBM424 Hebrew
73+
// cp437 437, IBM437 English
74+
CHARSET_MAP.put("IBM437", ibm437);
75+
CHARSET_MAP.put("IBM437 English", ibm437);
76+
CHARSET_MAP.put("437", ibm437);
77+
CHARSET_MAP.put("cp437", ibm437);
78+
// cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe
79+
// cp720 Arabic
80+
// cp737 Greek
81+
// cp775 IBM775 Baltic languages
82+
// cp850 850, IBM850 Western Europe
83+
// cp852 852, IBM852 Central and Eastern Europe
84+
// cp855 855, IBM855 Bulgarian, Byelorussian, Macedonian, Russian, Serbian
85+
// cp856 Hebrew
86+
// cp857 857, IBM857 Turkish
87+
// cp858 858, IBM858 Western Europe
88+
// cp860 860, IBM860 Portuguese
89+
// cp861 861, CP-IS, IBM861 Icelandic
90+
// cp862 862, IBM862 Hebrew
91+
// cp863 863, IBM863 Canadian
92+
// cp864 IBM864 Arabic
93+
// cp865 865, IBM865 Danish, Norwegian
94+
// cp866 866, IBM866 Russian
95+
// cp869 869, CP-GR, IBM869 Greek
96+
// cp874 Thai
97+
// cp875 Greek
98+
// cp932 932, ms932, mskanji, ms-kanji Japanese
99+
// cp949 949, ms949, uhc Korean
100+
// cp950 950, ms950 Traditional Chinese
101+
// cp1006 Urdu
102+
// cp1026 ibm1026 Turkish
103+
// cp1140 ibm1140 Western Europe
104+
// cp1250 windows-1250 Central and Eastern Europe
105+
// cp1251 windows-1251 Bulgarian, Byelorussian, Macedonian, Russian, Serbian
106+
// cp1252 windows-1252 Western Europe
107+
// cp1253 windows-1253 Greek
108+
// cp1254 windows-1254 Turkish
109+
// cp1255 windows-1255 Hebrew
110+
// cp1256 windows-1256 Arabic
111+
// cp1257 windows-1257 Baltic languages
112+
// cp1258 windows-1258 Vietnamese
113+
// euc_jp eucjp, ujis, u-jis Japanese
114+
// euc_jis_2004 jisx0213, eucjis2004 Japanese
115+
// euc_jisx0213 eucjisx0213 Japanese
116+
// euc_kr euckr, korean, ksc5601, ks_c-5601, ks_c-5601-1987, ksx1001, ks_x-1001 Korean
117+
// gb2312 chinese, csiso58gb231280, euc- cn, euccn, eucgb2312-cn, gb2312-1980, gb2312-80,
118+
// iso- ir-58 Simplified Chinese
119+
// gbk 936, cp936, ms936 Unified Chinese
120+
// gb18030 gb18030-2000 Unified Chinese
121+
// hz hzgb, hz-gb, hz-gb-2312 Simplified Chinese
122+
// iso2022_jp csiso2022jp, iso2022jp, iso-2022-jp Japanese
123+
// iso2022_jp_1 iso2022jp-1, iso-2022-jp-1 Japanese
124+
// iso2022_jp_2 iso2022jp-2, iso-2022-jp-2 Japanese, Korean, Simplified Chinese, Western
125+
// Europe, Greek
126+
// iso2022_jp_2004 iso2022jp-2004, iso-2022-jp-2004 Japanese
127+
// iso2022_jp_3 iso2022jp-3, iso-2022-jp-3 Japanese
128+
// iso2022_jp_ext iso2022jp-ext, iso-2022-jp-ext Japanese
129+
// iso2022_kr csiso2022kr, iso2022kr, iso-2022-kr Korean
130+
// iso8859_2 iso-8859-2, latin2, L2 Central and Eastern Europe
131+
// iso8859_3 iso-8859-3, latin3, L3 Esperanto, Maltese
132+
// iso8859_4 iso-8859-4, latin4, L4 Baltic languages
133+
// iso8859_5 iso-8859-5, cyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian
134+
// iso8859_6 iso-8859-6, arabic Arabic
135+
// iso8859_7 iso-8859-7, greek, greek8 Greek
136+
// iso8859_8 iso-8859-8, hebrew Hebrew
137+
// iso8859_9 iso-8859-9, latin5, L5 Turkish
138+
// iso8859_10 iso-8859-10, latin6, L6 Nordic languages
139+
// iso8859_11 iso-8859-11, thai Thai languages
140+
// iso8859_13 iso-8859-13, latin7, L7 Baltic languages
141+
// iso8859_14 iso-8859-14, latin8, L8 Celtic languages
142+
// iso8859_15 iso-8859-15, latin9, L9 Western Europe
143+
// iso8859_16 iso-8859-16, latin10, L10 South-Eastern Europe
144+
// johab cp1361, ms1361 Korean
145+
// koi8_r Russian
146+
// koi8_u Ukrainian
147+
// mac_cyrillic maccyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian
148+
// mac_greek macgreek Greek
149+
// mac_iceland maciceland Icelandic
150+
// mac_latin2 maclatin2, maccentraleurope Central and Eastern Europe
151+
// mac_roman macroman Western Europe
152+
// mac_turkish macturkish Turkish
153+
// ptcp154 csptcp154, pt154, cp154, cyrillic-asian Kazakh
154+
// shift_jis csshiftjis, shiftjis, sjis, s_jis Japanese
155+
// shift_jis_2004 shiftjis2004, sjis_2004, sjis2004 Japanese
156+
// shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213 Japanese
157+
// utf_7 U7, unicode-1-1-utf-7 all languages
158+
// utf_8_sig
159+
}
160+
}

0 commit comments

Comments
 (0)