Skip to content

Commit 20a5436

Browse files
eregon authored and timfel committed
Store Charset instances directly in the Map instead of doing lazy lookups through Charset.forName()
1 parent 6ebfdc6 commit 20a5436

File tree

1 file changed

+46
-48
lines changed

1 file changed

+46
-48
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 46 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import java.nio.charset.CharacterCodingException;
5151
import java.nio.charset.Charset;
5252
import java.nio.charset.CodingErrorAction;
53+
import java.nio.charset.StandardCharsets;
5354
import java.util.Arrays;
5455
import java.util.HashMap;
5556
import java.util.List;
@@ -83,58 +84,65 @@
8384
public class CodecsModuleBuiltins extends PythonBuiltins {
8485
public static final String DEFAULT_ENCODING = "utf-8";
8586

87+
private static final Charset UTF32 = Charset.forName("utf-32");
88+
8689
// python to java codecs mapping
87-
private static final Map<String, String> PY_CODECS_ALIASES = new HashMap<>();
90+
private static final Map<String, Charset> CHARSET_MAP = new HashMap<>();
8891
static {
8992
// ascii
90-
PY_CODECS_ALIASES.put("us-ascii", "us-ascii");
91-
PY_CODECS_ALIASES.put("ascii", "us-ascii");
92-
PY_CODECS_ALIASES.put("646", "us-ascii");
93+
CHARSET_MAP.put("us-ascii", StandardCharsets.US_ASCII);
94+
CHARSET_MAP.put("ascii", StandardCharsets.US_ASCII);
95+
CHARSET_MAP.put("646", StandardCharsets.US_ASCII);
9396

9497
// latin 1
95-
PY_CODECS_ALIASES.put("iso-8859-1", "iso-8859-1");
96-
PY_CODECS_ALIASES.put("latin-1", "iso-8859-1");
97-
PY_CODECS_ALIASES.put("latin_1", "iso-8859-1");
98-
PY_CODECS_ALIASES.put("iso-8859-1", "iso-8859-1");
99-
PY_CODECS_ALIASES.put("iso8859-1", "iso-8859-1");
100-
PY_CODECS_ALIASES.put("8859", "iso-8859-1");
101-
PY_CODECS_ALIASES.put("cp819", "iso-8859-1");
102-
PY_CODECS_ALIASES.put("latin", "iso-8859-1");
103-
PY_CODECS_ALIASES.put("latin1", "iso-8859-1");
104-
PY_CODECS_ALIASES.put("L1", "iso-8859-1");
98+
CHARSET_MAP.put("iso-8859-1", StandardCharsets.ISO_8859_1);
99+
CHARSET_MAP.put("latin-1", StandardCharsets.ISO_8859_1);
100+
CHARSET_MAP.put("latin_1", StandardCharsets.ISO_8859_1);
101+
CHARSET_MAP.put("iso8859-1", StandardCharsets.ISO_8859_1);
102+
CHARSET_MAP.put("8859", StandardCharsets.ISO_8859_1);
103+
CHARSET_MAP.put("cp819", StandardCharsets.ISO_8859_1);
104+
CHARSET_MAP.put("latin", StandardCharsets.ISO_8859_1);
105+
CHARSET_MAP.put("latin1", StandardCharsets.ISO_8859_1);
106+
CHARSET_MAP.put("L1", StandardCharsets.ISO_8859_1);
105107

106108
// utf-8
107-
PY_CODECS_ALIASES.put("utf-8", "utf-8");
108-
PY_CODECS_ALIASES.put("utf_8", "utf-8");
109-
PY_CODECS_ALIASES.put("U8", "utf-8");
110-
PY_CODECS_ALIASES.put("UTF", "utf-8");
111-
PY_CODECS_ALIASES.put("utf8", "utf-8");
109+
CHARSET_MAP.put("utf-8", StandardCharsets.UTF_8);
110+
CHARSET_MAP.put("utf_8", StandardCharsets.UTF_8);
111+
CHARSET_MAP.put("U8", StandardCharsets.UTF_8);
112+
CHARSET_MAP.put("UTF", StandardCharsets.UTF_8);
113+
CHARSET_MAP.put("utf8", StandardCharsets.UTF_8);
112114

113115
// utf-16
114-
PY_CODECS_ALIASES.put("utf-16", "utf-16");
115-
PY_CODECS_ALIASES.put("utf_16", "utf-16");
116-
PY_CODECS_ALIASES.put("U16", "utf-16");
117-
PY_CODECS_ALIASES.put("utf16", "utf-16");
116+
CHARSET_MAP.put("utf-16", StandardCharsets.UTF_16);
117+
CHARSET_MAP.put("utf_16", StandardCharsets.UTF_16);
118+
CHARSET_MAP.put("U16", StandardCharsets.UTF_16);
119+
CHARSET_MAP.put("utf16", StandardCharsets.UTF_16);
118120
// TODO BMP only
119-
PY_CODECS_ALIASES.put("utf_16_be", "utf-16be");
120-
PY_CODECS_ALIASES.put("utf_16_le", "utf-16le");
121+
CHARSET_MAP.put("utf_16_be", StandardCharsets.UTF_16BE);
122+
CHARSET_MAP.put("utf_16_le", StandardCharsets.UTF_16LE);
121123

122124
// utf-32
123-
PY_CODECS_ALIASES.put("utf-32", "utf-32");
124-
PY_CODECS_ALIASES.put("utf_32", "utf-32");
125-
PY_CODECS_ALIASES.put("U32", "utf-32");
126-
PY_CODECS_ALIASES.put("utf_32_be", "utf-32be");
127-
PY_CODECS_ALIASES.put("utf_32_le", "utf-32le");
128-
PY_CODECS_ALIASES.put("utf32", "utf-32");
125+
final Charset utf32be = Charset.forName("utf-32be");
126+
final Charset utf32le = Charset.forName("utf-32le");
127+
final Charset ibm437 = Charset.forName("IBM437");
128+
129+
CHARSET_MAP.put("utf-32", UTF32);
130+
CHARSET_MAP.put("utf_32", UTF32);
131+
CHARSET_MAP.put("U32", UTF32);
132+
CHARSET_MAP.put("utf-32be", utf32be);
133+
CHARSET_MAP.put("utf_32_be", utf32be);
134+
CHARSET_MAP.put("utf-32le", utf32le);
135+
CHARSET_MAP.put("utf_32_le", utf32le);
136+
CHARSET_MAP.put("utf32", UTF32);
129137
// big5 big5-tw, csbig5 Traditional Chinese
130138
// big5hkscs big5-hkscs, hkscs Traditional Chinese
131139
// cp037 IBM037, IBM039 English
132140
// cp424 EBCDIC-CP-HE, IBM424 Hebrew
133141
// cp437 437, IBM437 English
134-
PY_CODECS_ALIASES.put("IBM437", "IBM437");
135-
PY_CODECS_ALIASES.put("IBM437 English", "IBM437");
136-
PY_CODECS_ALIASES.put("437", "IBM437");
137-
PY_CODECS_ALIASES.put("cp437", "IBM437");
142+
CHARSET_MAP.put("IBM437", ibm437);
143+
CHARSET_MAP.put("IBM437 English", ibm437);
144+
CHARSET_MAP.put("437", ibm437);
145+
CHARSET_MAP.put("cp437", ibm437);
138146
// cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe
139147
// cp720 Arabic
140148
// cp737 Greek
@@ -220,15 +228,7 @@ public class CodecsModuleBuiltins extends PythonBuiltins {
220228

221229
@TruffleBoundary
222230
static Charset getCharset(String encoding) {
223-
if (encoding == null) {
224-
return Charset.forName(DEFAULT_ENCODING);
225-
} else {
226-
String val = PY_CODECS_ALIASES.get(encoding);
227-
if (val != null) {
228-
return Charset.forName(val);
229-
}
230-
return Charset.forName(encoding);
231-
}
231+
return CHARSET_MAP.get(encoding);
232232
}
233233

234234
@Override
@@ -453,8 +453,7 @@ private Object[] encodeString(String self, String errors) {
453453
CodingErrorAction errorAction = convertCodingErrorAction(errors);
454454

455455
try {
456-
Charset charset = getCharset("utf-32");
457-
ByteBuffer encoded = charset.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
456+
ByteBuffer encoded = UTF32.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
458457
int n = encoded.remaining();
459458
ByteBuffer buf = ByteBuffer.allocate(n);
460459
assert n % Integer.BYTES == 0;
@@ -606,8 +605,7 @@ String decodeBytes(ByteBuffer bytes, String errors) {
606605
buf.putInt(val);
607606
}
608607
buf.flip();
609-
Charset charset = getCharset("utf-32");
610-
CharBuffer decoded = charset.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf);
608+
CharBuffer decoded = UTF32.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf);
611609
return String.valueOf(decoded);
612610
} catch (CharacterCodingException e) {
613611
throw raise(UnicodeDecodeError, e);

0 commit comments

Comments
 (0)