Skip to content

Commit f3c7a51

Browse files
committed
[GR-15586] Store Charset instances statically
PullRequest: graalpython/507
2 parents e144f47 + 31e5248 commit f3c7a51

File tree

2 files changed

+69
-59
lines changed

2 files changed

+69
-59
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 60 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@
5050
import java.nio.charset.CharacterCodingException;
5151
import java.nio.charset.Charset;
5252
import java.nio.charset.CodingErrorAction;
53+
import java.nio.charset.IllegalCharsetNameException;
54+
import java.nio.charset.StandardCharsets;
55+
import java.nio.charset.UnsupportedCharsetException;
5356
import java.util.Arrays;
5457
import java.util.HashMap;
5558
import java.util.List;
@@ -82,60 +85,65 @@
8285

8386
@CoreFunctions(defineModule = "_codecs")
8487
public class CodecsModuleBuiltins extends PythonBuiltins {
85-
public static final String DEFAULT_ENCODING = "utf-8";
88+
private static final Charset UTF32 = Charset.forName("utf-32");
8689

8790
// python to java codecs mapping
88-
private static final Map<String, String> PY_CODECS_ALIASES = new HashMap<>();
91+
private static final Map<String, Charset> CHARSET_MAP = new HashMap<>();
8992
static {
9093
// ascii
91-
PY_CODECS_ALIASES.put("us-ascii", "us-ascii");
92-
PY_CODECS_ALIASES.put("ascii", "us-ascii");
93-
PY_CODECS_ALIASES.put("646", "us-ascii");
94+
CHARSET_MAP.put("us-ascii", StandardCharsets.US_ASCII);
95+
CHARSET_MAP.put("ascii", StandardCharsets.US_ASCII);
96+
CHARSET_MAP.put("646", StandardCharsets.US_ASCII);
9497

9598
// latin 1
96-
PY_CODECS_ALIASES.put("iso-8859-1", "iso-8859-1");
97-
PY_CODECS_ALIASES.put("latin-1", "iso-8859-1");
98-
PY_CODECS_ALIASES.put("latin_1", "iso-8859-1");
99-
PY_CODECS_ALIASES.put("iso-8859-1", "iso-8859-1");
100-
PY_CODECS_ALIASES.put("iso8859-1", "iso-8859-1");
101-
PY_CODECS_ALIASES.put("8859", "iso-8859-1");
102-
PY_CODECS_ALIASES.put("cp819", "iso-8859-1");
103-
PY_CODECS_ALIASES.put("latin", "iso-8859-1");
104-
PY_CODECS_ALIASES.put("latin1", "iso-8859-1");
105-
PY_CODECS_ALIASES.put("L1", "iso-8859-1");
99+
CHARSET_MAP.put("iso-8859-1", StandardCharsets.ISO_8859_1);
100+
CHARSET_MAP.put("latin-1", StandardCharsets.ISO_8859_1);
101+
CHARSET_MAP.put("latin_1", StandardCharsets.ISO_8859_1);
102+
CHARSET_MAP.put("iso8859-1", StandardCharsets.ISO_8859_1);
103+
CHARSET_MAP.put("8859", StandardCharsets.ISO_8859_1);
104+
CHARSET_MAP.put("cp819", StandardCharsets.ISO_8859_1);
105+
CHARSET_MAP.put("latin", StandardCharsets.ISO_8859_1);
106+
CHARSET_MAP.put("latin1", StandardCharsets.ISO_8859_1);
107+
CHARSET_MAP.put("L1", StandardCharsets.ISO_8859_1);
106108

107109
// utf-8
108-
PY_CODECS_ALIASES.put("utf-8", "utf-8");
109-
PY_CODECS_ALIASES.put("utf_8", "utf-8");
110-
PY_CODECS_ALIASES.put("U8", "utf-8");
111-
PY_CODECS_ALIASES.put("UTF", "utf-8");
112-
PY_CODECS_ALIASES.put("utf8", "utf-8");
110+
CHARSET_MAP.put("utf-8", StandardCharsets.UTF_8);
111+
CHARSET_MAP.put("utf_8", StandardCharsets.UTF_8);
112+
CHARSET_MAP.put("U8", StandardCharsets.UTF_8);
113+
CHARSET_MAP.put("UTF", StandardCharsets.UTF_8);
114+
CHARSET_MAP.put("utf8", StandardCharsets.UTF_8);
113115

114116
// utf-16
115-
PY_CODECS_ALIASES.put("utf-16", "utf-16");
116-
PY_CODECS_ALIASES.put("utf_16", "utf-16");
117-
PY_CODECS_ALIASES.put("U16", "utf-16");
118-
PY_CODECS_ALIASES.put("utf16", "utf-16");
117+
CHARSET_MAP.put("utf-16", StandardCharsets.UTF_16);
118+
CHARSET_MAP.put("utf_16", StandardCharsets.UTF_16);
119+
CHARSET_MAP.put("U16", StandardCharsets.UTF_16);
120+
CHARSET_MAP.put("utf16", StandardCharsets.UTF_16);
119121
// TODO BMP only
120-
PY_CODECS_ALIASES.put("utf_16_be", "utf-16be");
121-
PY_CODECS_ALIASES.put("utf_16_le", "utf-16le");
122+
CHARSET_MAP.put("utf_16_be", StandardCharsets.UTF_16BE);
123+
CHARSET_MAP.put("utf_16_le", StandardCharsets.UTF_16LE);
122124

123125
// utf-32
124-
PY_CODECS_ALIASES.put("utf-32", "utf-32");
125-
PY_CODECS_ALIASES.put("utf_32", "utf-32");
126-
PY_CODECS_ALIASES.put("U32", "utf-32");
127-
PY_CODECS_ALIASES.put("utf_32_be", "utf-32be");
128-
PY_CODECS_ALIASES.put("utf_32_le", "utf-32le");
129-
PY_CODECS_ALIASES.put("utf32", "utf-32");
126+
final Charset utf32be = Charset.forName("utf-32be");
127+
final Charset utf32le = Charset.forName("utf-32le");
128+
final Charset ibm437 = Charset.forName("IBM437");
129+
130+
CHARSET_MAP.put("utf-32", UTF32);
131+
CHARSET_MAP.put("utf_32", UTF32);
132+
CHARSET_MAP.put("U32", UTF32);
133+
CHARSET_MAP.put("utf-32be", utf32be);
134+
CHARSET_MAP.put("utf_32_be", utf32be);
135+
CHARSET_MAP.put("utf-32le", utf32le);
136+
CHARSET_MAP.put("utf_32_le", utf32le);
137+
CHARSET_MAP.put("utf32", UTF32);
130138
// big5 big5-tw, csbig5 Traditional Chinese
131139
// big5hkscs big5-hkscs, hkscs Traditional Chinese
132140
// cp037 IBM037, IBM039 English
133141
// cp424 EBCDIC-CP-HE, IBM424 Hebrew
134142
// cp437 437, IBM437 English
135-
PY_CODECS_ALIASES.put("IBM437", "IBM437");
136-
PY_CODECS_ALIASES.put("IBM437 English", "IBM437");
137-
PY_CODECS_ALIASES.put("437", "IBM437");
138-
PY_CODECS_ALIASES.put("cp437", "IBM437");
143+
CHARSET_MAP.put("IBM437", ibm437);
144+
CHARSET_MAP.put("IBM437 English", ibm437);
145+
CHARSET_MAP.put("437", ibm437);
146+
CHARSET_MAP.put("cp437", ibm437);
139147
// cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe
140148
// cp720 Arabic
141149
// cp737 Greek
@@ -221,15 +229,7 @@ public class CodecsModuleBuiltins extends PythonBuiltins {
221229

222230
// Resolves a Python codec name to a java.nio Charset.
// NOTE(review): the added version returns CHARSET_MAP.get(encoding), and CHARSET_MAP is a
// HashMap, so an unmapped name (or a null name) yields null rather than an exception. The
// removed version defaulted null to DEFAULT_ENCODING and fell back to Charset.forName(...),
// which is what raised for unknown names. Callers shown in this diff still wrap getCharset in
// catch (UnsupportedCharsetException | IllegalCharsetNameException) and then dereference the
// result (charset.newEncoder() / charset.newDecoder()), and lookup() returns true whenever no
// exception is thrown. Presumably an unknown encoding now surfaces as a NullPointerException
// instead of LookupError -- confirm, and add an explicit null check at the call sites.
@TruffleBoundary
223231
static Charset getCharset(String encoding) {
224-
if (encoding == null) {
225-
return Charset.forName(DEFAULT_ENCODING);
226-
} else {
227-
String val = PY_CODECS_ALIASES.get(encoding);
228-
if (val != null) {
229-
return Charset.forName(val);
230-
}
231-
return Charset.forName(encoding);
232-
}
232+
return CHARSET_MAP.get(encoding);
233233
}
234234

235235
@Override
@@ -412,15 +412,18 @@ Object encode(Object str, @SuppressWarnings("unused") Object encoding, @Suppress
412412
@TruffleBoundary
413413
private PBytes encodeString(String self, String encoding, String errors) {
414414
CodingErrorAction errorAction = convertCodingErrorAction(errors);
415+
Charset charset;
416+
try {
417+
charset = getCharset(encoding);
418+
} catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
419+
throw raise(LookupError, "unknown encoding: %s", encoding);
420+
}
415421
try {
416-
Charset charset = getCharset(encoding);
417422
ByteBuffer encoded = charset.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
418423
int n = encoded.remaining();
419424
byte[] data = new byte[n];
420425
encoded.get(data);
421426
return factory().createBytes(data);
422-
} catch (IllegalArgumentException e) {
423-
throw raise(LookupError, "unknown encoding: %s", encoding);
424427
} catch (CharacterCodingException e) {
425428
throw raise(UnicodeEncodeError, e);
426429
}
@@ -454,8 +457,7 @@ private Object[] encodeString(String self, String errors) {
454457
CodingErrorAction errorAction = convertCodingErrorAction(errors);
455458

456459
try {
457-
Charset charset = getCharset("utf-32");
458-
ByteBuffer encoded = charset.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
460+
ByteBuffer encoded = UTF32.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
459461
int n = encoded.remaining();
460462
ByteBuffer buf = ByteBuffer.allocate(n);
461463
assert n % Integer.BYTES == 0;
@@ -543,12 +545,15 @@ private ByteBuffer getBytesBuffer(PIBytesLike bytesLike) {
543545
@TruffleBoundary
544546
String decodeBytes(ByteBuffer bytes, String encoding, String errors) {
545547
CodingErrorAction errorAction = convertCodingErrorAction(errors);
548+
Charset charset;
549+
try {
550+
charset = getCharset(encoding);
551+
} catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
552+
throw raise(LookupError, "unknown encoding: %s", encoding);
553+
}
546554
try {
547-
Charset charset = getCharset(encoding);
548555
CharBuffer decoded = charset.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(bytes);
549556
return String.valueOf(decoded);
550-
} catch (IllegalArgumentException e) {
551-
throw raise(LookupError, "unknown encoding: %s", encoding);
552557
} catch (CharacterCodingException e) {
553558
throw raise(UnicodeDecodeError, e);
554559
}
@@ -607,8 +612,7 @@ String decodeBytes(ByteBuffer bytes, String errors) {
607612
buf.putInt(val);
608613
}
609614
buf.flip();
610-
Charset charset = getCharset("utf-32");
611-
CharBuffer decoded = charset.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf);
615+
CharBuffer decoded = UTF32.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf);
612616
return String.valueOf(decoded);
613617
} catch (CharacterCodingException e) {
614618
throw raise(UnicodeDecodeError, e);
@@ -626,7 +630,7 @@ Object lookup(String encoding) {
626630
try {
627631
getCharset(encoding);
628632
return true;
629-
} catch (IllegalArgumentException e) {
633+
} catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
630634
return PNone.NONE;
631635
}
632636
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@
5555
import java.nio.charset.CharacterCodingException;
5656
import java.nio.charset.Charset;
5757
import java.nio.charset.CodingErrorAction;
58+
import java.nio.charset.IllegalCharsetNameException;
59+
import java.nio.charset.UnsupportedCharsetException;
5860
import java.util.Arrays;
5961
import java.util.List;
6062

@@ -1459,6 +1461,7 @@ private static int op(String self, String substr, int start) {
14591461
}
14601462
}
14611463

1464+
// This is only used during bootstrap and then replaced with Python code
14621465
@Builtin(name = "encode", minNumOfPositionalArgs = 1, parameterNames = {"self", "encoding", "errors"})
14631466
@GenerateNodeFactory
14641467
@TypeSystemReference(PythonArithmeticTypes.class)
@@ -1499,15 +1502,18 @@ private Object encodeString(String self, String encoding, String errors) {
14991502
break;
15001503
}
15011504

1505+
Charset cs;
1506+
try {
1507+
cs = Charset.forName(encoding);
1508+
} catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
1509+
throw raise(LookupError, "unknown encoding: %s", encoding);
1510+
}
15021511
try {
1503-
Charset cs = Charset.forName(encoding);
15041512
ByteBuffer encoded = cs.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
15051513
int n = encoded.remaining();
15061514
byte[] data = new byte[n];
15071515
encoded.get(data);
15081516
return factory().createBytes(data);
1509-
} catch (IllegalArgumentException e) {
1510-
throw raise(LookupError, "unknown encoding: %s", encoding);
15111517
} catch (CharacterCodingException e) {
15121518
throw raise(UnicodeEncodeError, e);
15131519
}

0 commit comments

Comments
 (0)