[GR-15586] Store Charset instances statically

timfel · timfel · commit f3c7a51d2bcf · 2019-05-07T22:29:13.000-07:00
PullRequest: graalpython/507
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java
@@ -50,6 +50,9 @@
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
 import java.nio.charset.CodingErrorAction;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
@@ -82,60 +85,65 @@
 
 @CoreFunctions(defineModule = "_codecs")
 public class CodecsModuleBuiltins extends PythonBuiltins {
-    public static final String DEFAULT_ENCODING = "utf-8";
+    private static final Charset UTF32 = Charset.forName("utf-32");
 
     // python to java codecs mapping
-    private static final Map<String, String> PY_CODECS_ALIASES = new HashMap<>();
+    private static final Map<String, Charset> CHARSET_MAP = new HashMap<>();
     static {
         // ascii
-        PY_CODECS_ALIASES.put("us-ascii", "us-ascii");
-        PY_CODECS_ALIASES.put("ascii", "us-ascii");
-        PY_CODECS_ALIASES.put("646", "us-ascii");
+        CHARSET_MAP.put("us-ascii", StandardCharsets.US_ASCII);
+        CHARSET_MAP.put("ascii", StandardCharsets.US_ASCII);
+        CHARSET_MAP.put("646", StandardCharsets.US_ASCII);
 
         // latin 1
-        PY_CODECS_ALIASES.put("iso-8859-1", "iso-8859-1");
-        PY_CODECS_ALIASES.put("latin-1", "iso-8859-1");
-        PY_CODECS_ALIASES.put("latin_1", "iso-8859-1");
-        PY_CODECS_ALIASES.put("iso-8859-1", "iso-8859-1");
-        PY_CODECS_ALIASES.put("iso8859-1", "iso-8859-1");
-        PY_CODECS_ALIASES.put("8859", "iso-8859-1");
-        PY_CODECS_ALIASES.put("cp819", "iso-8859-1");
-        PY_CODECS_ALIASES.put("latin", "iso-8859-1");
-        PY_CODECS_ALIASES.put("latin1", "iso-8859-1");
-        PY_CODECS_ALIASES.put("L1", "iso-8859-1");
+        CHARSET_MAP.put("iso-8859-1", StandardCharsets.ISO_8859_1);
+        CHARSET_MAP.put("latin-1", StandardCharsets.ISO_8859_1);
+        CHARSET_MAP.put("latin_1", StandardCharsets.ISO_8859_1);
+        CHARSET_MAP.put("iso8859-1", StandardCharsets.ISO_8859_1);
+        CHARSET_MAP.put("8859", StandardCharsets.ISO_8859_1);
+        CHARSET_MAP.put("cp819", StandardCharsets.ISO_8859_1);
+        CHARSET_MAP.put("latin", StandardCharsets.ISO_8859_1);
+        CHARSET_MAP.put("latin1", StandardCharsets.ISO_8859_1);
+        CHARSET_MAP.put("L1", StandardCharsets.ISO_8859_1);
 
         // utf-8
-        PY_CODECS_ALIASES.put("utf-8", "utf-8");
-        PY_CODECS_ALIASES.put("utf_8", "utf-8");
-        PY_CODECS_ALIASES.put("U8", "utf-8");
-        PY_CODECS_ALIASES.put("UTF", "utf-8");
-        PY_CODECS_ALIASES.put("utf8", "utf-8");
+        CHARSET_MAP.put("utf-8", StandardCharsets.UTF_8);
+        CHARSET_MAP.put("utf_8", StandardCharsets.UTF_8);
+        CHARSET_MAP.put("U8", StandardCharsets.UTF_8);
+        CHARSET_MAP.put("UTF", StandardCharsets.UTF_8);
+        CHARSET_MAP.put("utf8", StandardCharsets.UTF_8);
 
         // utf-16
-        PY_CODECS_ALIASES.put("utf-16", "utf-16");
-        PY_CODECS_ALIASES.put("utf_16", "utf-16");
-        PY_CODECS_ALIASES.put("U16", "utf-16");
-        PY_CODECS_ALIASES.put("utf16", "utf-16");
+        CHARSET_MAP.put("utf-16", StandardCharsets.UTF_16);
+        CHARSET_MAP.put("utf_16", StandardCharsets.UTF_16);
+        CHARSET_MAP.put("U16", StandardCharsets.UTF_16);
+        CHARSET_MAP.put("utf16", StandardCharsets.UTF_16);
         // TODO BMP only
-        PY_CODECS_ALIASES.put("utf_16_be", "utf-16be");
-        PY_CODECS_ALIASES.put("utf_16_le", "utf-16le");
+        CHARSET_MAP.put("utf_16_be", StandardCharsets.UTF_16BE);
+        CHARSET_MAP.put("utf_16_le", StandardCharsets.UTF_16LE);
 
         // utf-32
-        PY_CODECS_ALIASES.put("utf-32", "utf-32");
-        PY_CODECS_ALIASES.put("utf_32", "utf-32");
-        PY_CODECS_ALIASES.put("U32", "utf-32");
-        PY_CODECS_ALIASES.put("utf_32_be", "utf-32be");
-        PY_CODECS_ALIASES.put("utf_32_le", "utf-32le");
-        PY_CODECS_ALIASES.put("utf32", "utf-32");
+        final Charset utf32be = Charset.forName("utf-32be");
+        final Charset utf32le = Charset.forName("utf-32le");
+        final Charset ibm437 = Charset.forName("IBM437");
+
+        CHARSET_MAP.put("utf-32", UTF32);
+        CHARSET_MAP.put("utf_32", UTF32);
+        CHARSET_MAP.put("U32", UTF32);
+        CHARSET_MAP.put("utf-32be", utf32be);
+        CHARSET_MAP.put("utf_32_be", utf32be);
+        CHARSET_MAP.put("utf-32le", utf32le);
+        CHARSET_MAP.put("utf_32_le", utf32le);
+        CHARSET_MAP.put("utf32", UTF32);
         // big5 big5-tw, csbig5 Traditional Chinese
         // big5hkscs big5-hkscs, hkscs Traditional Chinese
         // cp037 IBM037, IBM039 English
         // cp424 EBCDIC-CP-HE, IBM424 Hebrew
         // cp437 437, IBM437 English
-        PY_CODECS_ALIASES.put("IBM437", "IBM437");
-        PY_CODECS_ALIASES.put("IBM437 English", "IBM437");
-        PY_CODECS_ALIASES.put("437", "IBM437");
-        PY_CODECS_ALIASES.put("cp437", "IBM437");
+        CHARSET_MAP.put("IBM437", ibm437);
+        CHARSET_MAP.put("IBM437 English", ibm437);
+        CHARSET_MAP.put("437", ibm437);
+        CHARSET_MAP.put("cp437", ibm437);
         // cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe
         // cp720 Arabic
         // cp737 Greek
@@ -221,15 +229,25 @@ public class CodecsModuleBuiltins extends PythonBuiltins {
 
+    /**
+     * Resolves a Python codec name to a Java {@link Charset}.
+     * <p>
+     * Names registered in {@code CHARSET_MAP} are served from the statically stored instances. A
+     * {@code null} name selects UTF-8, and any other name is handed to
+     * {@link Charset#forName(String)} so that unknown or malformed names fail with the exceptions
+     * the callers translate into {@code LookupError}, instead of yielding {@code null}.
+     *
+     * @param encoding the codec name, or {@code null} for the default (UTF-8)
+     * @return the matching charset, never {@code null}
+     * @throws IllegalCharsetNameException if the name is not a legal charset name
+     * @throws UnsupportedCharsetException if no charset with that name is available
+     */
     @TruffleBoundary
     static Charset getCharset(String encoding) {
-        if (encoding == null) {
-            return Charset.forName(DEFAULT_ENCODING);
-        } else {
-            String val = PY_CODECS_ALIASES.get(encoding);
-            if (val != null) {
-                return Charset.forName(val);
-            }
-            return Charset.forName(encoding);
-        }
+        if (encoding == null) {
+            return StandardCharsets.UTF_8;
+        }
+        Charset charset = CHARSET_MAP.get(encoding);
+        // not one of the cached aliases: let the JDK resolve it, or throw for unknown names
+        return charset != null ? charset : Charset.forName(encoding);
     }
 
     @Override
@@ -412,15 +412,18 @@ Object encode(Object str, @SuppressWarnings("unused") Object encoding, @Suppress
         @TruffleBoundary
         private PBytes encodeString(String self, String encoding, String errors) {
             CodingErrorAction errorAction = convertCodingErrorAction(errors);
+            Charset charset;
+            try {
+                charset = getCharset(encoding);
+            } catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
+                throw raise(LookupError, "unknown encoding: %s", encoding);
+            }
             try {
-                Charset charset = getCharset(encoding);
                 ByteBuffer encoded = charset.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
                 int n = encoded.remaining();
                 byte[] data = new byte[n];
                 encoded.get(data);
                 return factory().createBytes(data);
-            } catch (IllegalArgumentException e) {
-                throw raise(LookupError, "unknown encoding: %s", encoding);
             } catch (CharacterCodingException e) {
                 throw raise(UnicodeEncodeError, e);
             }
@@ -454,8 +457,7 @@ private Object[] encodeString(String self, String errors) {
             CodingErrorAction errorAction = convertCodingErrorAction(errors);
 
             try {
-                Charset charset = getCharset("utf-32");
-                ByteBuffer encoded = charset.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
+                ByteBuffer encoded = UTF32.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
                 int n = encoded.remaining();
                 ByteBuffer buf = ByteBuffer.allocate(n);
                 assert n % Integer.BYTES == 0;
@@ -543,12 +545,15 @@ private ByteBuffer getBytesBuffer(PIBytesLike bytesLike) {
         @TruffleBoundary
         String decodeBytes(ByteBuffer bytes, String encoding, String errors) {
             CodingErrorAction errorAction = convertCodingErrorAction(errors);
+            Charset charset;
+            try {
+                charset = getCharset(encoding);
+            } catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
+                throw raise(LookupError, "unknown encoding: %s", encoding);
+            }
             try {
-                Charset charset = getCharset(encoding);
                 CharBuffer decoded = charset.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(bytes);
                 return String.valueOf(decoded);
-            } catch (IllegalArgumentException e) {
-                throw raise(LookupError, "unknown encoding: %s", encoding);
             } catch (CharacterCodingException e) {
                 throw raise(UnicodeDecodeError, e);
             }
@@ -607,8 +612,7 @@ String decodeBytes(ByteBuffer bytes, String errors) {
                     buf.putInt(val);
                 }
                 buf.flip();
-                Charset charset = getCharset("utf-32");
-                CharBuffer decoded = charset.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf);
+                CharBuffer decoded = UTF32.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).decode(buf);
                 return String.valueOf(decoded);
             } catch (CharacterCodingException e) {
                 throw raise(UnicodeDecodeError, e);
@@ -626,7 +630,7 @@ Object lookup(String encoding) {
             try {
                 getCharset(encoding);
                 return true;
-            } catch (IllegalArgumentException e) {
+            } catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
                 return PNone.NONE;
             }
         }
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java
@@ -55,6 +55,8 @@
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
 import java.nio.charset.CodingErrorAction;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.Arrays;
 import java.util.List;
 
@@ -1459,6 +1461,7 @@ private static int op(String self, String substr, int start) {
         }
     }
 
+    // This is only used during bootstrap and then replaced with Python code
     @Builtin(name = "encode", minNumOfPositionalArgs = 1, parameterNames = {"self", "encoding", "errors"})
     @GenerateNodeFactory
     @TypeSystemReference(PythonArithmeticTypes.class)
@@ -1499,15 +1502,18 @@ private Object encodeString(String self, String encoding, String errors) {
                     break;
             }
 
+            Charset cs;
+            try {
+                cs = Charset.forName(encoding);
+            } catch (UnsupportedCharsetException | IllegalCharsetNameException e) {
+                throw raise(LookupError, "unknown encoding: %s", encoding);
+            }
             try {
-                Charset cs = Charset.forName(encoding);
                 ByteBuffer encoded = cs.newEncoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction).encode(CharBuffer.wrap(self));
                 int n = encoded.remaining();
                 byte[] data = new byte[n];
                 encoded.get(data);
                 return factory().createBytes(data);
-            } catch (IllegalArgumentException e) {
-                throw raise(LookupError, "unknown encoding: %s", encoding);
             } catch (CharacterCodingException e) {
                 throw raise(UnicodeEncodeError, e);
             }