More fixes for UTF BOM handling

msimacek · msimacek · commit fc161ccefbc2 · 2021-12-20T12:55:10.000+01:00
diff --git a/graalpython/com.oracle.graal.python.test/src/tests/test_codecs.py b/graalpython/com.oracle.graal.python.test/src/tests/test_codecs.py
@@ -2,6 +2,7 @@
 # Copyright (C) 1996-2017 Python Software Foundation
 #
 # Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+import sys
 
 
 def coding_checker(self, coder):
@@ -323,3 +324,37 @@ def test_codecs_builtins(self):
 
         encoded = codecs.ascii_encode(s)
         self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
+
+
+class UTFByteOrderTest(unittest.TestCase):  # UTF-16/32 byte order and BOM handling, using U+1F602
+    def test_utf16_byteorder(self):  # U+1F602 is the surrogate pair D83D DE02 ('=' is byte 0x3D)
+        self.assertEqual("😂".encode("utf-16-le"), b'=\xd8\x02\xde')
+        self.assertEqual("😂".encode("utf-16-be"), b'\xd8=\xde\x02')
+        if sys.byteorder == 'little':  # plain "utf-16" encodes in host order and prepends a BOM
+            self.assertEqual("😂".encode("utf-16"), b'\xff\xfe=\xd8\x02\xde')
+        else:
+            self.assertEqual("😂".encode("utf-16"), b'\xfe\xff\xd8=\xde\x02')
+        self.assertEqual(b'=\xd8\x02\xde'.decode('utf-16-le'), "😂")
+        self.assertEqual(b'\xd8=\xde\x02'.decode('utf-16-be'), "😂")
+        self.assertEqual(b'\xff\xfe=\xd8\x02\xde'.decode('utf-16'), "😂")  # LE BOM is honored
+        self.assertEqual(b'\xfe\xff\xd8=\xde\x02'.decode('utf-16'), "😂")  # BE BOM is honored
+        if sys.byteorder == 'little':  # no BOM: decoding is expected to use the host byte order
+            self.assertEqual(b'=\xd8\x02\xde'.decode('utf-16'), "😂")
+        else:
+            self.assertEqual(b'\xd8=\xde\x02'.decode('utf-16'), "😂")
+
+    def test_utf32_byteorder(self):  # U+1F602 is the single UTF-32 code unit 0x0001F602
+        self.assertEqual("😂".encode("utf-32-le"), b'\x02\xf6\x01\x00')
+        self.assertEqual("😂".encode("utf-32-be"), b'\x00\x01\xf6\x02')
+        if sys.byteorder == 'little':  # plain "utf-32" encodes in host order and prepends a BOM
+            self.assertEqual("😂".encode("utf-32"), b'\xff\xfe\x00\x00\x02\xf6\x01\x00')
+        else:  # big-endian host: BE BOM followed by the UTF-32-BE code unit, not UTF-16 surrogates
+            self.assertEqual("😂".encode("utf-32"), b'\x00\x00\xfe\xff\x00\x01\xf6\x02')
+        self.assertEqual(b'\x02\xf6\x01\x00'.decode('utf-32-le'), "😂")
+        self.assertEqual(b'\x00\x01\xf6\x02'.decode('utf-32-be'), "😂")
+        self.assertEqual(b'\xff\xfe\x00\x00\x02\xf6\x01\x00'.decode('utf-32'), "😂")  # LE BOM is honored
+        self.assertEqual(b'\x00\x00\xfe\xff\x00\x01\xf6\x02'.decode('utf-32'), "😂")  # BE BOM is honored
+        if sys.byteorder == 'little':  # no BOM: decoding is expected to use the host byte order
+            self.assertEqual(b'\x02\xf6\x01\x00'.decode('utf-32'), "😂")
+        else:
+            self.assertEqual(b'\x00\x01\xf6\x02'.decode('utf-32'), "😂")
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java
@@ -58,6 +58,7 @@
  * Utility class for mapping Python encodings to Java charsets
  */
 public class CharsetMapping {
+    private static final Charset UTF_32 = Charset.forName("UTF_32");
     private static final ConcurrentMap<String, Charset> JAVA_CHARSETS = new ConcurrentHashMap<>();
     // Name maps are populated by static initializer and are immutable afterwards
     private static final Map<String, String> CHARSET_NAME_MAP = new HashMap<>();
@@ -80,13 +81,13 @@ public static Charset getCharsetForDecoding(String encoding, byte[] bytes, int l
              * JDK's charsets for UTF-16 and UTF-32 default to big endian irrespective of the
              * platform if there is no BOM. The UTF-16-LE and UTF-32-LE charsets reject big endian
              * BOM. CPython defaults to platform endian and accepts both BOMs. So, in order to get
-             * the behavior we need, we have to take a peek at the possible BOM and if it's BE BOM,
-             * we use BE encoding, otherwise LE encoding.
+             * the behavior we need, we peek at the input: if it starts with a BOM, we use the
+             * BOM-detecting UTF-16/32 charset; otherwise we default to UTF-16/32-LE.
              */
-            if ("utf_16".equals(normalized) && len >= 2 && bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) {
-                return StandardCharsets.UTF_16BE;
-            } else if ("utf_32".equals(normalized) && len >= 4 && bytes[0] == 0 && bytes[1] == 0 && bytes[2] == (byte) 0xFE && bytes[3] == (byte) 0xFF) {
-                return getJavaCharset("UTF-32BE");
+            if ("utf_16".equals(normalized) && hasUTF16BOM(bytes, len)) {
+                return StandardCharsets.UTF_16;
+            } else if ("utf_32".equals(normalized) && hasUTF32BOM(bytes, len)) {
+                return UTF_32;
             }
         }
         String name = CHARSET_NAME_MAP.get(normalized);
@@ -96,6 +97,22 @@ public static Charset getCharsetForDecoding(String encoding, byte[] bytes, int l
         return null;
     }
 
+    /** Checks whether an input of {@code len} bytes starts with a UTF-16 BOM, in either byte order. */
+    private static boolean hasUTF16BOM(byte[] bytes, int len) {
+        // Inputs too short to hold a BOM are read as 0, which matches neither pattern below.
+        final short bom = len >= 2 ? PythonUtils.arrayAccessor.getShort(bytes, 0) : 0;
+        // Accepting both byte-swapped values makes the check independent of the read order.
+        return bom == (short) 0xFEFF || bom == (short) 0xFFFE;
+    }
+
+    /** Checks whether an input of {@code len} bytes starts with a UTF-32 BOM, in either byte order. */
+    private static boolean hasUTF32BOM(byte[] bytes, int len) {
+        // Inputs too short to hold a BOM are read as 0, which matches neither pattern below.
+        final int bom = len >= 4 ? PythonUtils.arrayAccessor.getInt(bytes, 0) : 0;
+        // Accepting both byte-swapped values makes the check independent of the read order.
+        return bom == 0x0000FEFF || bom == 0xFFFE0000;
+    }
+
     @TruffleBoundary
     public static String getPythonEncodingNameFromJavaName(String javaEncodingName) {
         return CHARSET_NAME_MAP_REVERSE.get(javaEncodingName.toLowerCase());