Skip to content

Commit fc161cc

Browse files
committed
More fixes for UTF BOM handling
1 parent 0c1ef60 commit fc161cc

File tree

2 files changed

+58
-6
lines changed

2 files changed

+58
-6
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_codecs.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Copyright (C) 1996-2017 Python Software Foundation
33
#
44
# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
5+
import sys
56

67

78
def coding_checker(self, coder):
@@ -323,3 +324,37 @@ def test_codecs_builtins(self):
323324

324325
encoded = codecs.ascii_encode(s)
325326
self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
327+
328+
329+
class UTFByteOrderTest(unittest.TestCase):
    """Byte-order and BOM behavior of the UTF-16 and UTF-32 codecs.

    The assertions pin three things for each codec family:

    * the explicit ``-le`` / ``-be`` codecs encode without a BOM;
    * the generic codec encodes with a BOM in platform byte order;
    * the generic codec decodes input carrying either BOM, and treats
      BOM-less input as platform byte order.
    """

    def test_utf16_byteorder(self):
        """UTF-16: U+1F602 is the surrogate pair D83D DE02."""
        # Explicit byte order: no BOM is written.
        self.assertEqual("😂".encode("utf-16-le"), b'=\xd8\x02\xde')
        self.assertEqual("😂".encode("utf-16-be"), b'\xd8=\xde\x02')
        # Generic codec: BOM plus payload, both in platform byte order.
        if sys.byteorder == 'little':
            self.assertEqual("😂".encode("utf-16"), b'\xff\xfe=\xd8\x02\xde')
        else:
            self.assertEqual("😂".encode("utf-16"), b'\xfe\xff\xd8=\xde\x02')
        self.assertEqual(b'=\xd8\x02\xde'.decode('utf-16-le'), "😂")
        self.assertEqual(b'\xd8=\xde\x02'.decode('utf-16-be'), "😂")
        # Generic codec must honor either BOM regardless of the platform.
        self.assertEqual(b'\xff\xfe=\xd8\x02\xde'.decode('utf-16'), "😂")
        self.assertEqual(b'\xfe\xff\xd8=\xde\x02'.decode('utf-16'), "😂")
        # Without a BOM the generic codec assumes platform byte order.
        if sys.byteorder == 'little':
            self.assertEqual(b'=\xd8\x02\xde'.decode('utf-16'), "😂")
        else:
            self.assertEqual(b'\xd8=\xde\x02'.decode('utf-16'), "😂")

    def test_utf32_byteorder(self):
        """UTF-32: U+1F602 is the single code unit 0001F602."""
        # Explicit byte order: no BOM is written.
        self.assertEqual("😂".encode("utf-32-le"), b'\x02\xf6\x01\x00')
        self.assertEqual("😂".encode("utf-32-be"), b'\x00\x01\xf6\x02')
        # Generic codec: BOM plus payload, both in platform byte order.
        if sys.byteorder == 'little':
            self.assertEqual("😂".encode("utf-32"), b'\xff\xfe\x00\x00\x02\xf6\x01\x00')
        else:
            # The payload after the BE BOM is the UTF-32-BE code unit, not the
            # UTF-16 surrogate pair.
            self.assertEqual("😂".encode("utf-32"), b'\x00\x00\xfe\xff\x00\x01\xf6\x02')
        self.assertEqual(b'\x02\xf6\x01\x00'.decode('utf-32-le'), "😂")
        self.assertEqual(b'\x00\x01\xf6\x02'.decode('utf-32-be'), "😂")
        # Generic codec must honor either BOM regardless of the platform.
        self.assertEqual(b'\xff\xfe\x00\x00\x02\xf6\x01\x00'.decode('utf-32'), "😂")
        self.assertEqual(b'\x00\x00\xfe\xff\x00\x01\xf6\x02'.decode('utf-32'), "😂")
        # Without a BOM the generic codec assumes platform byte order.
        if sys.byteorder == 'little':
            self.assertEqual(b'\x02\xf6\x01\x00'.decode('utf-32'), "😂")
        else:
            self.assertEqual(b'\x00\x01\xf6\x02'.decode('utf-32'), "😂")

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
* Utility class for mapping Python encodings to Java charsets
5959
*/
6060
public class CharsetMapping {
61+
private static final Charset UTF_32 = Charset.forName("UTF_32");
6162
private static final ConcurrentMap<String, Charset> JAVA_CHARSETS = new ConcurrentHashMap<>();
6263
// Name maps are populated by static initializer and are immutable afterwards
6364
private static final Map<String, String> CHARSET_NAME_MAP = new HashMap<>();
@@ -80,13 +81,13 @@ public static Charset getCharsetForDecoding(String encoding, byte[] bytes, int l
8081
* JDK's charsets for UTF-16 and UTF-32 default to big endian irrespective of the
8182
* platform if there is no BOM. The UTF-16-LE and UTF-32-LE charsets reject big endian
8283
* BOM. CPython defaults to platform endian and accepts both BOMs. So, in order to get
83-
* the behavior we need, we have to take a peek at the possible BOM and if it's BE BOM,
84-
* we use BE encoding, otherwise LE encoding.
84+
* the behavior we need, we peek at the start of the input: if it begins with a BOM, we use
85+
* the generic UTF-16/32 charset and let it detect the byte order; otherwise we default to UTF-16/32-LE.
8586
*/
86-
if ("utf_16".equals(normalized) && len >= 2 && bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) {
87-
return StandardCharsets.UTF_16BE;
88-
} else if ("utf_32".equals(normalized) && len >= 4 && bytes[0] == 0 && bytes[1] == 0 && bytes[2] == (byte) 0xFE && bytes[3] == (byte) 0xFF) {
89-
return getJavaCharset("UTF-32BE");
87+
if ("utf_16".equals(normalized) && hasUTF16BOM(bytes, len)) {
88+
return StandardCharsets.UTF_16;
89+
} else if ("utf_32".equals(normalized) && hasUTF32BOM(bytes, len)) {
90+
return UTF_32;
9091
}
9192
}
9293
String name = CHARSET_NAME_MAP.get(normalized);
@@ -96,6 +97,22 @@ public static Charset getCharsetForDecoding(String encoding, byte[] bytes, int l
9697
return null;
9798
}
9899

100+
private static boolean hasUTF16BOM(byte[] bytes, int len) {
101+
if (len < 2) {
102+
return false;
103+
}
104+
short head = PythonUtils.arrayAccessor.getShort(bytes, 0);
105+
return head == (short) 0xFFFE || head == (short) 0xFEFF;
106+
}
107+
108+
private static boolean hasUTF32BOM(byte[] bytes, int len) {
109+
if (len < 4) {
110+
return false;
111+
}
112+
int head = PythonUtils.arrayAccessor.getInt(bytes, 0);
113+
return head == 0xFFFE0000 || head == 0x0000FEFF;
114+
}
115+
99116
@TruffleBoundary
100117
public static String getPythonEncodingNameFromJavaName(String javaEncodingName) {
101118
return CHARSET_NAME_MAP_REVERSE.get(javaEncodingName.toLowerCase());

0 commit comments

Comments
 (0)