Skip to content

Commit 9853bc9

Browse files
committed
Match UTF-16/UTF-32 endianness/BOM handling with CPython
1 parent 89ff88d commit 9853bc9

File tree

8 files changed

+60
-108
lines changed

8 files changed

+60
-108
lines changed

graalpython/com.oracle.graal.python.cext/src/unicodeobject.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode) {
408408

409409
UPCALL_ID(PyTruffle_Unicode_AsWideChar);
410410
Py_UNICODE* PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) {
411-
PyObject* bytes = UPCALL_CEXT_O(_jls_PyTruffle_Unicode_AsWideChar, native_to_java(unicode), Py_UNICODE_SIZE, native_to_java(Py_None), ERROR_MARKER);
411+
PyObject* bytes = UPCALL_CEXT_O(_jls_PyTruffle_Unicode_AsWideChar, native_to_java(unicode), Py_UNICODE_SIZE, ERROR_MARKER);
412412
if (bytes != NULL) {
413413
// exclude null terminator at the end
414414
*size = PyBytes_Size(bytes) / Py_UNICODE_SIZE;

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_codecs.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@
122122
*graalpython.lib-python.3.test.test_codecs.UTF32Test.test_handlers
123123
*graalpython.lib-python.3.test.test_codecs.UTF32Test.test_issue8941
124124
*graalpython.lib-python.3.test.test_codecs.UTF32Test.test_mixed_readline_and_read
125+
*graalpython.lib-python.3.test.test_codecs.UTF32Test.test_partial
125126
*graalpython.lib-python.3.test.test_codecs.UTF32Test.test_readline
126127
*graalpython.lib-python.3.test.test_codecs.UTF32Test.test_readlinequeue
127128
*graalpython.lib-python.3.test.test_codecs.UTF7Test.test_ascii

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,7 @@ Object decode(VirtualFrame frame, PNodeWithRaiseAndIndirectCall node, Object inp
585585
int len = bufferLib.getBufferLength(buffer);
586586
byte[] bytes = bufferLib.getInternalOrCopiedByteArray(buffer);
587587
CodingErrorAction errorAction = convertCodingErrorAction(errors);
588-
Charset charset = CharsetMapping.getCharset(encoding);
588+
Charset charset = CharsetMapping.getCharsetForDecoding(encoding, bytes, len);
589589
if (charset == null) {
590590
throw raiseNode.raise(LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding);
591591
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextBuiltins.java

Lines changed: 7 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,6 @@
151151
import com.oracle.graal.python.builtins.objects.cext.capi.PythonNativeWrapper;
152152
import com.oracle.graal.python.builtins.objects.cext.capi.PythonNativeWrapperLibrary;
153153
import com.oracle.graal.python.builtins.objects.cext.capi.UnicodeObjectNodes.UnicodeAsWideCharNode;
154-
import com.oracle.graal.python.builtins.objects.cext.capi.UnicodeObjectNodesFactory.UnicodeAsWideCharNodeGen;
155154
import com.oracle.graal.python.builtins.objects.cext.common.CArrayWrappers.CStringWrapper;
156155
import com.oracle.graal.python.builtins.objects.cext.common.CExtAsPythonObjectNode;
157156
import com.oracle.graal.python.builtins.objects.cext.common.CExtCommonNodes;
@@ -264,6 +263,7 @@
264263
import com.oracle.graal.python.nodes.function.builtins.PythonVarargsBuiltinNode;
265264
import com.oracle.graal.python.nodes.function.builtins.clinic.ArgumentClinicProvider;
266265
import com.oracle.graal.python.nodes.object.GetClassNode;
266+
import com.oracle.graal.python.nodes.truffle.PythonArithmeticTypes;
267267
import com.oracle.graal.python.nodes.truffle.PythonTypes;
268268
import com.oracle.graal.python.nodes.util.CannotCastException;
269269
import com.oracle.graal.python.nodes.util.CastToJavaIntExactNode;
@@ -1432,27 +1432,16 @@ private String decodeUTF32(byte[] data, int size, String errors, int byteorder)
14321432
}
14331433
}
14341434

1435-
@Builtin(name = "PyTruffle_Unicode_AsWideChar", minNumOfPositionalArgs = 4)
1435+
@Builtin(name = "PyTruffle_Unicode_AsWideChar", minNumOfPositionalArgs = 3)
1436+
@TypeSystemReference(PythonArithmeticTypes.class)
14361437
@GenerateNodeFactory
14371438
abstract static class PyTruffle_Unicode_AsWideChar extends NativeUnicodeBuiltin {
1438-
@Child private UnicodeAsWideCharNode asWideCharNode;
1439-
14401439
@Specialization
1441-
Object doUnicode(VirtualFrame frame, Object s, long elementSize, @SuppressWarnings("unused") PNone elements, Object errorMarker,
1442-
@Shared("castStr") @Cached CastToJavaStringNode castStr) {
1443-
return doUnicode(frame, s, elementSize, -1, errorMarker, castStr);
1444-
}
1445-
1446-
@Specialization
1447-
Object doUnicode(VirtualFrame frame, Object s, long elementSize, long elements, Object errorMarker,
1448-
@Shared("castStr") @Cached CastToJavaStringNode castStr) {
1440+
Object doUnicode(VirtualFrame frame, Object s, long elementSize, Object errorMarker,
1441+
@Cached UnicodeAsWideCharNode asWideCharNode,
1442+
@Cached CastToJavaStringNode castStr) {
14491443
try {
1450-
if (asWideCharNode == null) {
1451-
CompilerDirectives.transferToInterpreterAndInvalidate();
1452-
asWideCharNode = insert(UnicodeAsWideCharNodeGen.create());
1453-
}
1454-
1455-
PBytes wchars = asWideCharNode.executeLittleEndian(castStr.execute(s), elementSize, elements);
1444+
PBytes wchars = asWideCharNode.executeLittleEndian(castStr.execute(s), elementSize);
14561445
if (wchars != null) {
14571446
return wchars;
14581447
} else {
@@ -1463,26 +1452,6 @@ Object doUnicode(VirtualFrame frame, Object s, long elementSize, long elements,
14631452
return raiseNative(frame, errorMarker, PythonErrorType.LookupError, "%m", e);
14641453
}
14651454
}
1466-
1467-
@Specialization
1468-
Object doUnicode(VirtualFrame frame, String s, PInt elementSize, @SuppressWarnings("unused") PNone elements, Object errorMarker,
1469-
@Shared("castStr") @Cached CastToJavaStringNode castStr) {
1470-
try {
1471-
return doUnicode(frame, s, elementSize.longValueExact(), -1, errorMarker, castStr);
1472-
} catch (OverflowException e) {
1473-
return raiseNative(frame, errorMarker, PythonErrorType.ValueError, ErrorMessages.INVALID_PARAMS);
1474-
}
1475-
}
1476-
1477-
@Specialization
1478-
Object doUnicode(VirtualFrame frame, String s, PInt elementSize, PInt elements, Object errorMarker,
1479-
@Shared("castStr") @Cached CastToJavaStringNode castStr) {
1480-
try {
1481-
return doUnicode(frame, s, elementSize.longValueExact(), elements.longValueExact(), errorMarker, castStr);
1482-
} catch (OverflowException e) {
1483-
return raiseNative(frame, errorMarker, PythonErrorType.ValueError, ErrorMessages.INVALID_PARAMS);
1484-
}
1485-
}
14861455
}
14871456

14881457
@Builtin(name = "PyTruffle_Bytes_AsString", minNumOfPositionalArgs = 2)

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/cext/capi/DynamicObjectNativeWrapper.java

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -882,26 +882,24 @@ static Object doComplexCVal(PComplex object, @SuppressWarnings("unused") PythonN
882882
@Specialization(guards = "eq(UNICODE_WSTR, key)")
883883
static Object doWstr(PString object, @SuppressWarnings("unused") PythonNativeWrapper nativeWrapper, @SuppressWarnings("unused") String key,
884884
@Shared("asWideCharNode") @Cached UnicodeAsWideCharNode asWideCharNode,
885-
@Shared("sizeofWcharNode") @Cached SizeofWCharNode sizeofWcharNode,
886-
@Shared("strLen") @Cached StringLenNode stringLenNode) {
885+
@Shared("sizeofWcharNode") @Cached SizeofWCharNode sizeofWcharNode) {
887886
int elementSize = (int) sizeofWcharNode.execute(CApiContext.LAZY_CONTEXT);
888-
return new PySequenceArrayWrapper(asWideCharNode.executeNativeOrder(object, elementSize, stringLenNode.execute(object)), elementSize);
887+
return new PySequenceArrayWrapper(asWideCharNode.executeNativeOrder(object, elementSize), elementSize);
889888
}
890889

891890
@Specialization(guards = "eq(UNICODE_WSTR_LENGTH, key)")
892891
static long doWstrLength(PString object, @SuppressWarnings("unused") PythonNativeWrapper nativeWrapper, @SuppressWarnings("unused") String key,
893892
@Shared("asWideCharNode") @Cached UnicodeAsWideCharNode asWideCharNode,
894893
@Cached SequenceStorageNodes.LenNode lenNode,
895-
@Shared("sizeofWcharNode") @Cached SizeofWCharNode sizeofWcharNode,
896-
@Shared("strLen") @Cached StringLenNode stringLenNode) {
894+
@Shared("sizeofWcharNode") @Cached SizeofWCharNode sizeofWcharNode) {
897895
long sizeofWchar = sizeofWcharNode.execute(CApiContext.LAZY_CONTEXT);
898-
PBytes result = asWideCharNode.executeNativeOrder(object, sizeofWchar, stringLenNode.execute(object));
896+
PBytes result = asWideCharNode.executeNativeOrder(object, sizeofWchar);
899897
return lenNode.execute(result.getSequenceStorage()) / sizeofWchar;
900898
}
901899

902900
@Specialization(guards = "eq(UNICODE_LENGTH, key)")
903901
static long doUnicodeLength(PString object, @SuppressWarnings("unused") PythonNativeWrapper nativeWrapper, @SuppressWarnings("unused") String key,
904-
@Shared("strLen") @Cached StringLenNode stringLenNode) {
902+
@Cached StringLenNode stringLenNode) {
905903
return stringLenNode.execute(object);
906904
}
907905

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/cext/capi/PyUnicodeWrappers.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,12 @@
5454
import java.nio.charset.StandardCharsets;
5555

5656
import com.oracle.graal.python.builtins.objects.cext.capi.CExtNodes.IsPointerNode;
57-
import com.oracle.graal.python.builtins.objects.cext.common.CExtCommonNodes.SizeofWCharNode;
5857
import com.oracle.graal.python.builtins.objects.cext.capi.DynamicObjectNativeWrapper.PAsPointerNode;
5958
import com.oracle.graal.python.builtins.objects.cext.capi.DynamicObjectNativeWrapper.ToPyObjectNode;
6059
import com.oracle.graal.python.builtins.objects.cext.capi.UnicodeObjectNodes.UnicodeAsWideCharNode;
60+
import com.oracle.graal.python.builtins.objects.cext.common.CExtCommonNodes.SizeofWCharNode;
6161
import com.oracle.graal.python.builtins.objects.str.NativeCharSequence;
6262
import com.oracle.graal.python.builtins.objects.str.PString;
63-
import com.oracle.graal.python.builtins.objects.str.StringNodes.StringLenNode;
6463
import com.oracle.graal.python.builtins.objects.str.StringNodes.StringMaterializeNode;
6564
import com.oracle.graal.python.runtime.GilNode;
6665
import com.oracle.truffle.api.CompilerDirectives;
@@ -163,7 +162,6 @@ Object readMember(String member,
163162
@CachedLibrary("this") PythonNativeWrapperLibrary lib,
164163
@Cached UnicodeAsWideCharNode asWideCharNode,
165164
@Cached SizeofWCharNode sizeofWcharNode,
166-
@Exclusive @Cached StringLenNode stringLenNode,
167165
@Exclusive @Cached GilNode gil) throws UnknownIdentifierException {
168166
boolean mustRelease = gil.acquire();
169167
try {
@@ -176,7 +174,7 @@ Object readMember(String member,
176174
// in this case, we can just return the pointer
177175
return ((NativeCharSequence) content).getPtr();
178176
}
179-
return new PySequenceArrayWrapper(asWideCharNode.executeNativeOrder(s, elementSize, stringLenNode.execute(s)), elementSize);
177+
return new PySequenceArrayWrapper(asWideCharNode.executeNativeOrder(s, elementSize), elementSize);
180178
}
181179
throw UnknownIdentifierException.create(member);
182180
} finally {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/cext/capi/UnicodeObjectNodes.java

Lines changed: 16 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
package com.oracle.graal.python.builtins.objects.cext.capi;
4242

4343
import java.nio.ByteBuffer;
44+
import java.nio.ByteOrder;
4445
import java.nio.charset.Charset;
4546

4647
import com.oracle.graal.python.builtins.objects.bytes.PBytes;
@@ -58,56 +59,46 @@ public abstract class UnicodeObjectNodes {
5859

5960
@GenerateUncached
6061
public abstract static class UnicodeAsWideCharNode extends Node {
61-
private static final int NATIVE_ORDER = 0;
62-
private static final int LITTLE_ENDIAN = -1;
63-
private static final int BIG_ENDIAN = 1;
64-
private static Charset UTF32;
65-
private static Charset UTF32LE;
66-
private static Charset UTF32BE;
62+
private static Charset UTF32LE = Charset.forName("UTF-32LE");
63+
private static Charset UTF32BE = Charset.forName("UTF-32BE");
6764

68-
public final PBytes executeNativeOrder(Object obj, long elementSize, long elements) {
69-
return execute(obj, elementSize, elements, UnicodeAsWideCharNode.NATIVE_ORDER);
65+
public final PBytes executeNativeOrder(Object obj, long elementSize) {
66+
return execute(obj, elementSize, ByteOrder.nativeOrder());
7067
}
7168

72-
public final PBytes executeLittleEndian(Object obj, long elementSize, long elements) {
73-
return execute(obj, elementSize, elements, UnicodeAsWideCharNode.LITTLE_ENDIAN);
69+
public final PBytes executeLittleEndian(Object obj, long elementSize) {
70+
return execute(obj, elementSize, ByteOrder.LITTLE_ENDIAN);
7471
}
7572

76-
public final PBytes executeBigEndian(Object obj, long elementSize, long elements) {
77-
return execute(obj, elementSize, elements, UnicodeAsWideCharNode.BIG_ENDIAN);
73+
public final PBytes executeBigEndian(Object obj, long elementSize) {
74+
return execute(obj, elementSize, ByteOrder.BIG_ENDIAN);
7875
}
7976

80-
public abstract PBytes execute(Object obj, long elementSize, long elements, int byteOrder);
77+
public abstract PBytes execute(Object obj, long elementSize, ByteOrder byteOrder);
8178

8279
@Specialization
83-
static PBytes doUnicode(PString s, long elementSize, long elements, int byteOrder,
80+
static PBytes doUnicode(PString s, long elementSize, ByteOrder byteOrder,
8481
@Cached StringMaterializeNode materializeNode,
8582
@Shared("factory") @Cached PythonObjectFactory factory) {
86-
return doUnicode(materializeNode.execute(s), elementSize, elements, byteOrder, factory);
83+
return doUnicode(materializeNode.execute(s), elementSize, byteOrder, factory);
8784
}
8885

8986
@Specialization
9087
@TruffleBoundary
91-
static PBytes doUnicode(String s, long elementSize, long elements, int byteOrder,
88+
static PBytes doUnicode(String s, long elementSize, ByteOrder byteOrder,
9289
@Shared("factory") @Cached PythonObjectFactory factory) {
93-
// use native byte order
94-
Charset utf32Charset = getUTF32Charset(-1);
90+
Charset utf32Charset = byteOrder == ByteOrder.LITTLE_ENDIAN ? UTF32LE : UTF32BE;
9591

9692
// elementSize == 2: Store String in 'wchar_t' of size == 2, i.e., use UCS2. This is
9793
// achieved by encoding to UTF32 (which is basically UCS4) and ignoring the two
9894
// most significant bytes (MSBs).
9995
if (elementSize == 2L) {
10096
ByteBuffer bytes = ByteBuffer.wrap(s.getBytes(utf32Charset));
10197
// FIXME unsafe narrowing
102-
int size;
103-
if (elements >= 0) {
104-
size = Math.min(bytes.remaining() / 2, (int) (elements * elementSize));
105-
} else {
106-
size = bytes.remaining() / 2;
107-
}
98+
int size = bytes.remaining() / 2;
10899
ByteBuffer buf = ByteBuffer.allocate(size);
109100
while (bytes.remaining() >= 4) {
110-
if (byteOrder < NATIVE_ORDER) {
101+
if (byteOrder != ByteOrder.nativeOrder()) {
111102
buf.putChar((char) ((bytes.getInt() & 0xFFFF0000) >> 16));
112103
} else {
113104
buf.putChar((char) (bytes.getInt() & 0x0000FFFF));
@@ -123,36 +114,5 @@ static PBytes doUnicode(String s, long elementSize, long elements, int byteOrder
123114
throw new RuntimeException("unsupported wchar size; was: " + elementSize);
124115
}
125116
}
126-
127-
protected static Charset getUTF32Charset(int byteorder) {
128-
String utf32Name = getUTF32Name(byteorder);
129-
if (byteorder == NATIVE_ORDER) {
130-
if (UTF32 == null) {
131-
UTF32 = Charset.forName(utf32Name);
132-
}
133-
return UTF32;
134-
} else if (byteorder < NATIVE_ORDER) {
135-
if (UTF32LE == null) {
136-
UTF32LE = Charset.forName(utf32Name);
137-
}
138-
return UTF32LE;
139-
}
140-
if (UTF32BE == null) {
141-
UTF32BE = Charset.forName(utf32Name);
142-
}
143-
return UTF32BE;
144-
}
145-
146-
protected static String getUTF32Name(int byteorder) {
147-
String csName;
148-
if (byteorder == 0) {
149-
csName = "UTF-32";
150-
} else if (byteorder < 0) {
151-
csName = "UTF-32LE";
152-
} else {
153-
csName = "UTF-32BE";
154-
}
155-
return csName;
156-
}
157117
}
158118
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
*/
4141
package com.oracle.graal.python.util;
4242

43+
import java.nio.ByteOrder;
4344
import java.nio.charset.Charset;
4445
import java.nio.charset.StandardCharsets;
4546
import java.nio.charset.UnsupportedCharsetException;
@@ -71,6 +72,30 @@ public static Charset getCharset(String encoding) {
7172
return null;
7273
}
7374

75+
@TruffleBoundary
76+
public static Charset getCharsetForDecoding(String encoding, byte[] bytes, int len) {
77+
String normalized = normalize(encoding);
78+
if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN) {
79+
/*
80+
* JDK's charsets for UTF-16 and UTF-32 default to big endian irrespective of the
81+
* platform if there is no BOM. The UTF-16-LE and UTF-32-LE charsets reject big endian
82+
* BOM. CPython defaults to platform endian and accepts both BOMs. So, in order to get
83+
the behavior we need, we have to take a peek at the possible BOM and if it's a BE BOM,
84+
* we use BE encoding, otherwise LE encoding.
85+
*/
86+
if ("utf_16".equals(normalized) && len >= 2 && bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) {
87+
return StandardCharsets.UTF_16BE;
88+
} else if ("utf_32".equals(normalized) && len >= 4 && bytes[0] == 0 && bytes[1] == 0 && bytes[2] == (byte) 0xFE && bytes[3] == (byte) 0xFF) {
89+
return getJavaCharset("UTF-32BE");
90+
}
91+
}
92+
String name = CHARSET_NAME_MAP.get(normalized);
93+
if (name != null) {
94+
return getJavaCharset(name);
95+
}
96+
return null;
97+
}
98+
7499
@TruffleBoundary
75100
public static String getPythonEncodingNameFromJavaName(String javaEncodingName) {
76101
return CHARSET_NAME_MAP_REVERSE.get(javaEncodingName.toLowerCase());
@@ -138,7 +163,8 @@ private static void addAlias(String alias, String pythonName) {
138163
JAVA_CHARSETS.put("UTF-8", StandardCharsets.UTF_8);
139164
JAVA_CHARSETS.put("UTF-16BE", StandardCharsets.UTF_16BE);
140165
JAVA_CHARSETS.put("UTF-16LE", StandardCharsets.UTF_16LE);
141-
JAVA_CHARSETS.put("UTF-16", StandardCharsets.UTF_16);
166+
JAVA_CHARSETS.put("UTF-16", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? Charset.forName("UnicodeLittle") : StandardCharsets.UTF_16);
167+
JAVA_CHARSETS.put("UTF-32", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? Charset.forName("UTF-32LE-BOM") : Charset.forName("UTF-32BE-BOM"));
142168

143169
// Add our custom charsets
144170
addMapping("raw_unicode_escape", "x-python-raw-unicode-escape");

0 commit comments

Comments
 (0)