Skip to content

Commit 7261459

Browse files
committed
Reject surrogates when decoding UTF32
1 parent df6c724 commit 7261459

File tree

3 files changed

+138
-8
lines changed

3 files changed

+138
-8
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_codecs.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -875,5 +875,11 @@ def test_encode(self):
875875
self.assertRaises(UnicodeEncodeError, codec.encode, '\xffff')
876876

877877

878+
class UTF32Test(unittest.TestCase):
879+
def test_utf32_surrogate_error(self):
880+
with self.assertRaisesRegex(UnicodeDecodeError, "'utf_32' codec can't decode bytes in position 4-7"):
881+
b'a\x00\x00\x00\x00\xd8\x00\x00z\x00\x00\x00'.decode('utf-32')
882+
883+
878884
if __name__ == '__main__':
879885
unittest.main()
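graalpython/com.oracle.graal.python/src/com/oracle/graal/python/charset/PythonUTF32CharsetWrapper.java

Lines changed: 108 additions & 0 deletions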
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.charset;
42+
43+
import java.nio.ByteBuffer;
44+
import java.nio.ByteOrder;
45+
import java.nio.CharBuffer;
46+
import java.nio.charset.Charset;
47+
import java.nio.charset.CharsetDecoder;
48+
import java.nio.charset.CharsetEncoder;
49+
import java.nio.charset.CoderResult;
50+
51+
import com.oracle.graal.python.util.PythonUtils;
52+
53+
public class PythonUTF32CharsetWrapper extends Charset {
54+
private final ByteOrder byteOrder;
55+
private final Charset delegate;
56+
57+
public PythonUTF32CharsetWrapper(Charset delegate, ByteOrder byteOrder) {
58+
super("x-python-UTF32" + (byteOrder == ByteOrder.BIG_ENDIAN ? "BE" : "LE"), PythonUtils.EMPTY_STRING_ARRAY);
59+
this.byteOrder = byteOrder;
60+
this.delegate = delegate;
61+
}
62+
63+
@Override
64+
public boolean contains(Charset cs) {
65+
return delegate.contains(cs);
66+
}
67+
68+
@Override
69+
public CharsetDecoder newDecoder() {
70+
return new DecoderWrapper(this, delegate.newDecoder(), byteOrder);
71+
}
72+
73+
@Override
74+
public CharsetEncoder newEncoder() {
75+
return delegate.newEncoder();
76+
}
77+
78+
private static class DecoderWrapper extends CharsetDecoder {
79+
private final CharsetDecoder delegate;
80+
private final ByteOrder byteOrder;
81+
82+
private DecoderWrapper(Charset charset, CharsetDecoder delegate, ByteOrder byteOrder) {
83+
super(charset, 4, 4);
84+
this.delegate = delegate;
85+
this.byteOrder = byteOrder;
86+
}
87+
88+
@Override
89+
protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
90+
ByteOrder originalByteOrder = in.order();
91+
int originalPosition = in.position();
92+
in.order(byteOrder);
93+
try {
94+
while (in.remaining() >= 4) {
95+
int cp = in.getInt();
96+
if (0xD800 <= cp && cp <= 0xDFFF) {
97+
in.position(in.position() - 4);
98+
return CoderResult.malformedForLength(4);
99+
}
100+
}
101+
} finally {
102+
in.order(originalByteOrder);
103+
}
104+
in.position(originalPosition);
105+
return delegate.decode(in, out, false);
106+
}
107+
}
108+
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -54,7 +54,9 @@
5454
import java.util.concurrent.ConcurrentMap;
5555

5656
import org.graalvm.shadowed.com.ibm.icu.charset.CharsetICU;
57+
5758
import com.oracle.graal.python.charset.PythonRawUnicodeEscapeCharset;
59+
import com.oracle.graal.python.charset.PythonUTF32CharsetWrapper;
5860
import com.oracle.graal.python.charset.PythonUnicodeEscapeCharset;
5961
import com.oracle.graal.python.util.CharsetMappingFactory.NormalizeEncodingNameNodeGen;
6062
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
@@ -72,7 +74,10 @@
7274
* Utility class for mapping Python encodings to Java charsets
7375
*/
7476
public class CharsetMapping {
75-
private static final Charset UTF_32 = Charset.forName("UTF_32");
77+
private static final Charset UTF_32LE = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE"), ByteOrder.LITTLE_ENDIAN);
78+
private static final Charset UTF_32LE_BOM = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE-BOM"), ByteOrder.LITTLE_ENDIAN);
79+
private static final Charset UTF_32BE = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE"), ByteOrder.BIG_ENDIAN);
80+
private static final Charset UTF_32BE_BOM = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE-BOM"), ByteOrder.BIG_ENDIAN);
7681
private static final ConcurrentMap<String, Charset> JAVA_CHARSETS = new ConcurrentHashMap<>();
7782
// Name maps are populated by static initializer and are immutable afterwards
7883
private static final Map<TruffleString, String> CHARSET_NAME_MAP = new HashMap<>();
@@ -101,8 +106,11 @@ public static Charset getCharsetForDecodingNormalized(TruffleString normalizedEn
101106
*/
102107
if (T_UTF_16_UNDERSCORE.equalsUncached(normalizedEncoding, TS_ENCODING) && hasUTF16BOM(bytes, len)) {
103108
return StandardCharsets.UTF_16;
104-
} else if (T_UTF_32_UNDERSCORE.equalsUncached(normalizedEncoding, TS_ENCODING) && hasUTF32BOM(bytes, len)) {
105-
return UTF_32;
109+
} else if (T_UTF_32_UNDERSCORE.equalsUncached(normalizedEncoding, TS_ENCODING)) {
110+
Charset charset = getUTF32CharsetForBOM(bytes, len);
111+
if (charset != null) {
112+
return charset;
113+
}
106114
}
107115
}
108116
String name = CHARSET_NAME_MAP.get(normalizedEncoding);
@@ -120,12 +128,18 @@ private static boolean hasUTF16BOM(byte[] bytes, int len) {
120128
return head == (short) 0xFFFE || head == (short) 0xFEFF;
121129
}
122130

123-
private static boolean hasUTF32BOM(byte[] bytes, int len) {
131+
private static Charset getUTF32CharsetForBOM(byte[] bytes, int len) {
124132
if (len < 4) {
125-
return false;
133+
return null;
126134
}
127135
int head = PythonUtils.ARRAY_ACCESSOR.getInt(bytes, 0);
128-
return head == 0xFFFE0000 || head == 0x0000FEFF;
136+
if (head == 0xFFFE0000) {
137+
return UTF_32BE_BOM;
138+
}
139+
if (head == 0x0000FEFF) {
140+
return UTF_32LE_BOM;
141+
}
142+
return null;
129143
}
130144

131145
@TruffleBoundary
@@ -214,7 +228,9 @@ private static void addAlias(String alias, String pythonName) {
214228
JAVA_CHARSETS.put("UTF-16BE", StandardCharsets.UTF_16BE);
215229
JAVA_CHARSETS.put("UTF-16LE", StandardCharsets.UTF_16LE);
216230
JAVA_CHARSETS.put("UTF-16", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? Charset.forName("UnicodeLittle") : StandardCharsets.UTF_16);
217-
JAVA_CHARSETS.put("UTF-32", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? Charset.forName("UTF-32LE-BOM") : Charset.forName("UTF-32BE-BOM"));
231+
JAVA_CHARSETS.put("UTF-32BE", UTF_32BE);
232+
JAVA_CHARSETS.put("UTF-32LE", UTF_32LE);
233+
JAVA_CHARSETS.put("UTF-32", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? UTF_32LE_BOM : UTF_32BE_BOM);
218234

219235
// Add our custom charsets
220236
addMapping("raw_unicode_escape", "x-python-raw-unicode-escape");

0 commit comments

Comments
 (0)