Skip to content

Commit 16a6509

Browse files
committed
[GR-23268] Fixes for test_unicode, part 4
PullRequest: graalpython/1285
2 parents 637fbbd + 875e449 commit 16a6509

File tree

13 files changed

+1032
-292
lines changed

13 files changed

+1032
-292
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_codecs.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2019, Oracle and/or its affiliates.
1+
# Copyright (c) 2019, 2020, Oracle and/or its affiliates.
22
# Copyright (C) 1996-2017 Python Software Foundation
33
#
44
# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
@@ -151,8 +151,7 @@ def test_escape_encode(self):
151151
for b in range(127, 256):
152152
check(chr(b), ('\\x%02x' % b).encode())
153153
check('\u20ac', br'\u20ac')
154-
# TODO Truffle: not working yet
155-
# check('\U0001d120', br'\U0001d120')
154+
check('\U0001d120', br'\U0001d120')
156155

157156
def test_escape_decode(self):
158157
decode = codecs.unicode_escape_decode
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1+
*graalpython.lib-python.3.test.test_codeccallbacks.CodecCallbackTest.test_backslashescape
12
*graalpython.lib-python.3.test.test_codeccallbacks.CodecCallbackTest.test_badregistercall
23
*graalpython.lib-python.3.test.test_codeccallbacks.CodecCallbackTest.test_bug828737
34
*graalpython.lib-python.3.test.test_codeccallbacks.CodecCallbackTest.test_translatehelper
4-
*graalpython.lib-python.3.test.test_codeccallbacks.CodecCallbackTest.test_unencodablereplacement
55
*graalpython.lib-python.3.test.test_codeccallbacks.CodecCallbackTest.test_unknownhandler

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_unicode.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_codecs
2323
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_codecs_idna
2424
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_codecs_utf7
25+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_codecs_utf8
2526
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_compare
2627
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_comparison
2728
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_concatenation
@@ -108,6 +109,7 @@
108109
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_surrogates
109110
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_swapcase
110111
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_title
112+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_ucs4
111113
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_unexpected_end_of_data
112114
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_unicode_repr
113115
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_upper

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 300 additions & 271 deletions
Large diffs are not rendered by default.

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,18 +297,24 @@ public static int unicodeEscape(int codePoint, int startIndex, byte[] buffer) {
297297
buffer[i++] = 'r';
298298
} else {
299299
/* Map non-printable US ASCII and 8-bit characters to '\xHH' */
300-
buffer[i++] = '\\';
301-
buffer[i++] = 'x';
302-
buffer[i++] = hexdigits[(codePoint >> 4) & 0x000F];
303-
buffer[i++] = hexdigits[codePoint & 0x000F];
300+
byteEscape(codePoint, i, buffer);
301+
i += 4;
304302
}
305303
} else {
306304
i = unicodeNonAsciiEscape(codePoint, i, buffer);
307305
}
308306
return i;
309307
}
310308

311-
private static int unicodeNonAsciiEscape(int codePoint, int startIndex, byte[] buffer) {
309+
public static void byteEscape(int codePoint, int startIndex, byte[] buffer) {
310+
int i = startIndex;
311+
buffer[i++] = '\\';
312+
buffer[i++] = 'x';
313+
buffer[i++] = hexdigits[(codePoint >> 4) & 0x000F];
314+
buffer[i] = hexdigits[codePoint & 0x000F];
315+
}
316+
317+
public static int unicodeNonAsciiEscape(int codePoint, int startIndex, byte[] buffer) {
312318
int i = startIndex;
313319
if (codePoint < 0x100) {
314320
buffer[i++] = (byte) codePoint;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.charset;
42+
43+
import java.nio.charset.Charset;
44+
import java.nio.charset.CharsetDecoder;
45+
import java.nio.charset.CharsetEncoder;
46+
47+
public class PythonRawUnicodeEscapeCharset extends Charset {
48+
public PythonRawUnicodeEscapeCharset() {
49+
super("x-python-raw-unicode-escape", new String[0]);
50+
}
51+
52+
@Override
53+
public boolean contains(Charset charset) {
54+
return false;
55+
}
56+
57+
@Override
58+
public CharsetDecoder newDecoder() {
59+
return new PythonRawUnicodeEscapeCharsetDecoder(this);
60+
}
61+
62+
@Override
63+
public CharsetEncoder newEncoder() {
64+
return new PythonRawUnicodeEscapeCharsetEncoder(this);
65+
}
66+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
/*
2+
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.charset;
42+
43+
import java.nio.ByteBuffer;
44+
import java.nio.CharBuffer;
45+
import java.nio.charset.Charset;
46+
import java.nio.charset.CharsetDecoder;
47+
import java.nio.charset.CoderResult;
48+
49+
public class PythonRawUnicodeEscapeCharsetDecoder extends CharsetDecoder {
50+
private boolean seenBackslash = false;
51+
52+
protected PythonRawUnicodeEscapeCharsetDecoder(Charset cs) {
53+
super(cs, 1, 1);
54+
}
55+
56+
@Override
57+
protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target) {
58+
while (true) {
59+
if (!source.hasRemaining()) {
60+
return CoderResult.UNDERFLOW;
61+
}
62+
if (!target.hasRemaining()) {
63+
return CoderResult.OVERFLOW;
64+
}
65+
int initialPosition = source.position();
66+
byte b = source.get();
67+
if (seenBackslash) {
68+
// Report error from the backslash included
69+
initialPosition--;
70+
if (b == (byte) 'u' || b == (byte) 'U') {
71+
CoderResult result = PythonUnicodeEscapeCharsetDecoder.decodeHexUnicodeEscape(source, target, b, initialPosition);
72+
if (result != null) {
73+
return result;
74+
}
75+
seenBackslash = false;
76+
} else {
77+
target.put('\\');
78+
seenBackslash = false;
79+
}
80+
} else if (b == (byte) '\\') {
81+
seenBackslash = true;
82+
} else {
83+
// Bytes that are not an escape sequence are latin-1, which maps to unicode
84+
// codepoints directly
85+
target.put((char) (b & 0xFF));
86+
}
87+
}
88+
}
89+
90+
@Override
91+
protected CoderResult implFlush(CharBuffer target) {
92+
if (seenBackslash) {
93+
if (!target.hasRemaining()) {
94+
return CoderResult.OVERFLOW;
95+
}
96+
target.put('\\');
97+
seenBackslash = false;
98+
}
99+
return CoderResult.UNDERFLOW;
100+
}
101+
102+
@Override
103+
protected void implReset() {
104+
seenBackslash = false;
105+
}
106+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.charset;
42+
43+
import java.nio.ByteBuffer;
44+
import java.nio.CharBuffer;
45+
import java.nio.charset.Charset;
46+
import java.nio.charset.CharsetEncoder;
47+
import java.nio.charset.CoderResult;
48+
49+
/**
 * Encoder for the raw-unicode-escape charset.
 *
 * Code points up to 0xFF are written as a single byte with the same value. Other code points in
 * the basic multilingual plane are written as a backslash, a lowercase 'u' and four lowercase hex
 * digits; code points above 0xFFFF are written as a backslash, an uppercase 'U' and eight
 * lowercase hex digits.
 */
public class PythonRawUnicodeEscapeCharsetEncoder extends CharsetEncoder {
    /**
     * @param cs the charset this encoder belongs to; its decoder must accept {@code '?'}, which
     *            is used as the replacement byte
     */
    protected PythonRawUnicodeEscapeCharsetEncoder(Charset cs) {
        super(cs, 2, 10, new byte[]{(byte) '?'});
    }

    /**
     * Encodes as many characters as possible from {@code source} into {@code target}. A
     * character is either written completely or not consumed at all.
     *
     * @param source input characters
     * @param target output bytes
     * @return {@link CoderResult#UNDERFLOW} when the input is exhausted or ends with a high
     *         surrogate whose partner has not arrived yet, {@link CoderResult#OVERFLOW} when the
     *         next character does not fit, or a malformed-input result of length 1 positioned at
     *         a high surrogate that is not followed by a low surrogate
     */
    @Override
    protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target) {
        while (source.hasRemaining()) {
            if (!target.hasRemaining()) {
                return CoderResult.OVERFLOW;
            }
            int initialPosition = source.position();
            char ch = source.get();
            int codePoint = ch;
            if (Character.isHighSurrogate(ch)) {
                if (!source.hasRemaining()) {
                    // The low surrogate may still arrive with the next chunk of input.
                    source.position(initialPosition);
                    return CoderResult.UNDERFLOW;
                }
                char low = source.get();
                if (!Character.isLowSurrogate(low)) {
                    // Only the unpaired high surrogate is malformed. Reporting length 1 keeps
                    // the following character available to REPLACE/IGNORE error actions instead
                    // of discarding it together with the surrogate.
                    source.position(initialPosition);
                    return CoderResult.malformedForLength(1);
                }
                codePoint = Character.toCodePoint(ch, low);
            }
            if (codePoint <= 0xFF) {
                // Latin-1 range: the byte value equals the code point.
                target.put((byte) codePoint);
            } else {
                boolean inBmp = codePoint <= 0xFFFF;
                int digits = inBmp ? 4 : 8;
                // Check the space for the whole escape up front so nothing has to be rolled
                // back on the output side.
                if (target.remaining() < digits + 2) {
                    source.position(initialPosition);
                    return CoderResult.OVERFLOW;
                }
                target.put((byte) '\\');
                target.put((byte) (inBmp ? 'u' : 'U'));
                for (int shift = (digits - 1) * 4; shift >= 0; shift -= 4) {
                    // Character.forDigit yields lowercase letters for the values 10-15.
                    target.put((byte) Character.forDigit((codePoint >> shift) & 0xF, 16));
                }
            }
        }
        return CoderResult.UNDERFLOW;
    }
}

0 commit comments

Comments
 (0)