Skip to content

Commit fa83811

Browse files
committed
[GR-12796] Add tests and more complete implementation of unicode-escape encode/decode
PullRequest: graalpython/320
2 parents 8dfe17d + 29f8f64 commit fa83811

File tree

6 files changed

+173
-13
lines changed

6 files changed

+173
-13
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_codecs.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
55

66

7+
def coding_checker(self, coder):
    """Build a checker for an encode/decode function.

    The returned callable takes ``(input, expect)`` and asserts, through
    ``self.assertEqual``, that ``coder(input)`` equals the tuple
    ``(expect, len(input))``.
    """
    def check(input, expect):
        # Run the coder first, then build the expected (result, consumed) pair.
        actual = coder(input)
        wanted = (expect, len(input))
        self.assertEqual(actual, wanted)

    return check
11+
12+
713
def assert_raises(err, fn, *args, **kwargs):
814
raised = False
915
try:
@@ -113,3 +119,56 @@ def test_constructorx(self):
113119

114120
def test_encodex(self):
115121
self.assertEqual(codecs.encode('abc', codecname), list('abc'))
122+
123+
124+
class UnicodeEscapeTest(unittest.TestCase):
125+
def test_empty(self):
126+
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
127+
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
128+
129+
def test_raw_encode(self):
130+
encode = codecs.unicode_escape_encode
131+
for b in range(32, 127):
132+
if b != b'\\'[0]:
133+
self.assertEqual(encode(chr(b)), (bytes([b]), 1))
134+
135+
def test_escape_encode(self):
136+
encode = codecs.unicode_escape_encode
137+
check = coding_checker(self, encode)
138+
check('\t', br'\t')
139+
check('\n', br'\n')
140+
check('\r', br'\r')
141+
check('\\', br'\\')
142+
for b in range(32):
143+
if chr(b) not in '\t\n\r':
144+
check(chr(b), ('\\x%02x' % b).encode())
145+
for b in range(127, 256):
146+
check(chr(b), ('\\x%02x' % b).encode())
147+
check('\u20ac', br'\u20ac')
148+
# TODO Truffle: not working yet
149+
# check('\U0001d120', br'\U0001d120')
150+
151+
def test_escape_decode(self):
152+
decode = codecs.unicode_escape_decode
153+
check = coding_checker(self, decode)
154+
check(b"[\\\n]", "[]")
155+
check(br'[\"]', '["]')
156+
check(br"[\']", "[']")
157+
check(br"[\\]", r"[\]")
158+
check(br"[\a]", "[\x07]")
159+
check(br"[\b]", "[\x08]")
160+
check(br"[\t]", "[\x09]")
161+
check(br"[\n]", "[\x0a]")
162+
check(br"[\v]", "[\x0b]")
163+
check(br"[\f]", "[\x0c]")
164+
check(br"[\r]", "[\x0d]")
165+
check(br"[\7]", "[\x07]")
166+
check(br"[\78]", "[\x078]")
167+
check(br"[\41]", "[!]")
168+
check(br"[\418]", "[!8]")
169+
check(br"[\101]", "[A]")
170+
check(br"[\1010]", "[A0]")
171+
check(br"[\x41]", "[A]")
172+
check(br"[\x410]", "[A0]")
173+
check(br"\u20ac", "\u20ac")
174+
check(br"\U0001d120", "\U0001d120")

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import java.nio.charset.CharacterCodingException;
5151
import java.nio.charset.Charset;
5252
import java.nio.charset.CodingErrorAction;
53+
import java.util.Arrays;
5354
import java.util.HashMap;
5455
import java.util.List;
5556
import java.util.Map;
@@ -58,17 +59,22 @@
5859
import com.oracle.graal.python.builtins.CoreFunctions;
5960
import com.oracle.graal.python.builtins.PythonBuiltins;
6061
import com.oracle.graal.python.builtins.objects.PNone;
62+
import com.oracle.graal.python.builtins.objects.bytes.BytesNodes;
6163
import com.oracle.graal.python.builtins.objects.bytes.PBytes;
6264
import com.oracle.graal.python.builtins.objects.bytes.PIBytesLike;
6365
import com.oracle.graal.python.builtins.objects.common.SequenceStorageNodes;
6466
import com.oracle.graal.python.builtins.objects.tuple.PTuple;
6567
import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode;
6668
import com.oracle.graal.python.nodes.function.PythonBuiltinNode;
69+
import com.oracle.graal.python.nodes.function.builtins.PythonBinaryBuiltinNode;
70+
import com.oracle.graal.python.nodes.truffle.PythonArithmeticTypes;
71+
import com.oracle.graal.python.runtime.PythonCore;
6772
import com.oracle.truffle.api.CompilerDirectives;
6873
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
6974
import com.oracle.truffle.api.dsl.Cached;
7075
import com.oracle.truffle.api.dsl.Fallback;
7176
import com.oracle.truffle.api.dsl.GenerateNodeFactory;
77+
import com.oracle.truffle.api.dsl.ImportStatic;
7278
import com.oracle.truffle.api.dsl.NodeFactory;
7379
import com.oracle.truffle.api.dsl.Specialization;
7480
import com.oracle.truffle.api.profiles.ValueProfile;
@@ -251,6 +257,102 @@ protected static CodingErrorAction convertCodingErrorAction(String errors) {
251257
}
252258
}
253259

260+
@Builtin(name = "unicode_escape_encode", fixedNumOfPositionalArgs = 1, keywordArguments = {"errors"})
261+
@GenerateNodeFactory
262+
@ImportStatic(PythonArithmeticTypes.class)
263+
abstract static class UnicodeEscapeEncode extends PythonBinaryBuiltinNode {
264+
static final byte[] hexdigits = "0123456789abcdef".getBytes();
265+
266+
@Specialization
267+
@TruffleBoundary
268+
Object encode(String str, @SuppressWarnings("unused") Object errors) {
269+
// Initial allocation of bytes for UCS4 strings needs 10 bytes per source character
270+
// ('\U00xxxxxx')
271+
byte[] bytes = new byte[str.length() * 10];
272+
int j = 0;
273+
for (int i = 0; i < str.length(); i++) {
274+
int ch = str.codePointAt(i);
275+
/* U+0000-U+00ff range */
276+
if (ch < 0x100) {
277+
if (ch >= ' ' && ch < 127) {
278+
if (ch != '\\') {
279+
/* Copy printable US ASCII as-is */
280+
bytes[j++] = (byte) ch;
281+
} else {
282+
/* Escape backslashes */
283+
bytes[j++] = '\\';
284+
bytes[j++] = '\\';
285+
}
286+
} else if (ch == '\t') {
287+
/* Map special whitespace to '\t', \n', '\r' */
288+
bytes[j++] = '\\';
289+
bytes[j++] = 't';
290+
} else if (ch == '\n') {
291+
bytes[j++] = '\\';
292+
bytes[j++] = 'n';
293+
} else if (ch == '\r') {
294+
bytes[j++] = '\\';
295+
bytes[j++] = 'r';
296+
} else {
297+
/* Map non-printable US ASCII and 8-bit characters to '\xHH' */
298+
bytes[j++] = '\\';
299+
bytes[j++] = 'x';
300+
bytes[j++] = hexdigits[(ch >> 4) & 0x000F];
301+
bytes[j++] = hexdigits[ch & 0x000F];
302+
}
303+
} else if (ch < 0x10000) {
304+
/* U+0100-U+ffff range: Map 16-bit characters to '\\uHHHH' */
305+
bytes[j++] = '\\';
306+
bytes[j++] = 'u';
307+
bytes[j++] = hexdigits[(ch >> 12) & 0x000F];
308+
bytes[j++] = hexdigits[(ch >> 8) & 0x000F];
309+
bytes[j++] = hexdigits[(ch >> 4) & 0x000F];
310+
bytes[j++] = hexdigits[ch & 0x000F];
311+
} else {
312+
/* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
313+
/* Make sure that the first two digits are zero */
314+
bytes[j++] = '\\';
315+
bytes[j++] = 'U';
316+
bytes[j++] = '0';
317+
bytes[j++] = '0';
318+
bytes[j++] = hexdigits[(ch >> 20) & 0x0000000F];
319+
bytes[j++] = hexdigits[(ch >> 16) & 0x0000000F];
320+
bytes[j++] = hexdigits[(ch >> 12) & 0x0000000F];
321+
bytes[j++] = hexdigits[(ch >> 8) & 0x0000000F];
322+
bytes[j++] = hexdigits[(ch >> 4) & 0x0000000F];
323+
bytes[j++] = hexdigits[ch & 0x0000000F];
324+
}
325+
}
326+
bytes = Arrays.copyOf(bytes, j);
327+
return factory().createTuple(new Object[]{factory().createBytes(bytes), str.length()});
328+
}
329+
330+
@Fallback
331+
Object encode(Object str, @SuppressWarnings("unused") Object errors) {
332+
throw raise(TypeError, "unicode_escape_encode() argument 1 must be str, not %p", str);
333+
}
334+
}
335+
336+
@Builtin(name = "unicode_escape_decode", fixedNumOfPositionalArgs = 1, keywordArguments = {"errors"})
337+
@GenerateNodeFactory
338+
abstract static class UnicodeEscapeDecode extends PythonBinaryBuiltinNode {
339+
@Specialization(guards = "isBytes(bytes)")
340+
Object encode(Object bytes, @SuppressWarnings("unused") PNone errors,
341+
@Cached("create()") BytesNodes.ToBytesNode toBytes) {
342+
// for now we'll just parse this as a String, ignoring any error strategies
343+
PythonCore core = getCore();
344+
byte[] byteArray = toBytes.execute(bytes);
345+
String string = strFromBytes(byteArray);
346+
String unescapedString = core.getParser().unescapeJavaString(string);
347+
return factory().createTuple(new Object[]{unescapedString, byteArray.length});
348+
}
349+
350+
@TruffleBoundary
351+
private static String strFromBytes(byte[] execute) {
352+
return new String(execute);
353+
}
354+
}
355+
254356
// _codecs.encode(obj, encoding='utf-8', errors='strict')
255357
@Builtin(name = "__truffle_encode", fixedNumOfPositionalArgs = 1, keywordArguments = {"encoding", "errors"})
256358
@GenerateNodeFactory

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/PythonParserImpl.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ public boolean isIdentifier(PythonCore core, String snippet) {
114114
return input.NAME() != null;
115115
}
116116

117+
@Override
118+
@TruffleBoundary
119+
public String unescapeJavaString(String str) {
120+
return PythonTreeTranslator.unescapeJavaString(str);
121+
}
122+
117123
private static PException handleParserError(ParserErrorCallback errors, Source source, Exception e) {
118124
SourceSection section = PythonErrorStrategy.getPosition(source, e);
119125
throw errors.raiseInvalidSyntax(source, section);

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/PythonTreeTranslator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ private PNode parseString(String[] strings) {
658658
}
659659
}
660660

661-
private static String unescapeJavaString(String st) {
661+
public static String unescapeJavaString(String st) {
662662
if (st.indexOf("\\") == -1) {
663663
return st;
664664
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PythonParser.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ default RuntimeException raiseInvalidSyntax(Source source, SourceSection section
9393
*/
9494
boolean isIdentifier(PythonCore core, String snippet);
9595

96+
/**
97+
* Unescape Python escapes from a Java string
98+
*/
99+
public abstract String unescapeJavaString(String str);
100+
96101
/**
97102
* Runtime exception used to indicate incomplete source code during parsing.
98103
*/

graalpython/lib-graalpython/_codecs.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -245,18 +245,6 @@ def utf_32_ex_decode(data, errors=None, byteorder=0, final=False):
245245
raise NotImplementedError("utf_32_ex_decode")
246246

247247

248-
@__builtin__
249-
def unicode_escape_encode(string, errors=None):
250-
if not isinstance(string, str):
251-
raise TypeError("unicode_escape_encode() argument 1 must be str, not %s", type(string))
252-
return __truffle_encode(repr(string)[1:-1], "latin-1", errors)
253-
254-
255-
@__builtin__
256-
def unicode_escape_decode(string, errors=None):
257-
raise NotImplementedError("unicode_escape_decode")
258-
259-
260248
@__builtin__
261249
def unicode_internal_encode(obj, errors=None):
262250
raise NotImplementedError("unicode_internal_encode")

0 commit comments

Comments
 (0)