Skip to content

Commit 21c4324

Browse files
committed
decode bytes to string through the parser
1 parent dfe6086 commit 21c4324

File tree

6 files changed

+122
-6
lines changed

6 files changed

+122
-6
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_codecs.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
# Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
55

66

7+
def coding_checker(self, coder):
    """Return a ``check(input, expect)`` helper bound to ``coder``.

    The helper calls ``coder(input)`` and asserts, through
    ``self.assertEqual``, that the result is the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))

    return check
11+
12+
713
def assert_raises(err, fn, *args, **kwargs):
814
raised = False
915
try:
@@ -113,3 +119,85 @@ def test_constructorx(self):
113119

114120
def test_encodex(self):
    # Encoding 'abc' with the codec named by `codecname` must produce the
    # list of its characters.
    encoded = codecs.encode('abc', codecname)
    self.assertEqual(encoded, list('abc'))
122+
123+
124+
class UnicodeEscapeTest(unittest.TestCase):
125+
def test_empty(self):
126+
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
127+
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
128+
129+
def test_raw_encode(self):
130+
encode = codecs.unicode_escape_encode
131+
for b in range(32, 127):
132+
if b != b'\\'[0]:
133+
self.assertEqual(encode(chr(b)), (bytes([b]), 1))
134+
135+
def test_escape_encode(self):
136+
encode = codecs.unicode_escape_encode
137+
check = coding_checker(self, encode)
138+
check('\t', br'\t')
139+
check('\n', br'\n')
140+
check('\r', br'\r')
141+
check('\\', br'\\')
142+
for b in range(32):
143+
if chr(b) not in '\t\n\r':
144+
check(chr(b), ('\\x%02x' % b).encode())
145+
for b in range(127, 256):
146+
check(chr(b), ('\\x%02x' % b).encode())
147+
check('\u20ac', br'\u20ac')
148+
check('\U0001d120', br'\U0001d120')
149+
150+
def test_escape_decode(self):
151+
decode = codecs.unicode_escape_decode
152+
check = coding_checker(self, decode)
153+
check(b"[\\\n]", "[]")
154+
check(br'[\"]', '["]')
155+
check(br"[\']", "[']")
156+
check(br"[\\]", r"[\]")
157+
check(br"[\a]", "[\x07]")
158+
check(br"[\b]", "[\x08]")
159+
check(br"[\t]", "[\x09]")
160+
check(br"[\n]", "[\x0a]")
161+
check(br"[\v]", "[\x0b]")
162+
check(br"[\f]", "[\x0c]")
163+
check(br"[\r]", "[\x0d]")
164+
check(br"[\7]", "[\x07]")
165+
check(br"[\78]", "[\x078]")
166+
check(br"[\41]", "[!]")
167+
check(br"[\418]", "[!8]")
168+
check(br"[\101]", "[A]")
169+
check(br"[\1010]", "[A0]")
170+
check(br"[\x41]", "[A]")
171+
check(br"[\x410]", "[A0]")
172+
check(br"\u20ac", "\u20ac")
173+
check(br"\U0001d120", "\U0001d120")
174+
for i in range(97, 123):
175+
b = bytes([i])
176+
if b not in b'abfnrtuvx':
177+
with self.assertWarns(DeprecationWarning):
178+
check(b"\\" + b, "\\" + chr(i))
179+
if b.upper() not in b'UN':
180+
with self.assertWarns(DeprecationWarning):
181+
check(b"\\" + b.upper(), "\\" + chr(i-32))
182+
with self.assertWarns(DeprecationWarning):
183+
check(br"\8", "\\8")
184+
with self.assertWarns(DeprecationWarning):
185+
check(br"\9", "\\9")
186+
with self.assertWarns(DeprecationWarning):
187+
check(b"\\\xfa", "\\\xfa")
188+
189+
def test_decode_errors(self):
190+
decode = codecs.unicode_escape_decode
191+
for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
192+
for i in range(d):
193+
self.assertRaises(UnicodeDecodeError, decode,
194+
b"\\" + c + b"0"*i)
195+
self.assertRaises(UnicodeDecodeError, decode,
196+
b"[\\" + c + b"0"*i + b"]")
197+
data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
198+
self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
199+
self.assertEqual(decode(data, "replace"),
200+
("[\ufffd]\ufffd", len(data)))
201+
self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
202+
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
203+
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,14 @@
5858
import com.oracle.graal.python.builtins.CoreFunctions;
5959
import com.oracle.graal.python.builtins.PythonBuiltins;
6060
import com.oracle.graal.python.builtins.objects.PNone;
61+
import com.oracle.graal.python.builtins.objects.bytes.BytesNodes;
6162
import com.oracle.graal.python.builtins.objects.bytes.PBytes;
6263
import com.oracle.graal.python.builtins.objects.bytes.PIBytesLike;
6364
import com.oracle.graal.python.builtins.objects.common.SequenceStorageNodes;
6465
import com.oracle.graal.python.builtins.objects.tuple.PTuple;
6566
import com.oracle.graal.python.nodes.function.PythonBuiltinBaseNode;
6667
import com.oracle.graal.python.nodes.function.PythonBuiltinNode;
68+
import com.oracle.graal.python.runtime.PythonCore;
6769
import com.oracle.truffle.api.CompilerDirectives;
6870
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
6971
import com.oracle.truffle.api.dsl.Cached;
@@ -251,6 +253,26 @@ protected static CodingErrorAction convertCodingErrorAction(String errors) {
251253
}
252254
}
253255

256+
@Builtin(name = "unicode_escape_decode", fixedNumOfPositionalArgs = 1, keywordArguments = {"errors"})
257+
@GenerateNodeFactory
258+
abstract static class UnicodeEscapeDecode extends PythonBuiltinNode {
259+
@Specialization(guards = "isBytes(bytes)")
260+
Object encode(Object bytes, @SuppressWarnings("unused") PNone errors,
261+
@Cached("create()") BytesNodes.ToBytesNode toBytes) {
262+
// this is basically just parsing as a String
263+
PythonCore core = getCore();
264+
byte[] byteArray = toBytes.execute(bytes);
265+
String string = strFromBytes(byteArray);
266+
String unescapedString = core.getParser().unescapeJavaString(string);
267+
return factory().createTuple(new Object[]{unescapedString, byteArray.length});
268+
}
269+
270+
@TruffleBoundary
271+
private static String strFromBytes(byte[] execute) {
272+
return new String(execute);
273+
}
274+
}
275+
254276
// _codecs.encode(obj, encoding='utf-8', errors='strict')
255277
@Builtin(name = "__truffle_encode", fixedNumOfPositionalArgs = 1, keywordArguments = {"encoding", "errors"})
256278
@GenerateNodeFactory

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/PythonParserImpl.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ public boolean isIdentifier(PythonCore core, String snippet) {
114114
return input.NAME() != null;
115115
}
116116

117+
/**
 * Unescapes {@code str} by delegating to
 * {@link PythonTreeTranslator#unescapeJavaString(String)}.
 */
@Override
118+
@TruffleBoundary
119+
public String unescapeJavaString(String str) {
120+
return PythonTreeTranslator.unescapeJavaString(str);
121+
}
122+
117123
private static PException handleParserError(ParserErrorCallback errors, Source source, Exception e) {
118124
SourceSection section = PythonErrorStrategy.getPosition(source, e);
119125
throw errors.raiseInvalidSyntax(source, section);

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/PythonTreeTranslator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ private PNode parseString(String[] strings) {
658658
}
659659
}
660660

661-
private static String unescapeJavaString(String st) {
661+
public static String unescapeJavaString(String st) {
662662
if (st.indexOf("\\") == -1) {
663663
return st;
664664
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/PythonParser.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ default RuntimeException raiseInvalidSyntax(Source source, SourceSection section
9393
*/
9494
boolean isIdentifier(PythonCore core, String snippet);
9595

96+
/**
97+
* Replaces the Python escape sequences in {@code str} with the characters they denote and returns the result.
98+
*/
99+
public abstract String unescapeJavaString(String str);
100+
96101
/**
97102
* Runtime exception used to indicate incomplete source code during parsing.
98103
*/

graalpython/lib-graalpython/_codecs.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -252,11 +252,6 @@ def unicode_escape_encode(string, errors=None):
252252
return __truffle_encode(repr(string)[1:-1], "latin-1", errors)
253253

254254

255-
@__builtin__
256-
def unicode_escape_decode(string, errors=None):
257-
raise NotImplementedError("unicode_escape_decode")
258-
259-
260255
@__builtin__
261256
def unicode_internal_encode(obj, errors=None):
262257
raise NotImplementedError("unicode_internal_encode")

0 commit comments

Comments
 (0)