Skip to content

Commit 02d75f4

Browse files
committed
[GR-23262] Make test_string_literals pass
PullRequest: graalpython/1206
2 parents d7043e9 + ddabb36 commit 02d75f4

File tree

3 files changed

+55
-17
lines changed

3 files changed

+55
-17
lines changed

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_string_literals.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
*graalpython.lib-python.3.test.test_string_literals.TestLiterals.test_eval_bytes_incomplete
22
*graalpython.lib-python.3.test.test_string_literals.TestLiterals.test_eval_bytes_normal
33
*graalpython.lib-python.3.test.test_string_literals.TestLiterals.test_eval_bytes_raw
4+
*graalpython.lib-python.3.test.test_string_literals.TestLiterals.test_eval_str_incomplete
45
*graalpython.lib-python.3.test.test_string_literals.TestLiterals.test_eval_str_normal
56
*graalpython.lib-python.3.test.test_string_literals.TestLiterals.test_eval_str_raw
67
*graalpython.lib-python.3.test.test_string_literals.TestLiterals.test_eval_str_u

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import java.util.Arrays;
3434

3535
import com.oracle.graal.python.nodes.ErrorMessages;
36+
import static com.oracle.graal.python.parser.sst.StringUtils.warnInvalidEscapeSequence;
3637
import com.oracle.graal.python.runtime.PythonCore;
3738
import com.oracle.graal.python.runtime.PythonParser.ParserErrorCallback;
3839
import com.oracle.truffle.api.CompilerAsserts;
@@ -107,6 +108,7 @@ public static StringBuilder decodeEscapes(ParserErrorCallback errors, String str
107108
// TODO: for the moment we assume ASCII
108109
StringBuilder charList = new StringBuilder();
109110
int length = string.length();
111+
boolean wasDeprecationWarning = false;
110112
for (int i = 0; i < length; i++) {
111113
char chr = string.charAt(i);
112114
if (chr != '\\') {
@@ -215,6 +217,10 @@ public static StringBuilder decodeEscapes(ParserErrorCallback errors, String str
215217
} else {
216218
charList.append('\\');
217219
charList.append(chr);
220+
if (!wasDeprecationWarning) {
221+
wasDeprecationWarning = true;
222+
warnInvalidEscapeSequence(errors, chr);
223+
}
218224
}
219225
}
220226
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/sst/StringUtils.java

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import com.oracle.graal.python.nodes.literal.StringLiteralNode;
5050
import com.oracle.graal.python.nodes.statement.StatementNode;
5151
import com.oracle.graal.python.runtime.PythonParser.ParserErrorCallback;
52+
import com.oracle.graal.python.runtime.exception.PException;
5253
import com.oracle.truffle.api.CompilerDirectives;
5354

5455
public class StringUtils {
@@ -148,33 +149,24 @@ public static String unescapeJavaString(ParserErrorCallback errorCallback, Strin
148149
continue;
149150
// Hex Unicode: u????
150151
case 'u':
151-
if (i >= st.length() - 5) {
152-
ch = 'u';
153-
break;
154-
}
155-
int code = Integer.parseInt(
156-
"" + st.charAt(i + 2) + st.charAt(i + 3) + st.charAt(i + 4) + st.charAt(i + 5), 16);
152+
int code = getHexValue(st, i + 2, 4);
157153
sb.append(Character.toChars(code));
158154
i += 5;
159155
continue;
160156
// Hex Unicode: U????????
161157
case 'U':
162-
if (i >= st.length() - 9) {
163-
ch = 'U';
164-
break;
158+
code = getHexValue(st, i + 2, 8);
159+
if (Character.isValidCodePoint(code)) {
160+
sb.append(Character.toChars(code));
161+
} else {
162+
throw PythonLanguage.getCore().raise(PythonBuiltinClassType.UnicodeDecodeError, UNICODE_ERROR + ILLEGAl_CHARACTER, i, i + 9);
165163
}
166-
code = Integer.parseInt(st.substring(i + 2, i + 10), 16);
167-
sb.append(Character.toChars(code));
168164
i += 9;
169165
continue;
170166
// Hex Unicode: x??
171167
case 'x':
172-
if (i >= st.length() - 3) {
173-
ch = 'u';
174-
break;
175-
}
176-
int hexCode = Integer.parseInt("" + st.charAt(i + 2) + st.charAt(i + 3), 16);
177-
sb.append(Character.toChars(hexCode));
168+
code = getHexValue(st, i + 2, 2);
169+
sb.append(Character.toChars(code));
178170
i += 3;
179171
continue;
180172
case 'N':
@@ -204,7 +196,46 @@ public static void warnInvalidEscapeSequence(ParserErrorCallback errorCallback,
204196

205197
// Message fragments for UnicodeDecodeError reports raised while decoding string escapes.
// UNICODE_ERROR is the common prefix; it carries two %d placeholders (start and end position)
// and ends with a colon. The remaining constants are suffixes concatenated onto that prefix.
// Note that some suffixes begin with a space and others do not.
private static final String UNICODE_ERROR = "'unicodeescape' codec can't decode bytes in position %d-%d:";
206198
// Suffix for a malformed \N escape.
private static final String MALFORMED_ERROR = " malformed \\N character escape";
199+
// Suffix for a \x escape that does not have two hex digits.
private static final String TRUNCATED_XXX_ERROR = "truncated \\xXX escape";
200+
// Suffix for a \u escape that does not have four hex digits.
private static final String TRUNCATED_UXXXX_ERROR = "truncated \\uXXXX escape";
201+
// Suffix for a \U escape that does not have eight hex digits.
private static final String TRUNCATED_UXXXXXXXX_ERROR = "truncated \\UXXXXXXXX escape";
207202
// Suffix for a \N escape naming an unknown character.
private static final String UNKNOWN_UNICODE_ERROR = " unknown Unicode character name";
203+
// Suffix for a \U escape whose value is not a valid code point.
private static final String ILLEGAl_CHARACTER = "illegal Unicode character";
204+
205+
private static int getHexValue(String text, int start, int len) {
206+
int digit;
207+
int result = 0;
208+
for (int index = start; index < (start + len); index++) {
209+
if (index < text.length()) {
210+
digit = Character.digit(text.charAt(index), 16);
211+
if (digit == -1) {
212+
// Like cpython, raise error with the wrong character first,
213+
// even if there are not enough characters
214+
throw createTruncatedError(start - 2, index - 1, len);
215+
}
216+
result = result * 16 + digit;
217+
} else {
218+
throw createTruncatedError(start - 2, index - 1, len);
219+
}
220+
}
221+
return result;
222+
}
223+
224+
private static PException createTruncatedError(int startIndex, int endIndex, int len) {
225+
String truncatedMessage = null;
226+
switch (len) {
227+
case 2:
228+
truncatedMessage = TRUNCATED_XXX_ERROR;
229+
break;
230+
case 4:
231+
truncatedMessage = TRUNCATED_UXXXX_ERROR;
232+
break;
233+
case 8:
234+
truncatedMessage = TRUNCATED_UXXXXXXXX_ERROR;
235+
break;
236+
}
237+
return PythonLanguage.getCore().raise(PythonBuiltinClassType.UnicodeDecodeError, UNICODE_ERROR + truncatedMessage, startIndex, endIndex);
238+
}
208239

209240
/**
210241
* Replace '/N{Unicode Character Name}' with the code point of the character.

0 commit comments

Comments
 (0)