|
49 | 49 | import com.oracle.graal.python.nodes.literal.StringLiteralNode;
|
50 | 50 | import com.oracle.graal.python.nodes.statement.StatementNode;
|
51 | 51 | import com.oracle.graal.python.runtime.PythonParser.ParserErrorCallback;
|
| 52 | +import com.oracle.graal.python.runtime.exception.PException; |
52 | 53 | import com.oracle.truffle.api.CompilerDirectives;
|
53 | 54 |
|
54 | 55 | public class StringUtils {
|
@@ -148,33 +149,24 @@ public static String unescapeJavaString(ParserErrorCallback errorCallback, Strin
|
148 | 149 | continue;
|
149 | 150 | // Hex Unicode: u????
|
150 | 151 | case 'u':
|
151 |
| - if (i >= st.length() - 5) { |
152 |
| - ch = 'u'; |
153 |
| - break; |
154 |
| - } |
155 |
| - int code = Integer.parseInt( |
156 |
| - "" + st.charAt(i + 2) + st.charAt(i + 3) + st.charAt(i + 4) + st.charAt(i + 5), 16); |
| 152 | + int code = getHexValue(st, i + 2, 4); |
157 | 153 | sb.append(Character.toChars(code));
|
158 | 154 | i += 5;
|
159 | 155 | continue;
|
160 | 156 | // Hex Unicode: U????????
|
161 | 157 | case 'U':
|
162 |
| - if (i >= st.length() - 9) { |
163 |
| - ch = 'U'; |
164 |
| - break; |
| 158 | + code = getHexValue(st, i + 2, 8); |
| 159 | + if (Character.isValidCodePoint(code)) { |
| 160 | + sb.append(Character.toChars(code)); |
| 161 | + } else { |
| 162 | + throw PythonLanguage.getCore().raise(PythonBuiltinClassType.UnicodeDecodeError, UNICODE_ERROR + ILLEGAl_CHARACTER, i, i + 9); |
165 | 163 | }
|
166 |
| - code = Integer.parseInt(st.substring(i + 2, i + 10), 16); |
167 |
| - sb.append(Character.toChars(code)); |
168 | 164 | i += 9;
|
169 | 165 | continue;
|
170 | 166 | // Hex Unicode: x??
|
171 | 167 | case 'x':
|
172 |
| - if (i >= st.length() - 3) { |
173 |
| - ch = 'u'; |
174 |
| - break; |
175 |
| - } |
176 |
| - int hexCode = Integer.parseInt("" + st.charAt(i + 2) + st.charAt(i + 3), 16); |
177 |
| - sb.append(Character.toChars(hexCode)); |
| 168 | + code = getHexValue(st, i + 2, 2); |
| 169 | + sb.append(Character.toChars(code)); |
178 | 170 | i += 3;
|
179 | 171 | continue;
|
180 | 172 | case 'N':
|
@@ -204,7 +196,46 @@ public static void warnInvalidEscapeSequence(ParserErrorCallback errorCallback,
|
204 | 196 |
|
205 | 197 | private static final String UNICODE_ERROR = "'unicodeescape' codec can't decode bytes in position %d-%d:";
|
206 | 198 | private static final String MALFORMED_ERROR = " malformed \\N character escape";
|
| 199 | + private static final String TRUNCATED_XXX_ERROR = "truncated \\xXX escape"; |
| 200 | + private static final String TRUNCATED_UXXXX_ERROR = "truncated \\uXXXX escape"; |
| 201 | + private static final String TRUNCATED_UXXXXXXXX_ERROR = "truncated \\UXXXXXXXX escape"; |
207 | 202 | private static final String UNKNOWN_UNICODE_ERROR = " unknown Unicode character name";
|
| 203 | + private static final String ILLEGAl_CHARACTER = "illegal Unicode character"; |
| 204 | + |
| 205 | + private static int getHexValue(String text, int start, int len) { |
| 206 | + int digit; |
| 207 | + int result = 0; |
| 208 | + for (int index = start; index < (start + len); index++) { |
| 209 | + if (index < text.length()) { |
| 210 | + digit = Character.digit(text.charAt(index), 16); |
| 211 | + if (digit == -1) { |
| 212 | + // Like cpython, raise error with the wrong character first, |
| 213 | + // even if there are not enough characters |
| 214 | + throw createTruncatedError(start - 2, index - 1, len); |
| 215 | + } |
| 216 | + result = result * 16 + digit; |
| 217 | + } else { |
| 218 | + throw createTruncatedError(start - 2, index - 1, len); |
| 219 | + } |
| 220 | + } |
| 221 | + return result; |
| 222 | + } |
| 223 | + |
| 224 | + private static PException createTruncatedError(int startIndex, int endIndex, int len) { |
| 225 | + String truncatedMessage = null; |
| 226 | + switch (len) { |
| 227 | + case 2: |
| 228 | + truncatedMessage = TRUNCATED_XXX_ERROR; |
| 229 | + break; |
| 230 | + case 4: |
| 231 | + truncatedMessage = TRUNCATED_UXXXX_ERROR; |
| 232 | + break; |
| 233 | + case 8: |
| 234 | + truncatedMessage = TRUNCATED_UXXXXXXXX_ERROR; |
| 235 | + break; |
| 236 | + } |
| 237 | + return PythonLanguage.getCore().raise(PythonBuiltinClassType.UnicodeDecodeError, UNICODE_ERROR + truncatedMessage, startIndex, endIndex); |
| 238 | + } |
208 | 239 |
|
209 | 240 | /**
|
210 | 241 | * Replace '\N{Unicode Character Name}' with the code point of the character.
|
|
0 commit comments