Skip to content

Commit afafe5a

Browse files
committed
[GR-18284] Escaping letters in string literals using name in Unicode database.
PullRequest: graalpython/1012
2 parents 99e9963 + c659623 commit afafe5a

File tree

10 files changed

+657
-10
lines changed

10 files changed

+657
-10
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
This changelog summarizes major changes between GraalVM versions of the Python
44
language runtime. The main focus is on user-observable behavior of the engine.
55

6+
## Version 20.2
7+
8+
* Support escaping Unicode characters by their character names in string literals, e.g. "\N{GREEK CAPITAL LETTER DELTA}".
9+
610
## Version 20.1.0
711

812
* Update language support target and standard library to 3.8.2

THIRD_PARTY_LICENSE.txt

Lines changed: 420 additions & 1 deletion
Large diffs are not rendered by default.
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
2+
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* The Universal Permissive License (UPL), Version 1.0
6+
*
7+
* Subject to the condition set forth below, permission is hereby granted to any
8+
* person obtaining a copy of this software, associated documentation and/or
9+
* data (collectively the "Software"), free of charge and under any and all
10+
* copyright rights in the Software, and any and all patent rights owned or
11+
* freely licensable by each licensor hereunder covering either (i) the
12+
* unmodified Software as contributed to or provided by such licensor, or (ii)
13+
* the Larger Works (as defined below), to deal in both
14+
*
15+
* (a) the Software, and
16+
*
17+
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
18+
* one is included with the Software each a "Larger Work" to which the Software
19+
* is contributed by such licensors),
20+
*
21+
* without restriction, including without limitation the rights to copy, create
22+
* derivative works of, display, perform, and distribute the Software and make,
23+
* use, sell, offer for sale, import, export, have made, and have sold the
24+
* Software and the Larger Work(s), and to sublicense the foregoing rights on
25+
* either these or other terms.
26+
*
27+
* This license is subject to the following condition:
28+
*
29+
* The above copyright notice and either this complete permission notice or at a
30+
* minimum a reference to the UPL must be included in all copies or substantial
31+
* portions of the Software.
32+
*
33+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
36+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
37+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
38+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
39+
* SOFTWARE.
40+
*/
41+
package com.oracle.graal.python.test.parser;
42+
43+
import com.oracle.graal.python.parser.sst.StringUtils;
44+
import org.junit.Assert;
45+
import org.junit.Test;
46+
47+
public class StringUtilsTests extends ParserTestBase {
48+
@Test
49+
public void unicodeCharNameBasic() throws Exception {
50+
Assert.assertEquals("Δ", StringUtils.unescapeJavaString("\\N{GREEK CAPITAL LETTER DELTA}"));
51+
Assert.assertEquals("A", StringUtils.unescapeJavaString("\\N{LATIN CAPITAL LETTER A}"));
52+
Assert.assertEquals("A", StringUtils.unescapeJavaString("\\N{LATIN CAPITAL LETTER a}"));
53+
Assert.assertEquals("A", StringUtils.unescapeJavaString("\\N{LATIN CAPITAL LETTEr a}"));
54+
Assert.assertEquals("A", StringUtils.unescapeJavaString("\\N{latin capital letter a}"));
55+
Assert.assertEquals("AHOJ", StringUtils.unescapeJavaString("A\\N{LATIN CAPITAL LETTER H}OJ"));
56+
Assert.assertEquals("AHOJ", StringUtils.unescapeJavaString("\\N{LATIN CAPITAL LETTER A}\\N{LATIN CAPITAL LETTER H}\\N{LATIN CAPITAL LETTER O}\\N{LATIN CAPITAL LETTER J}"));
57+
checkUnknownChar("ahoj");
58+
}
59+
60+
@Test
61+
public void blockHangulSyllables() throws Exception {
62+
Assert.assertEquals("가", StringUtils.unescapeJavaString("\\N{HANGUL SYLLABLE GA}"));
63+
Assert.assertEquals("돐", StringUtils.unescapeJavaString("\\N{HANGUL SYLLABLE DOLS}"));
64+
Assert.assertEquals("똜", StringUtils.unescapeJavaString("\\N{HANGUL SYLLABLE DDOLS}"));
65+
}
66+
67+
@Test
68+
public void blockCjkUnifiedIdeograph() throws Exception {
69+
Assert.assertEquals("㐀", StringUtils.unescapeJavaString("\\N{CJK Unified Ideograph-3400}"));
70+
Assert.assertEquals("𫝜", StringUtils.unescapeJavaString("\\N{CJK Unified Ideograph-2B75C}"));
71+
Assert.assertEquals("丳", StringUtils.unescapeJavaString("\\N{CJK Unified Ideograph-4E33}"));
72+
}
73+
74+
@Test
75+
public void blockCjkUnifiedIdeographUnknownCharacters() throws Exception {
76+
checkUnknownChar("CJK Unified Ideograph-33FF");
77+
checkUnknownChar("CJK Unified Ideograph-4DC0");
78+
checkUnknownChar("CJK Unified Ideograph-4DFF");
79+
checkUnknownChar("CJK Unified Ideograph-33FF");
80+
checkUnknownChar("CJK Unified Ideograph-2A6E0");
81+
}
82+
83+
@Test
84+
public void malformedError() throws Exception {
85+
checkSyntaxErrorMessage("'\\N {LATIN CAPITAL LETTER A}'", "SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: malformed \\N character escape");
86+
checkSyntaxErrorMessage("'\\N LATIN CAPITAL LETTER A}'", "SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: malformed \\N character escape");
87+
checkSyntaxErrorMessage("'\\N{LATIN CAPITAL LETTER A'", "SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-24: malformed \\N character escape");
88+
checkSyntaxErrorMessage("'\\N{LATIN CAPITAL LETTER A \\N{LATIN CAPITAL LETTER B}'",
89+
"SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-51: unknown Unicode character name");
90+
}
91+
92+
@Test
93+
public void doNotTrimNames() throws Exception {
94+
checkUnknownChar("LATIN CAPITAL LETTER A ");
95+
checkUnknownChar(" LATIN CAPITAL LETTER A");
96+
checkUnknownChar(" LATIN CAPITAL LETTER A ");
97+
}
98+
99+
private void checkUnknownChar(String charName) throws Exception {
100+
String code = "'\\N{" + charName + "}'";
101+
checkSyntaxErrorMessage(code, "SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-" + (charName.length() + 3) + ": unknown Unicode character name");
102+
}
103+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*NormalizationTest.test_bug_834676
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
*graalpython.lib-python.3.test.test_ucn.UnicodeNamesTest.test_cjk_unified_ideographs
2+
*graalpython.lib-python.3.test.test_ucn.UnicodeNamesTest.test_general
3+
*graalpython.lib-python.3.test.test_ucn.UnicodeNamesTest.test_misc_symbols
4+
*graalpython.lib-python.3.test.test_ucn.UnicodeNamesTest.test_strict_error_handling

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1417,6 +1417,11 @@ public int ord(String chr) {
14171417
return chr.charAt(0);
14181418
}
14191419

1420+
@Specialization
1421+
public int ord(PString chr) {
1422+
return ord(chr.getValue());
1423+
}
1424+
14201425
@Specialization
14211426
public long ord(VirtualFrame frame, PIBytesLike chr,
14221427
@Cached CastToJavaLongExactNode castNode,

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/UnicodeDataModuleBuiltins.java

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ public static String getUnicodeVersion() {
158158
@Override
159159
public void initialize(PythonCore core) {
160160
super.initialize(core);
161-
builtinConstants.put("version", getUnicodeVersion());
161+
builtinConstants.put("unidata_version", getUnicodeVersion());
162162
PythonBuiltinClass objectType = core.lookupType(PythonBuiltinClassType.PythonObject);
163163
PythonObject ucd_3_2_0 = core.factory().createPythonObject(objectType);
164164
ucd_3_2_0.setAttribute("unidata_version", "3.2.0");
@@ -198,4 +198,36 @@ public String normalize(String form, PString unistr,
198198
}
199199

200200
}
201+
202+
// unicodedata.is_normalized(form, unistr)
203+
@Builtin(name = "is_normalized", minNumOfPositionalArgs = 2)
204+
@GenerateNodeFactory
205+
public abstract static class IsNormalizedNode extends PythonBuiltinNode {
206+
@TruffleBoundary
207+
protected Normalizer.Form getForm(String form) {
208+
try {
209+
return Normalizer.Form.valueOf(form);
210+
} catch (IllegalArgumentException e) {
211+
return null;
212+
}
213+
}
214+
215+
@Specialization(guards = {"form.equals(cachedForm)"}, limit = "4")
216+
@TruffleBoundary
217+
public boolean isNormalized(@SuppressWarnings("unused") String form, String unistr,
218+
@SuppressWarnings("unused") @Cached("form") String cachedForm,
219+
@Cached("getForm(cachedForm)") Normalizer.Form cachedNormForm) {
220+
if (cachedNormForm == null) {
221+
throw raise(ValueError, ErrorMessages.INVALID_NORMALIZATION_FORM);
222+
}
223+
return Normalizer.isNormalized(unistr, cachedNormForm);
224+
}
225+
226+
@Specialization(guards = {"form.equals(cachedForm)"}, limit = "4")
227+
public boolean normalize(String form, PString unistr,
228+
@Cached("form") String cachedForm,
229+
@Cached("getForm(cachedForm)") Normalizer.Form cachedNormForm) {
230+
return isNormalized(form, unistr.getValue(), cachedForm, cachedNormForm);
231+
}
232+
}
201233
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/sst/FactorySSTVisitor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1158,7 +1158,7 @@ public PNode visit(StarSSTNode node) {
11581158

11591159
@Override
11601160
public PNode visit(StringLiteralSSTNode node) {
1161-
PNode result = StringUtils.parseString(node.values, nodeFactory, errors);
1161+
PNode result = StringUtils.parseString(source, node, nodeFactory, errors);
11621162
result.assignSourceSection(createSourceSection(node.startOffset, node.endOffset));
11631163
return result;
11641164
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/sst/StringUtils.java

Lines changed: 74 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@
4141

4242
package com.oracle.graal.python.parser.sst;
4343

44-
import static com.oracle.graal.python.runtime.exception.PythonErrorType.SyntaxError;
44+
import com.ibm.icu.lang.UCharacter;
45+
import com.oracle.graal.python.PythonLanguage;
46+
import com.oracle.graal.python.builtins.PythonBuiltinClassType;
4547

4648
import java.util.ArrayList;
4749
import java.util.List;
@@ -53,9 +55,13 @@
5355
import com.oracle.graal.python.nodes.expression.ExpressionNode;
5456
import com.oracle.graal.python.nodes.literal.FormatStringLiteralNode;
5557
import com.oracle.graal.python.nodes.literal.StringLiteralNode;
58+
import com.oracle.graal.python.nodes.object.IsBuiltinClassProfile;
5659
import com.oracle.graal.python.nodes.statement.StatementNode;
5760
import com.oracle.graal.python.runtime.PythonParser;
61+
import com.oracle.graal.python.runtime.exception.PException;
5862
import com.oracle.graal.python.util.PythonUtils;
63+
import com.oracle.truffle.api.CompilerDirectives;
64+
import com.oracle.truffle.api.source.Source;
5965

6066
public class StringUtils {
6167

@@ -108,12 +114,12 @@ byte[] build() {
108114
}
109115
}
110116

111-
public static PNode parseString(String[] strings, NodeFactory nodeFactory, PythonParser.ParserErrorCallback errors) {
117+
public static PNode parseString(Source source, StringLiteralSSTNode node, NodeFactory nodeFactory, PythonParser.ParserErrorCallback errors) {
112118
StringBuilder sb = null;
113119
BytesBuilder bb = null;
114120
boolean isFormatString = false;
115121
List<FormatStringLiteralNode.StringPart> formatStrings = null;
116-
for (String text : strings) {
122+
for (String text : node.values) {
117123
boolean isRaw = false;
118124
boolean isBytes = false;
119125
boolean isFormat = false;
@@ -146,7 +152,7 @@ public static PNode parseString(String[] strings, NodeFactory nodeFactory, Pytho
146152
text = text.substring(strStartIndex, strEndIndex);
147153
if (isBytes) {
148154
if (sb != null || isFormatString) {
149-
throw errors.raise(SyntaxError, CANNOT_MIX_MESSAGE);
155+
throw errors.raiseInvalidSyntax(source, source.createSection(node.startOffset, node.endOffset - node.startOffset), CANNOT_MIX_MESSAGE);
150156
}
151157
if (bb == null) {
152158
bb = new BytesBuilder();
@@ -158,10 +164,17 @@ public static PNode parseString(String[] strings, NodeFactory nodeFactory, Pytho
158164
}
159165
} else {
160166
if (bb != null) {
161-
throw errors.raise(SyntaxError, CANNOT_MIX_MESSAGE);
167+
throw errors.raiseInvalidSyntax(source, source.createSection(node.startOffset, node.endOffset - node.startOffset), CANNOT_MIX_MESSAGE);
162168
}
163169
if (!isRaw) {
164-
text = unescapeJavaString(text);
170+
try {
171+
text = unescapeJavaString(text);
172+
} catch (PException e) {
173+
e.expect(PythonBuiltinClassType.UnicodeDecodeError, IsBuiltinClassProfile.getUncached());
174+
String message = e.getMessage();
175+
message = "(unicode error)" + message.substring(PythonBuiltinClassType.UnicodeDecodeError.getName().length() + 1);
176+
throw errors.raiseInvalidSyntax(source, source.createSection(node.startOffset, node.endOffset - node.startOffset), message);
177+
}
165178
}
166179
if (isFormat) {
167180
isFormatString = true;
@@ -293,6 +306,10 @@ public static String unescapeJavaString(String st) {
293306
sb.append(Character.toChars(hexCode));
294307
i += 3;
295308
continue;
309+
case 'N':
310+
// a character from Unicode Data Database
311+
i = doCharacterName(st, sb, i + 2);
312+
continue;
296313
default:
297314
sb.append(ch);
298315
sb.append(nextChar);
@@ -305,4 +322,55 @@ public static String unescapeJavaString(String st) {
305322
}
306323
return sb.toString();
307324
}
325+
326+
private static final String UNICODE_ERROR = "'unicodeescape' codec can't decode bytes in position %d-%d:";
327+
private static final String MALFORMED_ERROR = " malformed \\N character escape";
328+
private static final String UNKNOWN_UNICODE_ERROR = " unknown Unicode character name";
329+
330+
/**
331+
* Replace '/N{Unicode Character Name}' with the code point of the character.
332+
*
333+
* @param text a text that contains /N{...} escape sequence
334+
* @param sb string builder where the result code point will be written
335+
* @param offset this is offset of the open brace
336+
* @return offset of the close brace
337+
*/
338+
@CompilerDirectives.TruffleBoundary
339+
private static int doCharacterName(String text, StringBuilder sb, int offset) {
340+
char ch = text.charAt(offset);
341+
if (ch != '{') {
342+
throw PythonLanguage.getCore().raise(PythonBuiltinClassType.UnicodeDecodeError, UNICODE_ERROR + MALFORMED_ERROR, offset - 2, offset - 1);
343+
}
344+
int closeIndex = text.indexOf("}", offset + 1);
345+
if (closeIndex == -1) {
346+
throw PythonLanguage.getCore().raise(PythonBuiltinClassType.UnicodeDecodeError, UNICODE_ERROR + MALFORMED_ERROR, offset - 2, text.length() - 1);
347+
}
348+
String charName = text.substring(offset + 1, closeIndex).toUpperCase();
349+
// When JDK 1.8 will not be supported, we can replace with Character.codePointOf(String
350+
// name) in the
351+
int cp = getCodePoint(charName);
352+
if (cp >= 0) {
353+
sb.append(Character.toChars(cp));
354+
} else {
355+
throw PythonLanguage.getCore().raise(PythonBuiltinClassType.UnicodeDecodeError, UNICODE_ERROR + UNKNOWN_UNICODE_ERROR, offset - 2, closeIndex);
356+
}
357+
return closeIndex;
358+
}
359+
360+
@CompilerDirectives.TruffleBoundary
361+
public static int getCodePoint(String charName) {
362+
int possibleChar = UCharacter.getCharFromName(charName);
363+
if (possibleChar > -1) {
364+
return possibleChar;
365+
}
366+
possibleChar = UCharacter.getCharFromExtendedName(charName);
367+
if (possibleChar > -1) {
368+
return possibleChar;
369+
}
370+
possibleChar = UCharacter.getCharFromNameAlias(charName);
371+
if (possibleChar > -1) {
372+
return possibleChar;
373+
}
374+
return -1;
375+
}
308376
}

mx.graalpython/suite.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,17 @@
113113
],
114114
"packedResource": True,
115115
"sha1": "bf7badf7e248e0ecf465d33c2f5aeec774209227",
116-
}
116+
},
117+
"ICU4J" : {
118+
"moduleName" : "com.ibm.icu",
119+
"sha1" : "72c7519b6d91f7a1f993bd44a99fe95d67211b27",
120+
"sourceSha1" : "57d00b7135ca8fa82311d6b9fd345309e4c46f0b",
121+
"maven" : {
122+
"groupId" : "com.ibm.icu",
123+
"artifactId" : "icu4j",
124+
"version" : "66.1",
125+
},
126+
},
117127
},
118128

119129
# --------------------------------------------------------------------------------------------------------------
@@ -196,6 +206,7 @@
196206
"truffle:ANTLR4",
197207
"sulong:SULONG_API",
198208
"XZ-1.8",
209+
"ICU4J",
199210
],
200211
"buildDependencies": ["com.oracle.graal.python.parser.antlr"],
201212
"jacoco": "include",

0 commit comments

Comments
 (0)