Skip to content

Commit fc9cf11

Browse files
committed
[GR-24013] Support source encoding declarations
PullRequest: graalpython/1037
2 parents 7953310 + 1d64126 commit fc9cf11

File tree

15 files changed

+2039
-1813
lines changed

15 files changed

+2039
-1813
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_exec.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
22
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
33
#
44
# The Universal Permissive License (UPL), Version 1.0
@@ -311,3 +311,28 @@ def __getitem__(self, key):
311311
exec("global x; x = y", ns, m)
312312
assert ns["x"] == "y";
313313
assert eval("x", None, m) == "x"
314+
315+
def test_exec_encoding(self):
316+
x = {}
317+
exec(b'#!/usr/bin/python\n# vim:fileencoding=cp1250\nx = "\x9elu\x9dou\xe8k\xfd k\xf9\xf2"', x)
318+
assert x['x'] == 'žluťoučký kůň'
319+
320+
def test_exec_invalid_encoding(self):
321+
def fn():
322+
exec(b'# encoding: cp12413254\nx=1')
323+
raises(SyntaxError, fn)
324+
325+
def test_exec_ignore_decoded(self):
326+
x = {}
327+
exec('# encoding: cp12413254\nx=1', x)
328+
assert x['x'] == 1
329+
330+
def test_exec_bom(self):
331+
x = {}
332+
exec(b'\xef\xbb\xbfx = "\xe6\xa5\xbd\xe3\x81\x97\xe3\x81\x84"', x)
333+
assert x['x'] == '楽しい'
334+
335+
def test_exec_bom_invalid(self):
336+
def fn():
337+
exec(b'\xef\xbb\xbf#encoding:latin-1\nx = "\xe9\xa7\x84\xe7\x9b\xae"')
338+
raises(SyntaxError, fn)

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_imp.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
*graalpython.lib-python.3.test.test_imp.ImportTests.test_bug7732
22
*graalpython.lib-python.3.test.test_imp.ImportTests.test_find_module_encoding
3+
*graalpython.lib-python.3.test.test_imp.ImportTests.test_import_encoded_module
34
*graalpython.lib-python.3.test.test_imp.ImportTests.test_issue15828_load_extensions
45
*graalpython.lib-python.3.test.test_imp.ImportTests.test_issue16421_multiple_modules_in_one_dll
56
*graalpython.lib-python.3.test.test_imp.ImportTests.test_issue24748_load_module_skips_sys_modules_check
67
*graalpython.lib-python.3.test.test_imp.ImportTests.test_issue31315
78
*graalpython.lib-python.3.test.test_imp.ImportTests.test_issue3594
9+
*graalpython.lib-python.3.test.test_imp.ImportTests.test_issue5604
810
*graalpython.lib-python.3.test.test_imp.ImportTests.test_issue9319
911
*graalpython.lib-python.3.test.test_imp.ImportTests.test_load_dynamic_ImportError_path
1012
*graalpython.lib-python.3.test.test_imp.ImportTests.test_load_module_extension_file_is_None
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,22 @@
11
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_default_coding
2+
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_double_coding_line
3+
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_double_coding_same_line
4+
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_first_coding_line
5+
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_first_non_utf8_coding_line
6+
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_second_coding_line
7+
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_second_non_utf8_coding_line
28
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_third_coding_line
9+
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_utf8_bom
10+
*graalpython.lib-python.3.test.test_source_encoding.BytesSourceEncodingTest.test_utf8_bom_and_utf8_coding_line
311
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_20731
12+
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_bad_coding
413
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_bad_coding2
514
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_compilestring
15+
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_error_message
16+
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_exec_valid_coding
617
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_file_parse
18+
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_issue2301
719
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_issue3297
820
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_issue4626
921
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_issue7820
22+
*graalpython.lib-python.3.test.test_source_encoding.MiscSourceEncodingTest.test_pep263

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2017, 2019, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* The Universal Permissive License (UPL), Version 1.0
@@ -40,12 +40,25 @@
4040
*/
4141
package com.oracle.graal.python;
4242

43+
import java.io.BufferedReader;
44+
import java.io.ByteArrayInputStream;
4345
import java.io.IOException;
44-
import com.oracle.truffle.api.TruffleFile;
46+
import java.io.InputStreamReader;
47+
import java.io.StringReader;
4548
import java.nio.charset.Charset;
49+
import java.nio.charset.StandardCharsets;
50+
import java.util.regex.Matcher;
51+
import java.util.regex.Pattern;
52+
53+
import com.oracle.graal.python.util.CharsetMapping;
54+
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
55+
import com.oracle.truffle.api.TruffleFile;
4656

4757
public final class PythonFileDetector implements TruffleFile.FileTypeDetector {
4858

59+
private static final String UTF_8_BOM_IN_LATIN_1 = new String(new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}, StandardCharsets.ISO_8859_1);
60+
private static final Pattern ENCODING_COMMENT = Pattern.compile("^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+).*");
61+
4962
@Override
5063
public String findMimeType(TruffleFile file) throws IOException {
5164
String fileName = file.getName();
@@ -55,8 +68,99 @@ public String findMimeType(TruffleFile file) throws IOException {
5568
return null;
5669
}
5770

71+
public static class InvalidEncodingException extends RuntimeException {
72+
private static final long serialVersionUID = 1L;
73+
74+
private final String encodingName;
75+
76+
public InvalidEncodingException(String encodingName) {
77+
super("Invalid or unsupported encoding: " + encodingName);
78+
this.encodingName = encodingName;
79+
}
80+
81+
public String getEncodingName() {
82+
return encodingName;
83+
}
84+
}
85+
86+
private static Charset tryGetCharsetFromLine(String line, boolean hasBOM) {
87+
if (line == null) {
88+
return null;
89+
}
90+
Matcher matcher = ENCODING_COMMENT.matcher(line);
91+
if (matcher.matches()) {
92+
// Files with UTF-8 BOM but different encoding declared are a SyntaxError
93+
// Note that CPython ignores UTF-8 aliases for the BOM check
94+
String encoding = matcher.group(1);
95+
if (hasBOM && !CharsetMapping.normalize(encoding).equals("utf_8")) {
96+
throw new InvalidEncodingException(encoding + " with BOM");
97+
}
98+
Charset charset = CharsetMapping.getCharset(encoding);
99+
if (charset == null) {
100+
throw new InvalidEncodingException(encoding);
101+
}
102+
return charset;
103+
}
104+
return null;
105+
}
106+
107+
@TruffleBoundary
108+
public static Charset findEncodingStrict(BufferedReader reader) throws IOException {
109+
Charset charset;
110+
// Read first two lines like CPython
111+
String firstLine = reader.readLine();
112+
boolean hasBOM = false;
113+
if (firstLine != null && firstLine.startsWith(UTF_8_BOM_IN_LATIN_1)) {
114+
hasBOM = true;
115+
firstLine = firstLine.substring(UTF_8_BOM_IN_LATIN_1.length());
116+
}
117+
if ((charset = tryGetCharsetFromLine(firstLine, hasBOM)) != null) {
118+
return charset;
119+
}
120+
if ((charset = tryGetCharsetFromLine(reader.readLine(), hasBOM)) != null) {
121+
return charset;
122+
}
123+
return StandardCharsets.UTF_8;
124+
}
125+
126+
@TruffleBoundary
127+
public static Charset findEncodingStrict(TruffleFile file) throws IOException {
128+
// Using Latin-1 to read the header avoids exceptions on non-ascii characters
129+
try (BufferedReader reader = file.newBufferedReader(StandardCharsets.ISO_8859_1)) {
130+
return findEncodingStrict(reader);
131+
}
132+
}
133+
134+
@TruffleBoundary
135+
public static Charset findEncodingStrict(String source) {
136+
try (BufferedReader reader = new BufferedReader(new StringReader(source))) {
137+
return findEncodingStrict(reader);
138+
} catch (IOException e) {
139+
// Shouldn't happen on a string
140+
throw new RuntimeException(e);
141+
}
142+
}
143+
144+
@TruffleBoundary
145+
public static Charset findEncodingStrict(byte[] source) {
146+
// Using Latin-1 to read the header avoids exceptions on non-ascii characters
147+
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(source), StandardCharsets.ISO_8859_1))) {
148+
return findEncodingStrict(reader);
149+
} catch (IOException e) {
150+
// Shouldn't happen when reading from an in-memory byte array
151+
throw new RuntimeException(e);
152+
}
153+
}
154+
58155
@Override
59156
public Charset findEncoding(TruffleFile file) throws IOException {
60-
return null;
157+
try {
158+
return findEncodingStrict(file);
159+
} catch (InvalidEncodingException e) {
160+
// We cannot throw a SyntaxError at this point, but the parser will revalidate this.
161+
// Return Latin-1 so that it doesn't throw encoding errors before getting to the
162+
// parser, because Truffle would otherwise default to UTF-8
163+
return StandardCharsets.ISO_8859_1;
164+
}
61165
}
62166
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonLanguage.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ public static Source newSource(PythonContext ctxt, String src, String name, bool
483483
SourceBuilder sourceBuilder = null;
484484
if (mayBeFile) {
485485
try {
486-
TruffleFile truffleFile = ctxt.getEnv().getInternalTruffleFile(name);
486+
TruffleFile truffleFile = ctxt.getPublicTruffleFileRelaxed(name, PythonLanguage.DEFAULT_PYTHON_EXTENSIONS);
487487
if (truffleFile.exists()) {
488488
// XXX: (tfel): We don't know if the expression has anything to do with the
489489
// filename that's given. We would really have to compare the entire

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,10 @@
7171
import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError;
7272

7373
import java.math.BigInteger;
74+
import java.nio.charset.Charset;
7475
import java.util.List;
7576

77+
import com.oracle.graal.python.PythonFileDetector;
7678
import com.oracle.graal.python.PythonLanguage;
7779
import com.oracle.graal.python.builtins.Builtin;
7880
import com.oracle.graal.python.builtins.CoreFunctions;
@@ -719,9 +721,23 @@ public CompileNode() {
719721
public abstract PCode execute(VirtualFrame frame, Object source, String filename, String mode, Object kwFlags, Object kwDontInherit, Object kwOptimize);
720722

721723
@Specialization
722-
PCode compile(VirtualFrame frame, PBytes source, String filename, String mode, Object kwFlags, Object kwDontInherit, Object kwOptimize,
724+
PCode compile(VirtualFrame frame, PBytes pBytes, String filename, String mode, Object kwFlags, Object kwDontInherit, Object kwOptimize,
723725
@Cached("create()") BytesNodes.ToBytesNode toBytesNode) {
724-
return compile(createString(toBytesNode.execute(frame, source)), filename, mode, kwFlags, kwDontInherit, kwOptimize);
726+
try {
727+
byte[] bytes = toBytesNode.execute(frame, pBytes);
728+
Charset charset = PythonFileDetector.findEncodingStrict(bytes);
729+
return compile(createString(bytes, charset), filename, mode, kwFlags, kwDontInherit, kwOptimize);
730+
} catch (PythonFileDetector.InvalidEncodingException e) {
731+
throw handleInvalidEncoding(filename, e);
732+
}
733+
}
734+
735+
@TruffleBoundary
736+
private RuntimeException handleInvalidEncoding(String filename, PythonFileDetector.InvalidEncodingException e) {
737+
PythonContext context = getContext();
738+
// Create non-empty source to avoid overwriting the message with "unexpected EOF"
739+
Source source = PythonLanguage.newSource(context, " ", filename, mayBeFromFile);
740+
throw getCore().raiseInvalidSyntax(source, source.createUnavailableSection(), "encoding problem: %s", e.getEncodingName());
725741
}
726742

727743
@SuppressWarnings("unused")
@@ -766,8 +782,8 @@ PCode compile(PCode code, String filename, String mode, Object flags, Object don
766782
}
767783

768784
@TruffleBoundary
769-
private static String createString(byte[] bytes) {
770-
return new String(bytes);
785+
private static String createString(byte[] bytes, Charset charset) {
786+
return new String(bytes, charset);
771787

772788
}
773789

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/PythonParserImpl.java

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,24 @@
2525
*/
2626
package com.oracle.graal.python.parser;
2727

28+
import java.io.ByteArrayInputStream;
29+
import java.io.ByteArrayOutputStream;
30+
import java.io.DataInputStream;
31+
import java.io.DataOutputStream;
32+
import java.io.IOException;
33+
34+
import org.antlr.v4.runtime.CharStreams;
35+
import org.antlr.v4.runtime.CommonTokenStream;
36+
import org.antlr.v4.runtime.Token;
37+
import org.graalvm.nativeimage.ImageInfo;
38+
39+
import com.oracle.graal.python.PythonFileDetector;
2840
import com.oracle.graal.python.PythonLanguage;
2941
import com.oracle.graal.python.builtins.PythonBuiltinClassType;
3042
import com.oracle.graal.python.nodes.ModuleRootNode;
3143
import com.oracle.graal.python.nodes.function.FunctionDefinitionNode;
3244
import com.oracle.graal.python.nodes.function.GeneratorFunctionDefinitionNode;
3345
import com.oracle.graal.python.nodes.util.BadOPCodeNode;
34-
import org.antlr.v4.runtime.CharStreams;
35-
import org.antlr.v4.runtime.CommonTokenStream;
36-
3746
import com.oracle.graal.python.parser.antlr.DescriptiveBailErrorListener;
3847
import com.oracle.graal.python.parser.antlr.Python3Lexer;
3948
import com.oracle.graal.python.parser.antlr.Python3Parser;
@@ -58,13 +67,6 @@
5867
import com.oracle.truffle.api.nodes.RootNode;
5968
import com.oracle.truffle.api.source.Source;
6069
import com.oracle.truffle.api.source.SourceSection;
61-
import java.io.ByteArrayInputStream;
62-
import java.io.ByteArrayOutputStream;
63-
import java.io.DataInputStream;
64-
import java.io.DataOutputStream;
65-
import java.io.IOException;
66-
import org.antlr.v4.runtime.Token;
67-
import org.graalvm.nativeimage.ImageInfo;
6870

6971
public final class PythonParserImpl implements PythonParser, PythonCodeSerializer {
7072

@@ -81,15 +83,8 @@ public PythonParserImpl(Env env) {
8183
this.timeStatistics = env.getOptions().get(PythonOptions.ParserStatistics);
8284
}
8385

84-
private static Python3Parser getPython3Parser(Source source, ParserErrorCallback errors) {
85-
Python3Lexer lexer;
86-
try {
87-
lexer = source.getPath() == null
88-
? new Python3Lexer(CharStreams.fromString(source.getCharacters().toString()))
89-
: new Python3Lexer(CharStreams.fromFileName(source.getPath()));
90-
} catch (IOException ex) {
91-
lexer = new Python3Lexer(CharStreams.fromString(source.getCharacters().toString()));
92-
}
86+
private static Python3Parser getPython3Parser(Source source, String sourceText, ParserErrorCallback errors) {
87+
Python3Lexer lexer = new Python3Lexer(CharStreams.fromString(sourceText));
9388
lexer.removeErrorListeners();
9489
lexer.addErrorListener(ERROR_LISTENER);
9590
Python3Parser parser = new Python3Parser(new CommonTokenStream(lexer));
@@ -262,8 +257,25 @@ public Node parse(ParserMode mode, ParserErrorCallback errors, Source source, Fr
262257

263258
private CacheItem parseWithANTLR(ParserMode mode, ParserErrorCallback errors, PythonSSTNodeFactory sstFactory, Source source, Frame currentFrame) {
264259
FrameDescriptor inlineLocals = mode == ParserMode.InlineEvaluation ? currentFrame.getFrameDescriptor() : null;
260+
String sourceText = source.getCharacters().toString();
261+
// Preprocessing
262+
263+
// Check that declared encoding (if any) is valid. The file detector picks an encoding
264+
// for the file, but it doesn't have a means of communicating that the declared encoding
265+
// wasn't valid or supported, so in that case it defaults to Latin-1 and we have to
266+
// recheck it here.
267+
// msimacek: The encoding check should happen only when the source encoding was
268+
// determined by PythonFileDetector. But we currently have no way to tell, so we
269+
// assume that it is the case when it is a file.
270+
if (source.getURI().getScheme().equals("file")) {
271+
try {
272+
PythonFileDetector.findEncodingStrict(sourceText);
273+
} catch (PythonFileDetector.InvalidEncodingException e) {
274+
throw errors.raiseInvalidSyntax(source, source.createUnavailableSection(), "encoding problem: %s", e.getEncodingName());
275+
}
276+
}
265277
// ANTLR parsing
266-
Python3Parser parser = getPython3Parser(source, errors);
278+
Python3Parser parser = getPython3Parser(source, sourceText, errors);
267279
parser.setFactory(sstFactory);
268280
SSTNode parserSSTResult = null;
269281

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/antlr/Python3.g4

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,7 @@ locals
381381
}
382382
{ loopState = null; }
383383
{ int start = start(); }
384+
BOM?
384385
(
385386
NEWLINE
386387
| simple_stmt
@@ -401,6 +402,7 @@ locals
401402
{ _localctx.scope = scopeEnvironment.pushScope(_localctx.toString(), ScopeInfo.ScopeKind.Module); }
402403
{ loopState = null; }
403404
{ int start = start(); }
405+
BOM?
404406
(
405407
NEWLINE
406408
| stmt
@@ -418,7 +420,7 @@ eval_input returns [SSTNode result]
418420
locals [ com.oracle.graal.python.parser.ScopeInfo scope ]
419421
:
420422
{ scopeEnvironment.pushScope(_localctx.toString(), ScopeInfo.ScopeKind.Module); }
421-
testlist NEWLINE* EOF
423+
BOM? testlist NEWLINE* EOF
422424
{ $result = $testlist.result; }
423425
{scopeEnvironment.popScope(); }
424426
;
@@ -1884,6 +1886,8 @@ SKIP_
18841886
: ( SPACES | COMMENT | LINE_JOINING ) -> skip
18851887
;
18861888

1889+
BOM : '\uFEFF';
1890+
18871891
UNKNOWN_CHAR
18881892
: .
18891893
;

0 commit comments

Comments
 (0)