Skip to content

Commit eca4e06

Browse files
committed
Support encoding declarations in source files
1 parent 151042d commit eca4e06

File tree

2 files changed

+86
-2
lines changed

2 files changed

+86
-2
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,21 @@
4040
*/
4141
package com.oracle.graal.python;
4242

43+
import java.io.BufferedReader;
4344
import java.io.IOException;
44-
import com.oracle.truffle.api.TruffleFile;
45+
import java.io.StringReader;
4546
import java.nio.charset.Charset;
47+
import java.nio.charset.StandardCharsets;
48+
import java.util.regex.Matcher;
49+
import java.util.regex.Pattern;
50+
51+
import com.oracle.graal.python.util.CharsetMapping;
52+
import com.oracle.truffle.api.TruffleFile;
4653

4754
public final class PythonFileDetector implements TruffleFile.FileTypeDetector {
4855

56+
/**
 * Matches a whole line that is a comment (optionally preceded by spaces, tabs or form feeds)
 * containing {@code coding:} or {@code coding=} followed by an encoding name. Group 1 captures
 * the encoding name, which may consist of ASCII letters, digits, {@code -}, {@code _} and
 * {@code .}.
 */
private static final Pattern ENCODING_COMMENT = Pattern.compile("^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+).*");
57+
4958
@Override
5059
public String findMimeType(TruffleFile file) throws IOException {
5160
String fileName = file.getName();
@@ -55,8 +64,72 @@ public String findMimeType(TruffleFile file) throws IOException {
5564
return null;
5665
}
5766

67+
public static class InvalidEncodingException extends RuntimeException {
68+
private static final long serialVersionUID = 1L;
69+
70+
private final String encodingName;
71+
72+
public InvalidEncodingException(String encodingName) {
73+
super("Invalid or unsupported encoding: " + encodingName);
74+
this.encodingName = encodingName;
75+
}
76+
77+
public String getEncodingName() {
78+
return encodingName;
79+
}
80+
}
81+
82+
public static Charset tryGetCharsetFromLine(String line) {
83+
if (line == null) {
84+
return null;
85+
}
86+
Matcher matcher = ENCODING_COMMENT.matcher(line);
87+
if (matcher.matches()) {
88+
Charset charset = CharsetMapping.getCharset(matcher.group(1));
89+
if (charset == null) {
90+
throw new InvalidEncodingException(matcher.group(1));
91+
}
92+
return charset;
93+
}
94+
return null;
95+
}
96+
97+
public static Charset findEncodingStrict(BufferedReader reader) throws IOException {
98+
Charset charset;
99+
if ((charset = tryGetCharsetFromLine(reader.readLine())) != null) {
100+
return charset;
101+
}
102+
if ((charset = tryGetCharsetFromLine(reader.readLine())) != null) {
103+
return charset;
104+
}
105+
return StandardCharsets.UTF_8;
106+
}
107+
108+
public static Charset findEncodingStrict(TruffleFile file) throws IOException {
109+
// Using Latin-1 to read the header avoids exceptions on non-ascii characters
110+
try (BufferedReader reader = file.newBufferedReader(StandardCharsets.ISO_8859_1)) {
111+
return findEncodingStrict(reader);
112+
}
113+
}
114+
115+
public static Charset findEncodingStrict(String source) {
116+
try (BufferedReader reader = new BufferedReader(new StringReader(source))) {
117+
return findEncodingStrict(reader);
118+
} catch (IOException e) {
119+
// Shouldn't happen on a string
120+
throw new RuntimeException(e);
121+
}
122+
}
123+
58124
@Override
59125
public Charset findEncoding(TruffleFile file) throws IOException {
60-
return null;
126+
try {
127+
return findEncodingStrict(file);
128+
} catch (InvalidEncodingException e) {
129+
// We cannot throw a SyntaxError at this point, but the parser will revalidate this.
130+
// Return Latin-1 so that it doesn't throw encoding errors before getting to the
131+
// parser, because Truffle would otherwise default to UTF-8
132+
return StandardCharsets.ISO_8859_1;
133+
}
61134
}
62135
}

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/PythonParserImpl.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import org.antlr.v4.runtime.CharStreams;
3535
import org.antlr.v4.runtime.CommonTokenStream;
3636

37+
import com.oracle.graal.python.PythonFileDetector;
3738
import com.oracle.graal.python.parser.antlr.DescriptiveBailErrorListener;
3839
import com.oracle.graal.python.parser.antlr.Python3Lexer;
3940
import com.oracle.graal.python.parser.antlr.Python3Parser;
@@ -262,6 +263,16 @@ public Node parse(ParserMode mode, ParserErrorCallback errors, Source source, Fr
262263

263264
private CacheItem parseWithANTLR(ParserMode mode, ParserErrorCallback errors, PythonSSTNodeFactory sstFactory, Source source, Frame currentFrame) {
264265
FrameDescriptor inlineLocals = mode == ParserMode.InlineEvaluation ? currentFrame.getFrameDescriptor() : null;
266+
String sourceText = source.getCharacters().toString();
267+
// Preprocessing
268+
// Check that declared encoding (if any) is valid. The file detector picks an encoding for
269+
// the file, but it doesn't have a means of communicating that the declared encoding wasn't
270+
// valid or supported, so in that case it defaults to Latin-1 and we have to recheck it here
271+
try {
272+
PythonFileDetector.findEncodingStrict(sourceText);
273+
} catch (PythonFileDetector.InvalidEncodingException e) {
274+
throw errors.raiseInvalidSyntax(source, source.createUnavailableSection(), "encoding problem: %s", e.getEncodingName());
275+
}
265276
// ANTLR parsing
266277
Python3Parser parser = getPython3Parser(source, errors);
267278
parser.setFactory(sstFactory);

0 commit comments

Comments
 (0)