Skip to content

Commit 395ba09

Browse files
committed
Interpret encoding comments when passing bytes to compile/eval/exec
1 parent eca4e06 commit 395ba09

File tree

2 files changed

+38
-5
lines changed

2 files changed

+38
-5
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,17 @@
4141
package com.oracle.graal.python;
4242

4343
import java.io.BufferedReader;
44+
import java.io.ByteArrayInputStream;
4445
import java.io.IOException;
46+
import java.io.InputStreamReader;
4547
import java.io.StringReader;
4648
import java.nio.charset.Charset;
4749
import java.nio.charset.StandardCharsets;
4850
import java.util.regex.Matcher;
4951
import java.util.regex.Pattern;
5052

5153
import com.oracle.graal.python.util.CharsetMapping;
54+
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
5255
import com.oracle.truffle.api.TruffleFile;
5356

5457
public final class PythonFileDetector implements TruffleFile.FileTypeDetector {
@@ -79,7 +82,7 @@ public String getEncodingName() {
7982
}
8083
}
8184

82-
public static Charset tryGetCharsetFromLine(String line) {
85+
private static Charset tryGetCharsetFromLine(String line) {
8386
if (line == null) {
8487
return null;
8588
}
@@ -94,6 +97,7 @@ public static Charset tryGetCharsetFromLine(String line) {
9497
return null;
9598
}
9699

100+
@TruffleBoundary
97101
public static Charset findEncodingStrict(BufferedReader reader) throws IOException {
98102
Charset charset;
99103
if ((charset = tryGetCharsetFromLine(reader.readLine())) != null) {
@@ -105,13 +109,15 @@ public static Charset findEncodingStrict(BufferedReader reader) throws IOExcepti
105109
return StandardCharsets.UTF_8;
106110
}
107111

112+
@TruffleBoundary
108113
public static Charset findEncodingStrict(TruffleFile file) throws IOException {
109114
// Using Latin-1 to read the header avoids exceptions on non-ascii characters
110115
try (BufferedReader reader = file.newBufferedReader(StandardCharsets.ISO_8859_1)) {
111116
return findEncodingStrict(reader);
112117
}
113118
}
114119

120+
@TruffleBoundary
115121
public static Charset findEncodingStrict(String source) {
116122
try (BufferedReader reader = new BufferedReader(new StringReader(source))) {
117123
return findEncodingStrict(reader);
@@ -121,6 +127,17 @@ public static Charset findEncodingStrict(String source) {
121127
}
122128
}
123129

130+
@TruffleBoundary
131+
public static Charset findEncodingStrict(byte[] source) {
132+
// Using Latin-1 to read the header avoids exceptions on non-ascii characters
133+
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(source), StandardCharsets.ISO_8859_1))) {
134+
return findEncodingStrict(reader);
135+
} catch (IOException e) {
136+
// Shouldn't happen when reading from an in-memory byte array
137+
throw new RuntimeException(e);
138+
}
139+
}
140+
124141
@Override
125142
public Charset findEncoding(TruffleFile file) throws IOException {
126143
try {

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,10 @@
7171
import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError;
7272

7373
import java.math.BigInteger;
74+
import java.nio.charset.Charset;
7475
import java.util.List;
7576

77+
import com.oracle.graal.python.PythonFileDetector;
7678
import com.oracle.graal.python.PythonLanguage;
7779
import com.oracle.graal.python.builtins.Builtin;
7880
import com.oracle.graal.python.builtins.CoreFunctions;
@@ -719,9 +721,23 @@ public CompileNode() {
719721
public abstract PCode execute(VirtualFrame frame, Object source, String filename, String mode, Object kwFlags, Object kwDontInherit, Object kwOptimize);
720722

721723
@Specialization
722-
PCode compile(VirtualFrame frame, PBytes source, String filename, String mode, Object kwFlags, Object kwDontInherit, Object kwOptimize,
724+
PCode compile(VirtualFrame frame, PBytes pBytes, String filename, String mode, Object kwFlags, Object kwDontInherit, Object kwOptimize,
723725
@Cached("create()") BytesNodes.ToBytesNode toBytesNode) {
724-
return compile(createString(toBytesNode.execute(frame, source)), filename, mode, kwFlags, kwDontInherit, kwOptimize);
726+
try {
727+
byte[] bytes = toBytesNode.execute(frame, pBytes);
728+
Charset charset = PythonFileDetector.findEncodingStrict(bytes);
729+
return compile(createString(bytes, charset), filename, mode, kwFlags, kwDontInherit, kwOptimize);
730+
} catch (PythonFileDetector.InvalidEncodingException e) {
731+
throw handleInvalidEncoding(filename, e);
732+
}
733+
}
734+
735+
@TruffleBoundary
736+
private RuntimeException handleInvalidEncoding(String filename, PythonFileDetector.InvalidEncodingException e) {
737+
PythonContext context = getContext();
738+
// Create non-empty source to avoid overwriting the message with "unexpected EOF"
739+
Source source = PythonLanguage.newSource(context, " ", filename, mayBeFromFile);
740+
throw getCore().raiseInvalidSyntax(source, source.createUnavailableSection(), "encoding problem: %s", e.getEncodingName());
725741
}
726742

727743
@SuppressWarnings("unused")
@@ -766,8 +782,8 @@ PCode compile(PCode code, String filename, String mode, Object flags, Object don
766782
}
767783

768784
@TruffleBoundary
769-
private static String createString(byte[] bytes) {
770-
return new String(bytes);
785+
private static String createString(byte[] bytes, Charset charset) {
786+
return new String(bytes, charset);
771787

772788
}
773789

0 commit comments

Comments
 (0)