Skip to content

Commit 45b97df

Browse files
committed
Add handling for BOM
1 parent b248d17 commit 45b97df

File tree

3 files changed

+25
-7
lines changed

3 files changed

+25
-7
lines changed

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656

5757
public final class PythonFileDetector implements TruffleFile.FileTypeDetector {
5858

59+
private static final String UTF_8_BOM_IN_LATIN_1 = new String(new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}, StandardCharsets.ISO_8859_1);
5960
private static final Pattern ENCODING_COMMENT = Pattern.compile("^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+).*");
6061

6162
@Override
@@ -82,15 +83,21 @@ public String getEncodingName() {
8283
}
8384
}
8485

85-
private static Charset tryGetCharsetFromLine(String line) {
86+
private static Charset tryGetCharsetFromLine(String line, boolean hasBOM) {
8687
if (line == null) {
8788
return null;
8889
}
8990
Matcher matcher = ENCODING_COMMENT.matcher(line);
9091
if (matcher.matches()) {
91-
Charset charset = CharsetMapping.getCharset(matcher.group(1));
92+
// A file that starts with a UTF-8 BOM but declares a different encoding is a SyntaxError
93+
// Note: like CPython, the BOM check does not resolve UTF-8 aliases; only a name that normalizes to "utf_8" is accepted
94+
String encoding = matcher.group(1);
95+
if (hasBOM && !CharsetMapping.normalize(encoding).equals("utf_8")) {
96+
throw new InvalidEncodingException(encoding + " with BOM");
97+
}
98+
Charset charset = CharsetMapping.getCharset(encoding);
9299
if (charset == null) {
93-
throw new InvalidEncodingException(matcher.group(1));
100+
throw new InvalidEncodingException(encoding);
94101
}
95102
return charset;
96103
}
@@ -100,10 +107,17 @@ private static Charset tryGetCharsetFromLine(String line) {
100107
@TruffleBoundary
101108
public static Charset findEncodingStrict(BufferedReader reader) throws IOException {
102109
Charset charset;
103-
if ((charset = tryGetCharsetFromLine(reader.readLine())) != null) {
110+
// Like CPython, look for an encoding declaration only in the first two lines
111+
String firstLine = reader.readLine();
112+
boolean hasBOM = false;
113+
if (firstLine != null && firstLine.startsWith(UTF_8_BOM_IN_LATIN_1)) {
114+
hasBOM = true;
115+
firstLine = firstLine.substring(UTF_8_BOM_IN_LATIN_1.length());
116+
}
117+
if ((charset = tryGetCharsetFromLine(firstLine, hasBOM)) != null) {
104118
return charset;
105119
}
106-
if ((charset = tryGetCharsetFromLine(reader.readLine())) != null) {
120+
if ((charset = tryGetCharsetFromLine(reader.readLine(), hasBOM)) != null) {
107121
return charset;
108122
}
109123
return StandardCharsets.UTF_8;

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/antlr/Python3.g4

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,7 @@ locals
381381
}
382382
{ loopState = null; }
383383
{ int start = start(); }
384+
BOM?
384385
(
385386
NEWLINE
386387
| simple_stmt
@@ -401,6 +402,7 @@ locals
401402
{ _localctx.scope = scopeEnvironment.pushScope(_localctx.toString(), ScopeInfo.ScopeKind.Module); }
402403
{ loopState = null; }
403404
{ int start = start(); }
405+
BOM?
404406
(
405407
NEWLINE
406408
| stmt
@@ -418,7 +420,7 @@ eval_input returns [SSTNode result]
418420
locals [ com.oracle.graal.python.parser.ScopeInfo scope ]
419421
:
420422
{ scopeEnvironment.pushScope(_localctx.toString(), ScopeInfo.ScopeKind.Module); }
421-
testlist NEWLINE* EOF
423+
BOM? testlist NEWLINE* EOF
422424
{ $result = $testlist.result; }
423425
{scopeEnvironment.popScope(); }
424426
;
@@ -1884,6 +1886,8 @@ SKIP_
18841886
: ( SPACES | COMMENT | LINE_JOINING ) -> skip
18851887
;
18861888

1889+
BOM : '\uFEFF';
1890+
18871891
UNKNOWN_CHAR
18881892
: .
18891893
;

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ public static Charset getCharset(String encoding) {
6565
return null;
6666
}
6767

68-
private static String normalize(String encoding) {
68+
public static String normalize(String encoding) {
6969
return encoding.toLowerCase(Locale.ENGLISH).replaceAll("[^\\w.]+", "_");
7070
}
7171

0 commit comments

Comments
 (0)