Add handling for BOM

msimacek · msimacek · commit 45b97dff86a1 · 2020-06-16T10:37:36.000+02:00
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java
@@ -56,6 +56,7 @@
 
 public final class PythonFileDetector implements TruffleFile.FileTypeDetector {
 
+    private static final String UTF_8_BOM_IN_LATIN_1 = new String(new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}, StandardCharsets.ISO_8859_1);
     private static final Pattern ENCODING_COMMENT = Pattern.compile("^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+).*");
 
     @Override
@@ -82,15 +83,21 @@ public String getEncodingName() {
         }
     }
 
-    private static Charset tryGetCharsetFromLine(String line) {
+    private static Charset tryGetCharsetFromLine(String line, boolean hasBOM) {
         if (line == null) {
             return null;
         }
         Matcher matcher = ENCODING_COMMENT.matcher(line);
         if (matcher.matches()) {
-            Charset charset = CharsetMapping.getCharset(matcher.group(1));
+            // Files with UTF-8 BOM but different encoding declared are a SyntaxError
+            // Note that CPython does not accept UTF-8 aliases (e.g. utf8) in the BOM check
+            String encoding = matcher.group(1);
+            if (hasBOM && !CharsetMapping.normalize(encoding).equals("utf_8")) {
+                throw new InvalidEncodingException(encoding + " with BOM");
+            }
+            Charset charset = CharsetMapping.getCharset(encoding);
             if (charset == null) {
-                throw new InvalidEncodingException(matcher.group(1));
+                throw new InvalidEncodingException(encoding);
             }
             return charset;
         }
@@ -100,10 +107,17 @@ private static Charset tryGetCharsetFromLine(String line) {
     @TruffleBoundary
     public static Charset findEncodingStrict(BufferedReader reader) throws IOException {
         Charset charset;
-        if ((charset = tryGetCharsetFromLine(reader.readLine())) != null) {
+        // Read first two lines like CPython
+        String firstLine = reader.readLine();
+        boolean hasBOM = false;
+        if (firstLine != null && firstLine.startsWith(UTF_8_BOM_IN_LATIN_1)) {
+            hasBOM = true;
+            firstLine = firstLine.substring(UTF_8_BOM_IN_LATIN_1.length());
+        }
+        if ((charset = tryGetCharsetFromLine(firstLine, hasBOM)) != null) {
             return charset;
         }
-        if ((charset = tryGetCharsetFromLine(reader.readLine())) != null) {
+        if ((charset = tryGetCharsetFromLine(reader.readLine(), hasBOM)) != null) {
             return charset;
         }
         return StandardCharsets.UTF_8;
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/antlr/Python3.g4 b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/parser/antlr/Python3.g4
@@ -381,6 +381,7 @@ locals
 	}
 	{ loopState = null; }
 	{ int start = start(); }
+	BOM?
 	(
 		NEWLINE
 		| simple_stmt
@@ -401,6 +402,7 @@ locals
 	{  _localctx.scope = scopeEnvironment.pushScope(_localctx.toString(), ScopeInfo.ScopeKind.Module); }
 	{ loopState = null; }
 	{ int start = start(); }
+	BOM?
 	(
 		NEWLINE
 		| stmt
@@ -418,7 +420,7 @@ eval_input returns [SSTNode result]
 locals [ com.oracle.graal.python.parser.ScopeInfo scope ]
 :
 	{ scopeEnvironment.pushScope(_localctx.toString(), ScopeInfo.ScopeKind.Module); }
-	testlist NEWLINE* EOF
+	BOM? testlist NEWLINE* EOF
 	{ $result = $testlist.result; }
 	{scopeEnvironment.popScope(); }
 ;
@@ -1884,6 +1886,8 @@ SKIP_
  : ( SPACES | COMMENT | LINE_JOINING ) -> skip
  ;
 
+BOM : '\uFEFF';
+
 UNKNOWN_CHAR
  : .
  ;
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java
@@ -65,7 +65,7 @@ public static Charset getCharset(String encoding) {
         return null;
     }
 
-    private static String normalize(String encoding) {
+    public static String normalize(String encoding) {
         return encoding.toLowerCase(Locale.ENGLISH).replaceAll("[^\\w.]+", "_");
     }
 

Original file line number	Diff line number	Diff line change
`@@ -381,6 +381,7 @@ locals`
`381`	`381`	`}`
`382`	`382`	`{ loopState = null; }`
`383`	`383`	`{ int start = start(); }`
	`384`	`+ BOM?`
`384`	`385`	`(`
`385`	`386`	`NEWLINE`
`386`	`387`	`\| simple_stmt`
`@@ -401,6 +402,7 @@ locals`
`401`	`402`	`{ _localctx.scope = scopeEnvironment.pushScope(_localctx.toString(), ScopeInfo.ScopeKind.Module); }`
`402`	`403`	`{ loopState = null; }`
`403`	`404`	`{ int start = start(); }`
	`405`	`+ BOM?`
`404`	`406`	`(`
`405`	`407`	`NEWLINE`
`406`	`408`	`\| stmt`
`@@ -418,7 +420,7 @@ eval_input returns [SSTNode result]`
`418`	`420`	`locals [ com.oracle.graal.python.parser.ScopeInfo scope ]`
`419`	`421`	`:`
`420`	`422`	`{ scopeEnvironment.pushScope(_localctx.toString(), ScopeInfo.ScopeKind.Module); }`
`421`		`- testlist NEWLINE* EOF`
	`423`	`+ BOM? testlist NEWLINE* EOF`
`422`	`424`	`{ $result = $testlist.result; }`
`423`	`425`	`{scopeEnvironment.popScope(); }`
`424`	`426`	`;`
`@@ -1884,6 +1886,8 @@ SKIP_`
`1884`	`1886`	`: ( SPACES \| COMMENT \| LINE_JOINING ) -> skip`
`1885`	`1887`	`;`
`1886`	`1888`
	`1889`	`+BOM : '\uFEFF';`
	`1890`	`+`
`1887`	`1891`	`UNKNOWN_CHAR`
`1888`	`1892`	`: .`
`1889`	`1893`	`;`
Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@ public static Charset getCharset(String encoding) {`
`65`	`65`	`return null;`
`66`	`66`	`}`
`67`	`67`
`68`		`- private static String normalize(String encoding) {`
	`68`	`+ public static String normalize(String encoding) {`
`69`	`69`	`return encoding.toLowerCase(Locale.ENGLISH).replaceAll("[^\\w.]+", "_");`
`70`	`70`	`}`
`71`	`71`