56
56
57
57
public final class PythonFileDetector implements TruffleFile .FileTypeDetector {
58
58
59
+ private static final String UTF_8_BOM_IN_LATIN_1 = new String (new byte []{(byte ) 0xEF , (byte ) 0xBB , (byte ) 0xBF }, StandardCharsets .ISO_8859_1 );
59
60
private static final Pattern ENCODING_COMMENT = Pattern .compile ("^[ \t \f ]*#.*?coding[:=][ \t ]*([-_.a-zA-Z0-9]+).*" );
60
61
61
62
@ Override
@@ -82,15 +83,21 @@ public String getEncodingName() {
82
83
}
83
84
}
84
85
85
- private static Charset tryGetCharsetFromLine (String line ) {
86
+ private static Charset tryGetCharsetFromLine (String line , boolean hasBOM ) {
86
87
if (line == null ) {
87
88
return null ;
88
89
}
89
90
Matcher matcher = ENCODING_COMMENT .matcher (line );
90
91
if (matcher .matches ()) {
91
- Charset charset = CharsetMapping .getCharset (matcher .group (1 ));
92
+ // Files with UTF-8 BOM but different encoding declared are a SyntaxError
93
+ // Note that CPython ignores UTF-8 aliases for the BOM check
94
+ String encoding = matcher .group (1 );
95
+ if (hasBOM && !CharsetMapping .normalize (encoding ).equals ("utf_8" )) {
96
+ throw new InvalidEncodingException (encoding + " with BOM" );
97
+ }
98
+ Charset charset = CharsetMapping .getCharset (encoding );
92
99
if (charset == null ) {
93
- throw new InvalidEncodingException (matcher . group ( 1 ) );
100
+ throw new InvalidEncodingException (encoding );
94
101
}
95
102
return charset ;
96
103
}
@@ -100,10 +107,17 @@ private static Charset tryGetCharsetFromLine(String line) {
100
107
@ TruffleBoundary
101
108
public static Charset findEncodingStrict (BufferedReader reader ) throws IOException {
102
109
Charset charset ;
103
- if ((charset = tryGetCharsetFromLine (reader .readLine ())) != null ) {
110
+ // Read first two lines like CPython
111
+ String firstLine = reader .readLine ();
112
+ boolean hasBOM = false ;
113
+ if (firstLine != null && firstLine .startsWith (UTF_8_BOM_IN_LATIN_1 )) {
114
+ hasBOM = true ;
115
+ firstLine = firstLine .substring (UTF_8_BOM_IN_LATIN_1 .length ());
116
+ }
117
+ if ((charset = tryGetCharsetFromLine (firstLine , hasBOM )) != null ) {
104
118
return charset ;
105
119
}
106
- if ((charset = tryGetCharsetFromLine (reader .readLine ())) != null ) {
120
+ if ((charset = tryGetCharsetFromLine (reader .readLine (), hasBOM )) != null ) {
107
121
return charset ;
108
122
}
109
123
return StandardCharsets .UTF_8 ;
0 commit comments