40
40
*/
41
41
package com .oracle .graal .python ;
42
42
43
+ import java .io .BufferedReader ;
43
44
import java .io .IOException ;
44
- import com . oracle . truffle . api . TruffleFile ;
45
+ import java . io . StringReader ;
45
46
import java .nio .charset .Charset ;
47
+ import java .nio .charset .StandardCharsets ;
48
+ import java .util .regex .Matcher ;
49
+ import java .util .regex .Pattern ;
50
+
51
+ import com .oracle .graal .python .util .CharsetMapping ;
52
+ import com .oracle .truffle .api .TruffleFile ;
46
53
47
54
public final class PythonFileDetector implements TruffleFile .FileTypeDetector {
48
55
56
// Matches a whole line that is a comment (optional leading space, tab or form-feed characters,
// then '#') containing "coding:" or "coding=" followed by an encoding name. The encoding name
// (letters, digits, '-', '_' and '.') is captured in group 1.
+ private static final Pattern ENCODING_COMMENT = Pattern .compile ("^[ \t \f ]*#.*?coding[:=][ \t ]*([-_.a-zA-Z0-9]+).*" );
57
+
49
58
@ Override
50
59
public String findMimeType (TruffleFile file ) throws IOException {
51
60
String fileName = file .getName ();
@@ -55,8 +64,72 @@ public String findMimeType(TruffleFile file) throws IOException {
55
64
return null ;
56
65
}
57
66
67
+ public static class InvalidEncodingException extends RuntimeException {
68
+ private static final long serialVersionUID = 1L ;
69
+
70
+ private final String encodingName ;
71
+
72
+ public InvalidEncodingException (String encodingName ) {
73
+ super ("Invalid or unsupported encoding: " + encodingName );
74
+ this .encodingName = encodingName ;
75
+ }
76
+
77
+ public String getEncodingName () {
78
+ return encodingName ;
79
+ }
80
+ }
81
+
82
+ public static Charset tryGetCharsetFromLine (String line ) {
83
+ if (line == null ) {
84
+ return null ;
85
+ }
86
+ Matcher matcher = ENCODING_COMMENT .matcher (line );
87
+ if (matcher .matches ()) {
88
+ Charset charset = CharsetMapping .getCharset (matcher .group (1 ));
89
+ if (charset == null ) {
90
+ throw new InvalidEncodingException (matcher .group (1 ));
91
+ }
92
+ return charset ;
93
+ }
94
+ return null ;
95
+ }
96
+
97
+ public static Charset findEncodingStrict (BufferedReader reader ) throws IOException {
98
+ Charset charset ;
99
+ if ((charset = tryGetCharsetFromLine (reader .readLine ())) != null ) {
100
+ return charset ;
101
+ }
102
+ if ((charset = tryGetCharsetFromLine (reader .readLine ())) != null ) {
103
+ return charset ;
104
+ }
105
+ return StandardCharsets .UTF_8 ;
106
+ }
107
+
108
+ public static Charset findEncodingStrict (TruffleFile file ) throws IOException {
109
+ // Using Latin-1 to read the header avoids exceptions on non-ascii characters
110
+ try (BufferedReader reader = file .newBufferedReader (StandardCharsets .ISO_8859_1 )) {
111
+ return findEncodingStrict (reader );
112
+ }
113
+ }
114
+
115
+ public static Charset findEncodingStrict (String source ) {
116
+ try (BufferedReader reader = new BufferedReader (new StringReader (source ))) {
117
+ return findEncodingStrict (reader );
118
+ } catch (IOException e ) {
119
+ // Shouldn't happen on a string
120
+ throw new RuntimeException (e );
121
+ }
122
+ }
123
+
58
124
@ Override
59
125
public Charset findEncoding (TruffleFile file ) throws IOException {
60
- return null ;
126
+ try {
127
+ return findEncodingStrict (file );
128
+ } catch (InvalidEncodingException e ) {
129
+ // We cannot throw a SyntaxError at this point, but the parser will revalidate this.
130
+ // Return Latin-1 so that it doesn't throw encoding errors before getting to the
131
+ // parser, because Truffle would otherwise default to UTF-8
132
+ return StandardCharsets .ISO_8859_1 ;
133
+ }
61
134
}
62
135
}
0 commit comments