50
50
import java .nio .charset .CharacterCodingException ;
51
51
import java .nio .charset .Charset ;
52
52
import java .nio .charset .CodingErrorAction ;
53
+ import java .nio .charset .IllegalCharsetNameException ;
54
+ import java .nio .charset .StandardCharsets ;
55
+ import java .nio .charset .UnsupportedCharsetException ;
53
56
import java .util .Arrays ;
54
57
import java .util .HashMap ;
55
58
import java .util .List ;
82
85
83
86
@ CoreFunctions (defineModule = "_codecs" )
84
87
public class CodecsModuleBuiltins extends PythonBuiltins {
85
- public static final String DEFAULT_ENCODING = "utf-8" ;
88
+ private static final Charset UTF32 = Charset . forName ( "utf-32" ) ;
86
89
87
90
// python to java codecs mapping
88
- private static final Map <String , String > PY_CODECS_ALIASES = new HashMap <>();
91
+ private static final Map <String , Charset > CHARSET_MAP = new HashMap <>();
89
92
static {
90
93
// ascii
91
- PY_CODECS_ALIASES .put ("us-ascii" , "us-ascii" );
92
- PY_CODECS_ALIASES .put ("ascii" , "us-ascii" );
93
- PY_CODECS_ALIASES .put ("646" , "us-ascii" );
94
+ CHARSET_MAP .put ("us-ascii" , StandardCharsets . US_ASCII );
95
+ CHARSET_MAP .put ("ascii" , StandardCharsets . US_ASCII );
96
+ CHARSET_MAP .put ("646" , StandardCharsets . US_ASCII );
94
97
95
98
// latin 1
96
- PY_CODECS_ALIASES .put ("iso-8859-1" , "iso-8859-1" );
97
- PY_CODECS_ALIASES .put ("latin-1" , "iso-8859-1" );
98
- PY_CODECS_ALIASES .put ("latin_1" , "iso-8859-1" );
99
- PY_CODECS_ALIASES .put ("iso-8859-1" , "iso-8859-1" );
100
- PY_CODECS_ALIASES .put ("iso8859-1" , "iso-8859-1" );
101
- PY_CODECS_ALIASES .put ("8859" , "iso-8859-1" );
102
- PY_CODECS_ALIASES .put ("cp819" , "iso-8859-1" );
103
- PY_CODECS_ALIASES .put ("latin" , "iso-8859-1" );
104
- PY_CODECS_ALIASES .put ("latin1" , "iso-8859-1" );
105
- PY_CODECS_ALIASES .put ("L1" , "iso-8859-1" );
99
+ CHARSET_MAP .put ("iso-8859-1" , StandardCharsets .ISO_8859_1 );
100
+ CHARSET_MAP .put ("latin-1" , StandardCharsets .ISO_8859_1 );
101
+ CHARSET_MAP .put ("latin_1" , StandardCharsets .ISO_8859_1 );
102
+ CHARSET_MAP .put ("iso8859-1" , StandardCharsets .ISO_8859_1 );
103
+ CHARSET_MAP .put ("8859" , StandardCharsets .ISO_8859_1 );
104
+ CHARSET_MAP .put ("cp819" , StandardCharsets .ISO_8859_1 );
105
+ CHARSET_MAP .put ("latin" , StandardCharsets .ISO_8859_1 );
106
+ CHARSET_MAP .put ("latin1" , StandardCharsets .ISO_8859_1 );
107
+ CHARSET_MAP .put ("L1" , StandardCharsets .ISO_8859_1 );
106
108
107
109
// utf-8
108
- PY_CODECS_ALIASES .put ("utf-8" , "utf-8" );
109
- PY_CODECS_ALIASES .put ("utf_8" , "utf-8" );
110
- PY_CODECS_ALIASES .put ("U8" , "utf-8" );
111
- PY_CODECS_ALIASES .put ("UTF" , "utf-8" );
112
- PY_CODECS_ALIASES .put ("utf8" , "utf-8" );
110
+ CHARSET_MAP .put ("utf-8" , StandardCharsets . UTF_8 );
111
+ CHARSET_MAP .put ("utf_8" , StandardCharsets . UTF_8 );
112
+ CHARSET_MAP .put ("U8" , StandardCharsets . UTF_8 );
113
+ CHARSET_MAP .put ("UTF" , StandardCharsets . UTF_8 );
114
+ CHARSET_MAP .put ("utf8" , StandardCharsets . UTF_8 );
113
115
114
116
// utf-16
115
- PY_CODECS_ALIASES .put ("utf-16" , "utf-16" );
116
- PY_CODECS_ALIASES .put ("utf_16" , "utf-16" );
117
- PY_CODECS_ALIASES .put ("U16" , "utf-16" );
118
- PY_CODECS_ALIASES .put ("utf16" , "utf-16" );
117
+ CHARSET_MAP .put ("utf-16" , StandardCharsets . UTF_16 );
118
+ CHARSET_MAP .put ("utf_16" , StandardCharsets . UTF_16 );
119
+ CHARSET_MAP .put ("U16" , StandardCharsets . UTF_16 );
120
+ CHARSET_MAP .put ("utf16" , StandardCharsets . UTF_16 );
119
121
// TODO BMP only
120
- PY_CODECS_ALIASES .put ("utf_16_be" , "utf-16be" );
121
- PY_CODECS_ALIASES .put ("utf_16_le" , "utf-16le" );
122
+ CHARSET_MAP .put ("utf_16_be" , StandardCharsets . UTF_16BE );
123
+ CHARSET_MAP .put ("utf_16_le" , StandardCharsets . UTF_16LE );
122
124
123
125
// utf-32
124
- PY_CODECS_ALIASES .put ("utf-32" , "utf-32" );
125
- PY_CODECS_ALIASES .put ("utf_32" , "utf-32" );
126
- PY_CODECS_ALIASES .put ("U32" , "utf-32" );
127
- PY_CODECS_ALIASES .put ("utf_32_be" , "utf-32be" );
128
- PY_CODECS_ALIASES .put ("utf_32_le" , "utf-32le" );
129
- PY_CODECS_ALIASES .put ("utf32" , "utf-32" );
126
+ final Charset utf32be = Charset .forName ("utf-32be" );
127
+ final Charset utf32le = Charset .forName ("utf-32le" );
128
+ final Charset ibm437 = Charset .forName ("IBM437" );
129
+
130
+ CHARSET_MAP .put ("utf-32" , UTF32 );
131
+ CHARSET_MAP .put ("utf_32" , UTF32 );
132
+ CHARSET_MAP .put ("U32" , UTF32 );
133
+ CHARSET_MAP .put ("utf-32be" , utf32be );
134
+ CHARSET_MAP .put ("utf_32_be" , utf32be );
135
+ CHARSET_MAP .put ("utf-32le" , utf32le );
136
+ CHARSET_MAP .put ("utf_32_le" , utf32le );
137
+ CHARSET_MAP .put ("utf32" , UTF32 );
130
138
// big5 big5-tw, csbig5 Traditional Chinese
131
139
// big5hkscs big5-hkscs, hkscs Traditional Chinese
132
140
// cp037 IBM037, IBM039 English
133
141
// cp424 EBCDIC-CP-HE, IBM424 Hebrew
134
142
// cp437 437, IBM437 English
135
- PY_CODECS_ALIASES .put ("IBM437" , "IBM437" );
136
- PY_CODECS_ALIASES .put ("IBM437 English" , "IBM437" );
137
- PY_CODECS_ALIASES .put ("437" , "IBM437" );
138
- PY_CODECS_ALIASES .put ("cp437" , "IBM437" );
143
+ CHARSET_MAP .put ("IBM437" , ibm437 );
144
+ CHARSET_MAP .put ("IBM437 English" , ibm437 );
145
+ CHARSET_MAP .put ("437" , ibm437 );
146
+ CHARSET_MAP .put ("cp437" , ibm437 );
139
147
// cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe
140
148
// cp720 Arabic
141
149
// cp737 Greek
@@ -221,15 +229,7 @@ public class CodecsModuleBuiltins extends PythonBuiltins {
221
229
222
230
@ TruffleBoundary
223
231
static Charset getCharset (String encoding ) {
224
- if (encoding == null ) {
225
- return Charset .forName (DEFAULT_ENCODING );
226
- } else {
227
- String val = PY_CODECS_ALIASES .get (encoding );
228
- if (val != null ) {
229
- return Charset .forName (val );
230
- }
231
- return Charset .forName (encoding );
232
- }
232
+ return CHARSET_MAP .get (encoding );
233
233
}
234
234
235
235
@ Override
@@ -412,15 +412,18 @@ Object encode(Object str, @SuppressWarnings("unused") Object encoding, @Suppress
412
412
@ TruffleBoundary
413
413
private PBytes encodeString (String self , String encoding , String errors ) {
414
414
CodingErrorAction errorAction = convertCodingErrorAction (errors );
415
+ Charset charset ;
416
+ try {
417
+ charset = getCharset (encoding );
418
+ } catch (UnsupportedCharsetException | IllegalCharsetNameException e ) {
419
+ throw raise (LookupError , "unknown encoding: %s" , encoding );
420
+ }
415
421
try {
416
- Charset charset = getCharset (encoding );
417
422
ByteBuffer encoded = charset .newEncoder ().onMalformedInput (errorAction ).onUnmappableCharacter (errorAction ).encode (CharBuffer .wrap (self ));
418
423
int n = encoded .remaining ();
419
424
byte [] data = new byte [n ];
420
425
encoded .get (data );
421
426
return factory ().createBytes (data );
422
- } catch (IllegalArgumentException e ) {
423
- throw raise (LookupError , "unknown encoding: %s" , encoding );
424
427
} catch (CharacterCodingException e ) {
425
428
throw raise (UnicodeEncodeError , e );
426
429
}
@@ -454,8 +457,7 @@ private Object[] encodeString(String self, String errors) {
454
457
CodingErrorAction errorAction = convertCodingErrorAction (errors );
455
458
456
459
try {
457
- Charset charset = getCharset ("utf-32" );
458
- ByteBuffer encoded = charset .newEncoder ().onMalformedInput (errorAction ).onUnmappableCharacter (errorAction ).encode (CharBuffer .wrap (self ));
460
+ ByteBuffer encoded = UTF32 .newEncoder ().onMalformedInput (errorAction ).onUnmappableCharacter (errorAction ).encode (CharBuffer .wrap (self ));
459
461
int n = encoded .remaining ();
460
462
ByteBuffer buf = ByteBuffer .allocate (n );
461
463
assert n % Integer .BYTES == 0 ;
@@ -543,12 +545,15 @@ private ByteBuffer getBytesBuffer(PIBytesLike bytesLike) {
543
545
@ TruffleBoundary
544
546
String decodeBytes (ByteBuffer bytes , String encoding , String errors ) {
545
547
CodingErrorAction errorAction = convertCodingErrorAction (errors );
548
+ Charset charset ;
549
+ try {
550
+ charset = getCharset (encoding );
551
+ } catch (UnsupportedCharsetException | IllegalCharsetNameException e ) {
552
+ throw raise (LookupError , "unknown encoding: %s" , encoding );
553
+ }
546
554
try {
547
- Charset charset = getCharset (encoding );
548
555
CharBuffer decoded = charset .newDecoder ().onMalformedInput (errorAction ).onUnmappableCharacter (errorAction ).decode (bytes );
549
556
return String .valueOf (decoded );
550
- } catch (IllegalArgumentException e ) {
551
- throw raise (LookupError , "unknown encoding: %s" , encoding );
552
557
} catch (CharacterCodingException e ) {
553
558
throw raise (UnicodeDecodeError , e );
554
559
}
@@ -607,8 +612,7 @@ String decodeBytes(ByteBuffer bytes, String errors) {
607
612
buf .putInt (val );
608
613
}
609
614
buf .flip ();
610
- Charset charset = getCharset ("utf-32" );
611
- CharBuffer decoded = charset .newDecoder ().onMalformedInput (errorAction ).onUnmappableCharacter (errorAction ).decode (buf );
615
+ CharBuffer decoded = UTF32 .newDecoder ().onMalformedInput (errorAction ).onUnmappableCharacter (errorAction ).decode (buf );
612
616
return String .valueOf (decoded );
613
617
} catch (CharacterCodingException e ) {
614
618
throw raise (UnicodeDecodeError , e );
@@ -626,7 +630,7 @@ Object lookup(String encoding) {
626
630
try {
627
631
getCharset (encoding );
628
632
return true ;
629
- } catch (IllegalArgumentException e ) {
633
+ } catch (UnsupportedCharsetException | IllegalCharsetNameException e ) {
630
634
return PNone .NONE ;
631
635
}
632
636
}
0 commit comments