Skip to content

Commit c8a8f62

Browse files
committed
[GR-23268] UTF7 support
PullRequest: graalpython/1251
2 parents 53785fc + 81b1423 commit c8a8f62

File tree

5 files changed

+52
-6
lines changed

5 files changed

+52
-6
lines changed

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_unicode.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_bytes_comparison
1515
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_case_operation_overflow
1616
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_center
17+
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_codecs
1718
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_codecs_idna
1819
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_comparison
1920
*graalpython.lib-python.3.test.test_unicode.UnicodeTest.test_concatenation

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/resources/reflect-config.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,13 @@
4242
{
4343
"name":"com.ibm.icu.text.BreakIteratorFactory",
4444
"methods":[{"name":"<init>","parameterTypes":[] }]
45+
},
46+
{
47+
"name":"com.ibm.icu.charset.CharsetUTF7",
48+
"methods":[{"name":"<init>","parameterTypes":["java.lang.String", "java.lang.String", "java.lang.String[]"]}]
49+
},
50+
{
51+
"name":"com.ibm.icu.charset.CharsetHZ",
52+
"methods":[{"name":"<init>","parameterTypes":["java.lang.String", "java.lang.String", "java.lang.String[]"]}]
4553
}
4654
]

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
import java.util.Locale;
4848
import java.util.Map;
4949

50+
import com.ibm.icu.charset.CharsetICU;
5051
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
5152

5253
/**
@@ -78,10 +79,20 @@ public static String normalize(String encoding) {
7879
private static Charset getJavaCharset(String name) {
7980
Charset charset = JAVA_CHARSETS.get(name);
8081
if (charset == null) {
81-
try {
82-
charset = Charset.forName(name);
83-
} catch (UnsupportedCharsetException e) {
84-
// Let it stay null
82+
// Important: When adding an additional ICU4J charset, its implementation class needs to be
83+
// added to reflect-config.json
84+
if (name.equals("UTF-7") || name.equals("HZ")) {
85+
try {
86+
charset = CharsetICU.forNameICU(name);
87+
} catch (UnsupportedCharsetException e) {
88+
// Let it stay null
89+
}
90+
} else {
91+
try {
92+
charset = Charset.forName(name);
93+
} catch (UnsupportedCharsetException e) {
94+
// Let it stay null
95+
}
8596
}
8697
JAVA_CHARSETS.put(name, charset);
8798
}
@@ -161,7 +172,7 @@ private static void addAlias(String alias, String pythonName) {
161172
addMapping("gb2312", "GB2312");
162173
addMapping("gbk", "GBK");
163174
addMapping("hp_roman8", null);
164-
addMapping("hz", "GB2312");
175+
addMapping("hz", "HZ");
165176
addMapping("iso2022_jp_1", null);
166177
addMapping("iso2022_jp_2004", null);
167178
addMapping("iso2022_jp_2", "ISO-2022-JP-2");
@@ -213,7 +224,7 @@ private static void addAlias(String alias, String pythonName) {
213224
addMapping("utf_32_be", "UTF-32BE");
214225
addMapping("utf_32_le", "UTF-32LE");
215226
addMapping("utf_32", "UTF-32");
216-
addMapping("utf_7", null);
227+
addMapping("utf_7", "UTF-7");
217228
addMapping("utf_8", "UTF-8");
218229

219230
// Generated from encodings.aliases.aliases, removed non-language encodings like base64

mx.graalpython/mx_graalpython.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1438,10 +1438,25 @@ def _register_bench_suites(namespace):
14381438
mx_benchmark.add_bm_suite(java_bench_suite)
14391439

14401440

1441+
class CharsetFilteringPariticpant:
1442+
"""
1443+
Remove charset providers from the resulting JAR distribution. This is done to prevent libraries (icu4j-charset)
1444+
from implicitly adding their charsets to the native image. We need to add them explicitly in a controlled way.
1445+
"""
1446+
def __opened__(self, archive, src_archive, services):
1447+
self.__services = services
1448+
1449+
def __closing__(self):
1450+
self.__services.pop('java.nio.charset.spi.CharsetProvider', None)
1451+
1452+
14411453
def mx_post_parse_cmd_line(namespace):
14421454
# all projects are now available at this time
14431455
_register_vms(namespace)
14441456
_register_bench_suites(namespace)
1457+
for dist in mx.suite('graalpython').dists:
1458+
if hasattr(dist, 'set_archiveparticipant'):
1459+
dist.set_archiveparticipant(CharsetFilteringPariticpant())
14451460

14461461

14471462
def python_coverage(args):

mx.graalpython/suite.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,16 @@
124124
"version" : "66.1",
125125
},
126126
},
127+
"ICU4J-CHARSET-66.1" : {
128+
"moduleName" : "com.ibm.icu.charset",
129+
"sha1" : "292f8736709f5c69afd9275faf92fac11a342b3e",
130+
"sourceSha1" : "e981b54cb052d22f5512626dac6a29c79b1460d6",
131+
"maven" : {
132+
"groupId" : "com.ibm.icu",
133+
"artifactId" : "icu4j-charset",
134+
"version" : "66.1",
135+
},
136+
},
127137
},
128138

129139
# --------------------------------------------------------------------------------------------------------------
@@ -229,6 +239,7 @@
229239
"sulong:SULONG_API",
230240
"XZ-1.8",
231241
"ICU4J-66.1",
242+
"ICU4J-CHARSET-66.1",
232243
],
233244
"buildDependencies": ["com.oracle.graal.python.parser.antlr"],
234245
"jacoco": "include",

0 commit comments

Comments
 (0)