Skip to content

Commit cc7aea8

Browse files
committed
Implement unicodedata.lookup()
1 parent 2e0ca0f commit cc7aea8

File tree

6 files changed

+110
-36
lines changed

6 files changed

+110
-36
lines changed

graalpython/com.oracle.graal.python.test/src/tests/test_unicodedata.py

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
22
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
33
#
44
# The Universal Permissive License (UPL), Version 1.0
@@ -37,34 +37,41 @@
3737
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
3838
# SOFTWARE.
3939

40+
import unicodedata
41+
import unittest
4042

41-
def assert_raises(err, fn, *args, **kwargs):
42-
raised = False
43-
try:
44-
fn(*args, **kwargs)
45-
except err:
46-
raised = True
47-
assert raised
43+
class TestUnicodedata(unittest.TestCase):
4844

45+
def test_args_validation(self):
46+
self.assertRaises(TypeError, unicodedata.category, None)
47+
self.assertRaises(TypeError, unicodedata.bidirectional, None)
48+
self.assertRaises(TypeError, unicodedata.name, None)
4949

50-
def test_args_validation():
51-
import unicodedata
52-
assert_raises(TypeError, unicodedata.category, None)
53-
assert_raises(TypeError, unicodedata.bidirectional, None)
54-
assert_raises(TypeError, unicodedata.name, None)
5550

51+
def test_normalize(self):
52+
self.assertRaises(TypeError, unicodedata.normalize)
53+
self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
54+
assert unicodedata.normalize('NFKC', '') == ''
5655

57-
def test_normalize():
58-
import unicodedata
59-
assert_raises(TypeError, unicodedata.normalize)
60-
assert_raises(ValueError, unicodedata.normalize, 'unknown', 'xx')
61-
assert unicodedata.normalize('NFKC', '') == ''
6256

57+
def test_category(self):
58+
assert unicodedata.category('\uFFFE') == 'Cn'
59+
assert unicodedata.category('a') == 'Ll'
60+
assert unicodedata.category('A') == 'Lu'
61+
self.assertRaises(TypeError, unicodedata.category)
62+
self.assertRaises(TypeError, unicodedata.category, 'xx')
63+
64+
65+
def test_lookup(self):
66+
unicode_name = "ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH"
67+
self.assertEqual(unicodedata.lookup(unicode_name), "\u0616")
68+
69+
unicode_name_alias = "ARABIC SMALL HIGH LIGATURE ALEF WITH YEH BARREE"
70+
self.assertEqual(unicodedata.lookup(unicode_name_alias), "\u0616")
71+
72+
with self.assertRaisesRegex(KeyError, "undefined character name 'wrong-name'"):
73+
unicodedata.lookup("wrong-name")
74+
75+
with self.assertRaisesRegex(KeyError, "name too long"):
76+
unicodedata.lookup("a" * 257)
6377

64-
def test_category():
65-
import unicodedata
66-
assert unicodedata.category('\uFFFE') == 'Cn'
67-
assert unicodedata.category('a') == 'Ll'
68-
assert unicodedata.category('A') == 'Lu'
69-
assert_raises(TypeError, unicodedata.category)
70-
assert_raises(TypeError, unicodedata.category, 'xx')

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags/test_ucn.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,5 @@ test.test_ucn.UnicodeNamesTest.test_errors @ darwin-arm64,darwin-x86_64,linux-aa
66
test.test_ucn.UnicodeNamesTest.test_general @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
77
test.test_ucn.UnicodeNamesTest.test_hangul_syllables @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
88
test.test_ucn.UnicodeNamesTest.test_misc_symbols @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
9-
test.test_ucn.UnicodeNamesTest.test_named_sequences_full @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
109
test.test_ucn.UnicodeNamesTest.test_named_sequences_names_in_pua_range @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
11-
test.test_ucn.UnicodeNamesTest.test_named_sequences_sample @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64
1210
test.test_ucn.UnicodeNamesTest.test_strict_error_handling @ darwin-arm64,darwin-x86_64,linux-aarch64,linux-x86_64,win32-AMD64

graalpython/com.oracle.graal.python.test/src/tests/unittest_tags_bytecode_dsl/test_ucn.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,5 @@ test.test_ucn.UnicodeNamesTest.test_errors @ linux-x86_64
66
test.test_ucn.UnicodeNamesTest.test_general @ linux-x86_64
77
test.test_ucn.UnicodeNamesTest.test_hangul_syllables @ linux-x86_64
88
test.test_ucn.UnicodeNamesTest.test_misc_symbols @ linux-x86_64
9-
test.test_ucn.UnicodeNamesTest.test_named_sequences_full @ linux-x86_64
109
test.test_ucn.UnicodeNamesTest.test_named_sequences_names_in_pua_range @ linux-x86_64
11-
test.test_ucn.UnicodeNamesTest.test_named_sequences_sample @ linux-x86_64
1210
test.test_ucn.UnicodeNamesTest.test_strict_error_handling @ linux-x86_64

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/UnicodeDataModuleBuiltins.java

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,16 @@
4242

4343
import static com.oracle.graal.python.nodes.BuiltinNames.J_UNICODEDATA;
4444
import static com.oracle.graal.python.nodes.BuiltinNames.T_UNICODEDATA;
45+
import static com.oracle.graal.python.runtime.exception.PythonErrorType.KeyError;
4546
import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError;
4647
import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
4748
import static com.oracle.graal.python.util.PythonUtils.toTruffleStringUncached;
4849

4950
import java.util.List;
5051

5152
import com.oracle.graal.python.builtins.objects.module.PythonModule;
53+
import com.oracle.truffle.api.strings.TruffleString.FromJavaStringNode;
54+
import com.oracle.truffle.api.strings.TruffleString.ToJavaStringNode;
5255
import org.graalvm.shadowed.com.ibm.icu.lang.UCharacter;
5356
import org.graalvm.shadowed.com.ibm.icu.lang.UProperty;
5457
import org.graalvm.shadowed.com.ibm.icu.text.Normalizer2;
@@ -140,8 +143,8 @@ static TruffleString normalize(@SuppressWarnings("unused") TruffleString form, T
140143
@SuppressWarnings("unused") @Cached("form") TruffleString cachedForm,
141144
@Cached("getNormalizer(cachedForm)") Normalizer2 cachedNormalizer,
142145
@SuppressWarnings("unused") @Cached TruffleString.EqualNode equalNode,
143-
@Cached TruffleString.ToJavaStringNode toJavaStringNode,
144-
@Exclusive @Cached TruffleString.FromJavaStringNode fromJavaStringNode) {
146+
@Cached ToJavaStringNode toJavaStringNode,
147+
@Exclusive @Cached FromJavaStringNode fromJavaStringNode) {
145148
return fromJavaStringNode.execute(normalize(toJavaStringNode.execute(unistr), cachedNormalizer), TS_ENCODING);
146149
}
147150

@@ -188,6 +191,72 @@ protected ArgumentClinicProvider getArgumentClinic() {
188191
}
189192
}
190193

194+
// unicodedata.lookup(name)
195+
@Builtin(name = "lookup", minNumOfPositionalArgs = 1, numOfPositionalOnlyArgs = 1, parameterNames = {"name"})
196+
@ArgumentClinic(name = "name", conversion = ArgumentClinic.ClinicConversion.TString)
197+
@GenerateNodeFactory
198+
public abstract static class LookupNode extends PythonUnaryClinicBuiltinNode {
199+
200+
private static final int NAME_MAX_LENGTH = 256;
201+
202+
@Specialization
203+
@TruffleBoundary
204+
static Object lookup(TruffleString name,
205+
@Bind Node inliningTarget) {
206+
String nameString = ToJavaStringNode.getUncached().execute(name);
207+
if (nameString.length() > NAME_MAX_LENGTH) {
208+
throw PRaiseNode.raiseStatic(inliningTarget, KeyError, ErrorMessages.NAME_TOO_LONG);
209+
}
210+
211+
// TODO: support Unicode character named sequences (GR-68227)
212+
// see test.test_ucn.UnicodeNamesTest.test_named_sequences_full
213+
String character = getCharacterByUnicodeName(nameString);
214+
if (character == null) {
215+
character = getCharacterByUnicodeNameAlias(nameString);
216+
}
217+
if (character == null) {
218+
throw PRaiseNode.raiseStatic(inliningTarget, KeyError, ErrorMessages.UNDEFINED_CHARACTER_NAME, name);
219+
}
220+
221+
return FromJavaStringNode.getUncached().execute(character, TS_ENCODING);
222+
}
223+
224+
@Override
225+
protected ArgumentClinicProvider getArgumentClinic() {
226+
return UnicodeDataModuleBuiltinsClinicProviders.LookupNodeClinicProviderGen.INSTANCE;
227+
}
228+
229+
/**
230+
* Finds a Unicode code point by its Unicode name and returns it as a single character
231+
* String. Returns null if the name is not found.
232+
*/
233+
@TruffleBoundary
234+
private static String getCharacterByUnicodeName(String unicodeName) {
235+
int codepoint = UCharacter.getCharFromName(unicodeName);
236+
237+
if (codepoint < 0) {
238+
return null;
239+
}
240+
241+
return UCharacter.toString(codepoint);
242+
}
243+
244+
/**
245+
* Finds a Unicode code point by its Unicode name alias and returns it as a single character
246+
* String. Returns null if the name alias is not found.
247+
*/
248+
@TruffleBoundary
249+
private static String getCharacterByUnicodeNameAlias(String unicodeName) {
250+
int codepoint = UCharacter.getCharFromNameAlias(unicodeName);
251+
252+
if (codepoint < 0) {
253+
return null;
254+
}
255+
256+
return UCharacter.toString(codepoint);
257+
}
258+
}
259+
191260
// unicodedata.name(chr, default)
192261
@Builtin(name = "name", minNumOfPositionalArgs = 1, parameterNames = {"chr", "default"})
193262
@ArgumentClinic(name = "chr", conversion = ArgumentClinic.ClinicConversion.CodePoint)
@@ -197,7 +266,7 @@ public abstract static class NameNode extends PythonBinaryClinicBuiltinNode {
197266
@Specialization
198267
static Object name(int cp, Object defaultValue,
199268
@Bind Node inliningTarget,
200-
@Cached TruffleString.FromJavaStringNode fromJavaStringNode,
269+
@Cached FromJavaStringNode fromJavaStringNode,
201270
@Cached PRaiseNode raiseNode) {
202271
String result = getUnicodeName(cp);
203272
if (result == null) {
@@ -222,7 +291,7 @@ protected ArgumentClinicProvider getArgumentClinic() {
222291
public abstract static class BidirectionalNode extends PythonUnaryClinicBuiltinNode {
223292
@Specialization
224293
static TruffleString bidirectional(int chr,
225-
@Cached TruffleString.FromJavaStringNode fromJavaStringNode) {
294+
@Cached FromJavaStringNode fromJavaStringNode) {
226295
return fromJavaStringNode.execute(getBidiClassName(chr), TS_ENCODING);
227296
}
228297

@@ -244,7 +313,7 @@ protected ArgumentClinicProvider getArgumentClinic() {
244313
public abstract static class CategoryNode extends PythonUnaryClinicBuiltinNode {
245314
@Specialization
246315
static TruffleString category(int chr,
247-
@Cached TruffleString.FromJavaStringNode fromJavaStringNode) {
316+
@Cached FromJavaStringNode fromJavaStringNode) {
248317
return fromJavaStringNode.execute(getCategoryName(chr), TS_ENCODING);
249318
}
250319

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/ErrorMessages.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ public abstract class ErrorMessages {
530530
public static final TruffleString NAME_MUST_BE_A_STRING = tsLiteral("__name__ must be a string");
531531
public static final TruffleString NAME_NOT_DEFINED = tsLiteral("name '%s' is not defined");
532532
public static final TruffleString NAME_NOT_IN_GLOBALS = tsLiteral("'__name__' not in globals");
533+
public static final TruffleString NAME_TOO_LONG = tsLiteral("name too long");
533534
public static final TruffleString NAMEDEXPR_TARGET_MUST_BE_A_NAME = tsLiteral("NamedExpr target must be a Name");
534535
public static final TruffleString NAMELESS_MODULE = tsLiteral("nameless module");
535536
public static final TruffleString NATIVE_S_SUBTYPES_NOT_IMPLEMENTED = tsLiteral("native %s subtypes not implemented");
@@ -745,6 +746,7 @@ public abstract class ErrorMessages {
745746
public static final TruffleString UNEXPECTED_CONSTANT_INSIDE_OF_A_LITERAL_PATTERN = tsLiteral("unexpected constant inside of a literal pattern");
746747
public static final TruffleString UNEXPECTED_KEYWORD_ARGS = tsLiteral("%s: unexpected keyword arguments");
747748
public static final TruffleString UNEXPECTED_S_IN_FIELD_NAME = tsLiteral("unexpected %s in field name");
749+
public static final TruffleString UNDEFINED_CHARACTER_NAME = tsLiteral("undefined character name '%s'");
748750
public static final TruffleString UNHASHABLE_TYPE_P = tsLiteral("unhashable type: '%p'");
749751
public static final TruffleString UNHASHABLE_TYPE = tsLiteral("unhashable type");
750752
public static final TruffleString UNINITIALIZED_S_OBJECT = tsLiteral("uninitialized classmethod object");

graalpython/lib-graalpython/unicodedata.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
22
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
33
#
44
# The Universal Permissive License (UPL), Version 1.0
@@ -40,6 +40,6 @@
4040
__graalpython__.import_current_as_named_module_with_delegate(
4141
module_name="unicodedata",
4242
delegate_name="_cpython_unicodedata",
43-
delegate_attributes=['ucd_3_2_0', 'lookup', 'east_asian_width', 'combining'],
43+
delegate_attributes=['ucd_3_2_0', 'east_asian_width', 'combining'],
4444
wrap_methods=False,
4545
owner_globals=globals())

0 commit comments

Comments
 (0)