diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index 3236213de5aaa2..0bdb201d3536df 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -273,6 +273,14 @@ Other language changes
   This speeds up class creation, and helps avoid reference cycles.
   (Contributed by Petr Viktorin in :gh:`135228`.)
 
+* Added support for Unicode ``U+XXXX`` notation (four to six hexadecimal
+  digits) in :func:`unicodedata.lookup` and ``'\N{...}'``. For example::
+
+     >>> "\N{U+03BB}"
+     'λ'
+
+  (Contributed by Stan Ulbrych in :gh:`62814`.)
+
 
 New modules
 ===========
diff --git a/Include/internal/pycore_ucnhash.h b/Include/internal/pycore_ucnhash.h
index 1561dfbb3150d3..2dfd1d13123175 100644
--- a/Include/internal/pycore_ucnhash.h
+++ b/Include/internal/pycore_ucnhash.h
@@ -26,6 +26,12 @@ typedef struct {
     int (*getcode)(const char* name, int namelen, Py_UCS4* code,
                    int with_named_seq);
 
+    /* Parse "U+XXXX" code point notation (4 to 6 hexadecimal digits).
+       Returns 1 and stores the code point in *code on success, 0 if name
+       does not start with "U+", and -1 with an exception set if the
+       notation is malformed or out of range. */
+    int (*parse_u_plus)(const char* name, int namelen, Py_UCS4* code);
+
 } _PyUnicode_Name_CAPI;
 
 extern _PyUnicode_Name_CAPI* _PyUnicode_GetNameCAPI(void);
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
index 0e2c25aaff2fe9..f84d56d02970d5 100644
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -200,11 +200,30 @@ def check_version(testfile):
             with self.assertRaises(KeyError):
                 unicodedata.ucd_3_2_0.lookup(seqname)
 
+    def test_u_plus_notation(self):
+        self.assertEqual(unicodedata.lookup("U+0041"), "A")
+        self.assertEqual(unicodedata.lookup("U+00410"), "А")  # Cyrillic capital A
+        self.assertEqual(unicodedata.lookup("U+004100"), "䄀")
+        self.assertEqual(unicodedata.lookup("U+03BB"), "λ")
+        self.assertEqual(unicodedata.lookup("U+10FFFF"), "\U0010ffff")
+        self.assertEqual(br"\N{U+03BB}".decode("unicode_escape"), "λ")
+
     def test_errors(self):
         self.assertRaises(TypeError, unicodedata.name)
         self.assertRaises(TypeError, unicodedata.name, 'xx')
         self.assertRaises(TypeError, unicodedata.lookup)
         self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+00')
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+0000000')
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+000Z')
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+110000')
+        # Only bare hexadecimal digits are accepted after "U+".
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+0x41')
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+ 041')
+        self.assertRaises(ValueError, unicodedata.lookup, 'U++041')
+        # Malformed notation in an escape goes through the codec error path.
+        self.assertRaises(UnicodeDecodeError,
+                          br"\N{U+00}".decode, "unicode_escape")
 
     def test_strict_error_handling(self):
         # bogus character name
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-09-06-20-31-48.gh-issue-62814.PdwlEc.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-09-06-20-31-48.gh-issue-62814.PdwlEc.rst
new file mode 100644
index 00000000000000..f8060abe2b0260
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-09-06-20-31-48.gh-issue-62814.PdwlEc.rst
@@ -0,0 +1 @@
+``\N{}`` now supports ``U+XXXX`` notation (four to six hexadecimal digits).
diff --git a/Misc/NEWS.d/next/Library/2025-09-06-20-31-05.gh-issue-62814.Nt6o-t.rst b/Misc/NEWS.d/next/Library/2025-09-06-20-31-05.gh-issue-62814.Nt6o-t.rst
new file mode 100644
index 00000000000000..5f50b19a36b56d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-06-20-31-05.gh-issue-62814.Nt6o-t.rst
@@ -0,0 +1 @@
+:func:`unicodedata.lookup` now supports ``U+XXXX`` notation (four to six hexadecimal digits).
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 41725e5aec1641..572a1a365b4ded 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1448,6 +1448,62 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
 }
 
 
+/* Parse "U+XXXX" code point notation (4 to 6 hexadecimal digits).
+
+   Returns 1 and stores the code point in *code on success, 0 with no
+   exception set if the name does not start with "U+", and -1 with
+   ValueError set if the notation is malformed or out of range. */
+static int
+_parse_u_plus(const char *codepoint, int namelen, Py_UCS4 *code)
+{
+    /* Check the length first so that codepoint[1] is only read when the
+       name really has at least two characters. */
+    if (namelen < 2 || codepoint[0] != 'U' || codepoint[1] != '+') {
+        return 0;
+    }
+    const int hex_len = namelen - 2;
+    if (hex_len < 4 || hex_len > 6) {
+        PyErr_SetString(PyExc_ValueError, "invalid codepoint notation length");
+        return -1;
+    }
+
+    /* Decode the digits by hand.  strtoul() would also accept leading
+       whitespace, a sign and a "0x" prefix, and it stops at an embedded
+       NUL, so names such as "U+0x41" or "U+ 041" would slip through. */
+    Py_UCS4 v = 0;
+    for (int i = 2; i < namelen; i++) {
+        const char c = codepoint[i];
+        int digit;
+        if (c >= '0' && c <= '9') {
+            digit = c - '0';
+        }
+        else if (c >= 'a' && c <= 'f') {
+            digit = c - 'a' + 10;
+        }
+        else if (c >= 'A' && c <= 'F') {
+            digit = c - 'A' + 10;
+        }
+        else {
+            goto invalid;
+        }
+        v = (v << 4) | (Py_UCS4)digit;
+    }
+    /* At most 6 digits were read, so v cannot overflow; it can still
+       exceed the last valid code point. */
+    if (v > 0x10ffff) {
+        goto invalid;
+    }
+
+    *code = v;
+    return 1;
+
+invalid:
+    PyErr_Format(PyExc_ValueError,
+                 "invalid codepoint notation '%.*s'", namelen, codepoint);
+    return -1;
+}
+
+
 static int
 capi_getcode(const char* name, int namelen, Py_UCS4* code,
              int with_named_seq)
@@ -1458,6 +1514,13 @@ capi_getcode(const char* name, int namelen, Py_UCS4* code,
     return _check_alias_and_seq(code, with_named_seq);
 }
 
+/* Capsule entry point for _parse_u_plus(); same return values. */
+static int
+capi_parse_u_plus(const char* name, int namelen, Py_UCS4* code)
+{
+    return _parse_u_plus(name, namelen, code);
+}
+
 static void
 unicodedata_destroy_capi(PyObject *capsule)
 {
@@ -1475,6 +1538,7 @@ unicodedata_create_capi(void)
     }
     capi->getname = capi_getucname;
     capi->getcode = capi_getcode;
+    capi->parse_u_plus = capi_parse_u_plus;
 
     PyObject *capsule = PyCapsule_New(capi,
                                       PyUnicodeData_CAPSULE_NAME,
@@ -1543,6 +1607,22 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
 {
     Py_UCS4 code;
     unsigned int index;
+
+    /* Try "U+XXXX" notation first.  The length check keeps the narrowing
+       cast to int from truncating a huge name_length into a small value
+       that could pass for a valid "U+XXXX" string. */
+    int check = 0;
+    if (name_length <= INT_MAX) {
+        check = _parse_u_plus(name, (int)name_length, &code);
+    }
+    if (check == 1) {
+        return PyUnicode_FromOrdinal(code);
+    }
+    if (check == -1) {
+        /* Error set in _parse_u_plus */
+        return NULL;
+    }
+
     if (name_length > NAME_MAXLEN) {
         PyErr_SetString(PyExc_KeyError, "name too long");
         return NULL;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 4c88e4c1fdca2e..b52e46913f9549 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6783,9 +6783,26 @@ _PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
                     /* found a name. look it up in the unicode database */
                     s++;
                     ch = 0xffffffff; /* in case 'getcode' messes up */
-                    if (namelen <= INT_MAX &&
-                        ucnhash_capi->getcode(start, (int)namelen,
-                                              &ch, 0)) {
+                    int ok = 0;
+
+                    if (namelen <= INT_MAX) {
+                        if (ucnhash_capi->getcode(start, (int)namelen, &ch, 0)) {
+                            ok = 1;
+                        }
+                        else {
+                            ok = ucnhash_capi->parse_u_plus(start, (int)namelen, &ch);
+                            if (ok == -1) {
+                                /* Malformed "U+..." notation.  Drop the
+                                   ValueError and take the same path as any
+                                   other failed lookup instead of jumping to
+                                   onError with a different exception type. */
+                                PyErr_Clear();
+                                ok = 0;
+                            }
+                        }
+                    }
+
+                    if (ok) {
                         assert(ch <= MAX_UNICODE);
                         WRITE_CHAR(ch);
                         continue;