Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,14 @@ Other language changes
This speeds up class creation, and helps avoid reference cycles.
(Contributed by Petr Viktorin in :gh:`135228`.)

* Added support for Unicode ``U+XXXX`` notation (``U+`` followed by four to six
hexadecimal digits) in :func:`unicodedata.lookup` and in ``'\N{...}'`` escape
sequences. For example::

>>> "\N{U+03BB}"
'λ'

(Contributed by Stan Ulbrych in :gh:`62814`.)


New modules
===========
Expand Down
3 changes: 3 additions & 0 deletions Include/internal/pycore_ucnhash.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ typedef struct {
int (*getcode)(const char* name, int namelen, Py_UCS4* code,
int with_named_seq);

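/* Parse Unicode "U+XXXX" code point notation.  Returns 1 and stores the
   code point in *code on success, 0 if name does not start with "U+",
   and -1 with an exception set if the notation is malformed. */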
int (*parse_u_plus)(const char* name, int namelen, Py_UCS4* code);

} _PyUnicode_Name_CAPI;

extern _PyUnicode_Name_CAPI* _PyUnicode_GetNameCAPI(void);
Expand Down
9 changes: 9 additions & 0 deletions Lib/test/test_ucn.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,11 +200,20 @@ def check_version(testfile):
with self.assertRaises(KeyError):
unicodedata.ucd_3_2_0.lookup(seqname)

def test_u_plus_notation(self):
    # Each entry pairs a "U+" notation (4, 5 or 6 hex digits) with the
    # character unicodedata.lookup() is expected to return for it.
    cases = (
        ("U+0041", "A"),
        ("U+00410", "А"),  # Cyrillic capital A
        ("U+004100", "䄀"),
        ("U+03BB", "λ"),
    )
    for notation, expected in cases:
        self.assertEqual(unicodedata.lookup(notation), expected)

def test_errors(self):
    # Calls that must be rejected with TypeError.
    with self.assertRaises(TypeError):
        unicodedata.name()
    with self.assertRaises(TypeError):
        unicodedata.name('xx')
    with self.assertRaises(TypeError):
        unicodedata.lookup()

    # A name that is not in the database.
    with self.assertRaises(KeyError):
        unicodedata.lookup('unknown')

    # Malformed "U+" notation: too few digits, too many digits, and a
    # character that is not a hexadecimal digit.
    for notation in ('U+00', 'U+0000000', 'U+000Z'):
        self.assertRaises(ValueError, unicodedata.lookup, notation)

def test_strict_error_handling(self):
# bogus character name
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
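``'\N{...}'`` escape sequences now support Unicode ``U+XXXX`` notation (``U+`` followed by four to six hexadecimal digits), for example ``'\N{U+03BB}'``.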
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:func:`unicodedata.lookup` now supports Unicode ``U+XXXX`` notation (``U+`` followed by four to six hexadecimal digits).
47 changes: 47 additions & 0 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -1448,6 +1448,36 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
}


/* Parse a code point written in Unicode "U+XXXX" notation.
 *
 *   codepoint, namelen  candidate name and its length in bytes; the buffer
 *                       does not have to be NUL-terminated.
 *   code                receives the parsed code point on success.
 *
 * Returns 1 and stores the value in *code when the input is "U+" followed
 * by 4 to 6 hexadecimal digits whose value does not exceed 0x10FFFF.
 * Returns 0, without setting an exception, when the input does not start
 * with "U+".
 * Returns -1 with ValueError set when the input starts with "U+" but the
 * rest is malformed (wrong number of digits, a non-hexadecimal character,
 * or a value above 0x10FFFF).
 */
static int
_parse_u_plus(const char *codepoint, int namelen, Py_UCS4 *code)
{
    /* Check the length first so that the two prefix bytes are never read
       from a buffer that is shorter than the prefix itself. */
    if (namelen < 2 || codepoint[0] != 'U' || codepoint[1] != '+') {
        return 0;
    }

    const int hex_len = namelen - 2;
    if (hex_len < 4 || hex_len > 6) {
        PyErr_SetString(PyExc_ValueError, "invalid codepoint notation length");
        return -1;
    }

    /* Decode the digits by hand.  strtoul() would also accept leading
       whitespace, a '+' or '-' sign and a "0x" prefix, and it would stop
       silently at an embedded NUL byte; none of those are valid here.
       At most 6 hex digits are consumed, so the value cannot overflow. */
    Py_UCS4 value = 0;
    for (int i = 0; i < hex_len; i++) {
        const char c = codepoint[2 + i];
        unsigned int digit;

        if (c >= '0' && c <= '9') {
            digit = (unsigned int)(c - '0');
        }
        else if (c >= 'a' && c <= 'f') {
            digit = (unsigned int)(c - 'a') + 10;
        }
        else if (c >= 'A' && c <= 'F') {
            digit = (unsigned int)(c - 'A') + 10;
        }
        else {
            goto invalid;
        }
        value = (value << 4) | digit;
    }

    if (value > 0x10ffff) {
        goto invalid;
    }

    *code = value;
    return 1;

invalid:
    PyErr_Format(PyExc_ValueError,
                 "invalid codepoint notation '%.*s'", namelen, codepoint);
    return -1;
}


static int
capi_getcode(const char* name, int namelen, Py_UCS4* code,
int with_named_seq)
Expand All @@ -1458,6 +1488,12 @@ capi_getcode(const char* name, int namelen, Py_UCS4* code,
return _check_alias_and_seq(code, with_named_seq);
}

/* Capsule-facing wrapper around _parse_u_plus(): forwards its arguments
   unchanged and hands the callee's result straight back to the caller. */
static int
capi_parse_u_plus(const char* name, int namelen, Py_UCS4* code)
{
    const int status = _parse_u_plus(name, namelen, code);
    return status;
}

static void
unicodedata_destroy_capi(PyObject *capsule)
{
Expand All @@ -1475,6 +1511,7 @@ unicodedata_create_capi(void)
}
capi->getname = capi_getucname;
capi->getcode = capi_getcode;
capi->parse_u_plus = capi_parse_u_plus;

PyObject *capsule = PyCapsule_New(capi,
PyUnicodeData_CAPSULE_NAME,
Expand Down Expand Up @@ -1543,6 +1580,16 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
{
Py_UCS4 code;
unsigned int index;

const int check = _parse_u_plus(name, (int)name_length, &code);
if (check == 1) {
return PyUnicode_FromOrdinal(code);
}
if (check == -1) {
/* Error set in _parse_u_plus */
return NULL;
}

if (name_length > NAME_MAXLEN) {
PyErr_SetString(PyExc_KeyError, "name too long");
return NULL;
Expand Down
18 changes: 15 additions & 3 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -6783,9 +6783,21 @@ _PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
/* found a name. look it up in the unicode database */
s++;
ch = 0xffffffff; /* in case 'getcode' messes up */
if (namelen <= INT_MAX &&
ucnhash_capi->getcode(start, (int)namelen,
&ch, 0)) {
int ok = 0;

if (namelen <= INT_MAX) {
if (ucnhash_capi->getcode(start, (int)namelen, &ch, 0)) {
ok = 1;
}
else {
ok = ucnhash_capi->parse_u_plus(start, (int)namelen, &ch);
if (ok == -1) {
goto onError;
}
}
}

if (ok) {
assert(ch <= MAX_UNICODE);
WRITE_CHAR(ch);
continue;
Expand Down
Loading