Skip to content

Commit 0997158

Browse files
Commit
1 parent c919d02 commit 0997158

File tree

7 files changed

+84
-3
lines changed

7 files changed

+84
-3
lines changed

Doc/whatsnew/3.15.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,14 @@ Other language changes
273273
This speeds up class creation, and helps avoid reference cycles.
274274
(Contributed by Petr Viktorin in :gh:`135228`.)
275275

276+
* Added support for Unicode ``U+XXXX`` notation in :func:`unicodedata.lookup`
277+
and ``'\N{...}'``. For example::
278+
279+
>>> "\N{U+03BB}"
280+
'λ'
281+
282+
(Contributed by Stan Ulbrych in :gh:`62814`.)
283+
276284

277285
New modules
278286
===========

Include/internal/pycore_ucnhash.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ typedef struct {
2626
int (*getcode)(const char* name, int namelen, Py_UCS4* code,
2727
int with_named_seq);
2828

29+
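/* Parse "U+XXXX" code point notation: returns 1 and sets *code on success,
   0 if the name does not start with "U+", -1 with an exception set if it
   starts with "U+" but is malformed. */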
30+
int (*parse_u_plus)(const char* name, int namelen, Py_UCS4* code);
31+
2932
} _PyUnicode_Name_CAPI;
3033

3134
extern _PyUnicode_Name_CAPI* _PyUnicode_GetNameCAPI(void);

Lib/test/test_ucn.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,11 +200,20 @@ def check_version(testfile):
200200
with self.assertRaises(KeyError):
201201
unicodedata.ucd_3_2_0.lookup(seqname)
202202

203+
def test_u_plus_notation(self):
204+
self.assertEqual(unicodedata.lookup("U+0041"), "A")
205+
self.assertEqual(unicodedata.lookup("U+00410"), "А") # Cyrillic capital A
206+
self.assertEqual(unicodedata.lookup("U+004100"), "䄀")
207+
self.assertEqual(unicodedata.lookup("U+03BB"), "λ")
208+
203209
def test_errors(self):
204210
self.assertRaises(TypeError, unicodedata.name)
205211
self.assertRaises(TypeError, unicodedata.name, 'xx')
206212
self.assertRaises(TypeError, unicodedata.lookup)
207213
self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
214+
self.assertRaises(ValueError, unicodedata.lookup, 'U+00')
215+
self.assertRaises(ValueError, unicodedata.lookup, 'U+0000000')
216+
self.assertRaises(ValueError, unicodedata.lookup, 'U+000Z')
208217

209218
def test_strict_error_handling(self):
210219
# bogus character name
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
``\N{}`` now supports ``U+XXXX`` notation.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
:func:`unicodedata.lookup` now supports ``U+XXXX`` notation (four to six hexadecimal digits).

Modules/unicodedata.c

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,6 +1448,36 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
14481448
}
14491449

14501450

1451+
static int
1452+
_parse_u_plus(const char *codepoint, int namelen, Py_UCS4 *code)
1453+
{
1454+
const int hex_len = namelen - 2;
1455+
if (codepoint[0] != 'U' || codepoint[1] != '+') {
1456+
return 0;
1457+
}
1458+
if (hex_len < 4 || hex_len > 6) {
1459+
PyErr_SetString(PyExc_ValueError, "invalid codepoint notation length");
1460+
return -1;
1461+
}
1462+
1463+
char buf[7];
1464+
memcpy(buf, codepoint + 2, hex_len);
1465+
buf[hex_len] = '\0';
1466+
1467+
char *endptr = NULL;
1468+
const unsigned long v = strtoul(buf, &endptr, 16);
1469+
1470+
if (*endptr != '\0' || v > 0x10ffff) {
1471+
PyErr_Format(PyExc_ValueError,
1472+
"invalid codepoint notation '%.*s'", namelen, codepoint);
1473+
return -1;
1474+
}
1475+
1476+
*code = (Py_UCS4)v;
1477+
return 1;
1478+
}
1479+
1480+
14511481
static int
14521482
capi_getcode(const char* name, int namelen, Py_UCS4* code,
14531483
int with_named_seq)
@@ -1458,6 +1488,12 @@ capi_getcode(const char* name, int namelen, Py_UCS4* code,
14581488
return _check_alias_and_seq(code, with_named_seq);
14591489
}
14601490

1491+
static int
1492+
capi_parse_u_plus(const char* name, int namelen, Py_UCS4* code)
1493+
{
1494+
return _parse_u_plus(name, namelen, code);
1495+
}
1496+
14611497
static void
14621498
unicodedata_destroy_capi(PyObject *capsule)
14631499
{
@@ -1475,6 +1511,7 @@ unicodedata_create_capi(void)
14751511
}
14761512
capi->getname = capi_getucname;
14771513
capi->getcode = capi_getcode;
1514+
capi->parse_u_plus = capi_parse_u_plus;
14781515

14791516
PyObject *capsule = PyCapsule_New(capi,
14801517
PyUnicodeData_CAPSULE_NAME,
@@ -1543,6 +1580,16 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
15431580
{
15441581
Py_UCS4 code;
15451582
unsigned int index;
1583+
1584+
const int check = _parse_u_plus(name, (int)name_length, &code);
1585+
if (check == 1) {
1586+
return PyUnicode_FromOrdinal(code);
1587+
}
1588+
if (check == -1) {
1589+
/* Error set in _parse_u_plus */
1590+
return NULL;
1591+
}
1592+
15461593
if (name_length > NAME_MAXLEN) {
15471594
PyErr_SetString(PyExc_KeyError, "name too long");
15481595
return NULL;

Objects/unicodeobject.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6783,9 +6783,21 @@ _PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
67836783
/* found a name. look it up in the unicode database */
67846784
s++;
67856785
ch = 0xffffffff; /* in case 'getcode' messes up */
6786-
if (namelen <= INT_MAX &&
6787-
ucnhash_capi->getcode(start, (int)namelen,
6788-
&ch, 0)) {
6786+
int ok = 0;
6787+
6788+
if (namelen <= INT_MAX) {
6789+
if (ucnhash_capi->getcode(start, (int)namelen, &ch, 0)) {
6790+
ok = 1;
6791+
}
6792+
else {
6793+
ok = ucnhash_capi->parse_u_plus(start, (int)namelen, &ch);
6794+
if (ok == -1) {
6795+
goto onError;
6796+
}
6797+
}
6798+
}
6799+
6800+
if (ok) {
67896801
assert(ch <= MAX_UNICODE);
67906802
WRITE_CHAR(ch);
67916803
continue;

0 commit comments

Comments
 (0)