Commit

StanFromIreland · StanFromIreland · commit 0997158cfb50 · 2025-09-06T20:37:15.000+01:00
diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
@@ -273,6 +273,14 @@ Other language changes
   This speeds up class creation, and helps avoid reference cycles.
   (Contributed by Petr Viktorin in :gh:`135228`.)
 
+* Added support for Unicode ``U+XXXX`` code point notation (four to six
+  hexadecimal digits) in :func:`unicodedata.lookup` and in ``'\N{...}'``
+  string escapes. For example::
+
+    >>> "\N{U+03BB}"
+    'λ'
+
+  (Contributed by Stan Ulbrych in :gh:`62814`.)
+
 
 New modules
 ===========
diff --git a/Include/internal/pycore_ucnhash.h b/Include/internal/pycore_ucnhash.h
@@ -26,6 +26,9 @@ typedef struct {
     int (*getcode)(const char* name, int namelen, Py_UCS4* code,
                    int with_named_seq);
 
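+    /* Parse "U+XXXX" code point notation (4 to 6 hex digits).
+       Returns 1 and stores the code point in *code on success, 0 if the
+       name does not start with "U+", and -1 with an exception set if the
+       notation is malformed. */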
+    int (*parse_u_plus)(const char* name, int namelen, Py_UCS4* code);
+
 } _PyUnicode_Name_CAPI;
 
 extern _PyUnicode_Name_CAPI* _PyUnicode_GetNameCAPI(void);
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
@@ -200,11 +200,20 @@ def check_version(testfile):
                 with self.assertRaises(KeyError):
                     unicodedata.ucd_3_2_0.lookup(seqname)
 
+    def test_u_plus_notation(self):
+        self.assertEqual(unicodedata.lookup("U+0041"), "A")
+        self.assertEqual(unicodedata.lookup("U+00410"), "А")  # Cyrillic capital A
+        self.assertEqual(unicodedata.lookup("U+004100"), "䄀")
+        self.assertEqual(unicodedata.lookup("U+03BB"), "λ")
+
     def test_errors(self):
         self.assertRaises(TypeError, unicodedata.name)
         self.assertRaises(TypeError, unicodedata.name, 'xx')
         self.assertRaises(TypeError, unicodedata.lookup)
         self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+00')  # fewer than 4 hex digits
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+0000000')  # more than 6 hex digits
+        self.assertRaises(ValueError, unicodedata.lookup, 'U+000Z')  # non-hex character
 
     def test_strict_error_handling(self):
         # bogus character name
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-09-06-20-31-48.gh-issue-62814.PdwlEc.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-09-06-20-31-48.gh-issue-62814.PdwlEc.rst
@@ -0,0 +1 @@
+``\N{}`` now supports ``U+XXXX`` notation.
diff --git a/Misc/NEWS.d/next/Library/2025-09-06-20-31-05.gh-issue-62814.Nt6o-t.rst b/Misc/NEWS.d/next/Library/2025-09-06-20-31-05.gh-issue-62814.Nt6o-t.rst
@@ -0,0 +1 @@
+:func:`unicodedata.lookup` now supports ``U+XXXXXX`` notation.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
@@ -1448,6 +1448,36 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
 }
 
 
+/* Parse a code point written in "U+XXXX" notation.
+
+   codepoint: candidate name; callers pass an explicit length, so it is not
+              treated as a NUL-terminated string.
+   namelen:   number of bytes of `codepoint` to consider.
+   code:      receives the parsed code point on success.
+
+   Returns 1 and stores the value in *code when the name is "U+" followed
+   by 4 to 6 hexadecimal digits naming a code point <= 0x10FFFF.
+   Returns 0, with no exception set, when the name does not start with
+   "U+", so that the caller can fall back to a lookup by name.
+   Returns -1 with ValueError set when the "U+" prefix is present but the
+   rest is malformed or out of range. */
+static int
+_parse_u_plus(const char *codepoint, int namelen, Py_UCS4 *code)
+{
+    /* Test the length before touching the prefix bytes, so that no byte
+       beyond `namelen` is ever read. */
+    if (namelen < 2 || codepoint[0] != 'U' || codepoint[1] != '+') {
+        return 0;
+    }
+
+    const int hex_len = namelen - 2;
+    if (hex_len < 4 || hex_len > 6) {
+        PyErr_SetString(PyExc_ValueError, "invalid codepoint notation length");
+        return -1;
+    }
+
+    /* Decode the digits by hand rather than with strtoul(): strtoul() also
+       accepts leading whitespace, a sign and a "0x" prefix, and parsing a
+       NUL-terminated copy stops at an embedded NUL, so inputs such as
+       "U+0x41", "U+ +41" or "U+00\0AB" would otherwise be taken as valid
+       notation. */
+    Py_UCS4 v = 0;
+    for (int i = 0; i < hex_len; i++) {
+        const char c = codepoint[2 + i];
+        Py_UCS4 digit;
+        if (c >= '0' && c <= '9') {
+            digit = (Py_UCS4)(c - '0');
+        }
+        else if (c >= 'a' && c <= 'f') {
+            digit = (Py_UCS4)(c - 'a' + 10);
+        }
+        else if (c >= 'A' && c <= 'F') {
+            digit = (Py_UCS4)(c - 'A' + 10);
+        }
+        else {
+            goto invalid;
+        }
+        /* At most 6 digits, so v stays below 2**24 and cannot overflow. */
+        v = (v << 4) | digit;
+    }
+    if (v > 0x10ffff) {
+        goto invalid;
+    }
+
+    *code = v;
+    return 1;
+
+invalid:
+    PyErr_Format(PyExc_ValueError,
+           "invalid codepoint notation '%.*s'", namelen, codepoint);
+    return -1;
+}
+
+
 static int
 capi_getcode(const char* name, int namelen, Py_UCS4* code,
              int with_named_seq)
@@ -1458,6 +1488,12 @@ capi_getcode(const char* name, int namelen, Py_UCS4* code,
     return _check_alias_and_seq(code, with_named_seq);
 }
 
+/* Function stored in the `parse_u_plus` slot of the name C API struct:
+   forwards its arguments to _parse_u_plus() and returns its result
+   unchanged. */
+static int
+capi_parse_u_plus(const char* name, int namelen, Py_UCS4* code)
+{
+    return _parse_u_plus(name, namelen, code);
+}
+
 static void
 unicodedata_destroy_capi(PyObject *capsule)
 {
@@ -1475,6 +1511,7 @@ unicodedata_create_capi(void)
     }
     capi->getname = capi_getucname;
     capi->getcode = capi_getcode;
+    capi->parse_u_plus = capi_parse_u_plus;
 
     PyObject *capsule = PyCapsule_New(capi,
                                       PyUnicodeData_CAPSULE_NAME,
@@ -1543,6 +1580,16 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
 {
     Py_UCS4 code;
     unsigned int index;
+
+    const int check = _parse_u_plus(name, (int)name_length, &code);
+    if (check == 1) {
+        return PyUnicode_FromOrdinal(code);
+    }
+    if (check == -1) {
+        /* Error set in _parse_u_plus */
+        return NULL;
+    }
+
     if (name_length > NAME_MAXLEN) {
         PyErr_SetString(PyExc_KeyError, "name too long");
         return NULL;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -6783,9 +6783,21 @@ _PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
                     /* found a name.  look it up in the unicode database */
                     s++;
                     ch = 0xffffffff; /* in case 'getcode' messes up */
-                    if (namelen <= INT_MAX &&
-                        ucnhash_capi->getcode(start, (int)namelen,
-                                              &ch, 0)) {
+                    int ok = 0;
+
+                    if (namelen <= INT_MAX) {
+                        if (ucnhash_capi->getcode(start, (int)namelen, &ch, 0)) {
+                            ok = 1;
+                        }
+                        else {
+                            ok = ucnhash_capi->parse_u_plus(start, (int)namelen, &ch);
+                            if (ok == -1) {
+                                goto onError;
+                            }
+                        }
+                    }
+
+                    if (ok) {
                         assert(ch <= MAX_UNICODE);
                         WRITE_CHAR(ch);
                         continue;

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+``\N{}`` now supports ``U+XXXX`` notation.
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+:func:`unicodedata.lookup` now supports ``U+XXXXXX`` notation.