Skip to content

Commit dbe3950

Browse files
gh-129117: Add unicodedata.isxidstart() function (#140269)
Expose `_PyUnicode_IsXidContinue/Start` in `unicodedata`: add isxidstart() and isxidcontinue() functions. Co-authored-by: Victor Stinner <vstinner@python.org>
1 parent 25bd72d commit dbe3950

File tree

13 files changed

+225
-13
lines changed

13 files changed

+225
-13
lines changed

Doc/library/unicodedata.rst

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,36 @@ following functions:
144144
1
145145

146146

147+
.. function:: isxidstart(chr, /)
148+
149+
Return ``True`` if *chr* is a valid identifier start per the
150+
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
151+
that is, it has the ``XID_Start`` property. Return ``False`` otherwise.
152+
For example::
153+
154+
>>> unicodedata.isxidstart('S')
155+
True
156+
>>> unicodedata.isxidstart('0')
157+
False
158+
159+
.. versionadded:: next
160+
161+
162+
.. function:: isxidcontinue(chr, /)
163+
164+
Return ``True`` if *chr* is a valid identifier continuation character per the
165+
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
166+
that is, it has the ``XID_Continue`` property. Return ``False`` otherwise.
167+
For example::
168+
169+
>>> unicodedata.isxidcontinue('S')
170+
True
171+
>>> unicodedata.isxidcontinue(' ')
172+
False
173+
174+
.. versionadded:: next
175+
176+
147177
.. function:: decomposition(chr)
148178

149179
Returns the character decomposition mapping assigned to the character

Doc/whatsnew/3.15.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,11 @@ unicodedata
794794

795795
* The Unicode database has been updated to Unicode 17.0.0.
796796

797+
* Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue`
798+
functions to check whether a character can start or continue a
799+
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
800+
(Contributed by Stan Ulbrych in :gh:`129117`.)
801+
797802

798803
wave
799804
----
Include/internal/pycore_unicodectype.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#ifndef Py_INTERNAL_UNICODECTYPE_H
2+
#define Py_INTERNAL_UNICODECTYPE_H
3+
#ifdef __cplusplus
4+
extern "C" {
5+
#endif
6+
7+
#ifndef Py_BUILD_CORE
8+
# error "this header requires Py_BUILD_CORE define"
9+
#endif
10+
11+
extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
12+
extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
13+
extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
14+
extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
15+
extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
16+
extern int _PyUnicode_IsCased(Py_UCS4 ch);
17+
18+
// Export for 'unicodedata' shared extension.
19+
PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch);
20+
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch);
21+
22+
#ifdef __cplusplus
23+
}
24+
#endif
25+
#endif /* !Py_INTERNAL_UNICODECTYPE_H */

Include/internal/pycore_unicodeobject.h

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -74,18 +74,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
7474
return 0;
7575
}
7676

77-
78-
/* --- Characters Type APIs ----------------------------------------------- */
79-
80-
extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
81-
extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
82-
extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
83-
extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
84-
extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
85-
extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
86-
extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
87-
extern int _PyUnicode_IsCased(Py_UCS4 ch);
88-
8977
/* --- Unicode API -------------------------------------------------------- */
9078

9179
// Export for '_json' shared extension

Lib/test/test_unicodedata.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,33 @@ def test_east_asian_width_9_0_changes(self):
276276
self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
277277
self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
278278

279+
def test_isxidstart(self):
280+
self.assertTrue(self.db.isxidstart('S'))
281+
self.assertTrue(self.db.isxidstart('\u0AD0')) # GUJARATI OM
282+
self.assertTrue(self.db.isxidstart('\u0EC6')) # LAO KO LA
283+
self.assertTrue(self.db.isxidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA
284+
self.assertTrue(self.db.isxidstart('\uA015')) # YI SYLLABLE WU
285+
self.assertTrue(self.db.isxidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM
286+
287+
self.assertFalse(self.db.isxidstart(' '))
288+
self.assertFalse(self.db.isxidstart('0'))
289+
self.assertRaises(TypeError, self.db.isxidstart)
290+
self.assertRaises(TypeError, self.db.isxidstart, 'xx')
291+
292+
def test_isxidcontinue(self):
293+
self.assertTrue(self.db.isxidcontinue('S'))
294+
self.assertTrue(self.db.isxidcontinue('_'))
295+
self.assertTrue(self.db.isxidcontinue('0'))
296+
self.assertTrue(self.db.isxidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR
297+
self.assertTrue(self.db.isxidcontinue('\u0640')) # ARABIC TATWEEL
298+
self.assertTrue(self.db.isxidcontinue('\u0710')) # SYRIAC LETTER ALAPH
299+
self.assertTrue(self.db.isxidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA
300+
self.assertTrue(self.db.isxidcontinue('\u17D7')) # KHMER SIGN LEK TOO
301+
302+
self.assertFalse(self.db.isxidcontinue(' '))
303+
self.assertRaises(TypeError, self.db.isxidcontinue)
304+
self.assertRaises(TypeError, self.db.isxidcontinue, 'xx')
305+
279306
class UnicodeMiscTest(UnicodeDatabaseTest):
280307

281308
@cpython_only

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1433,6 +1433,7 @@ PYTHON_HEADERS= \
14331433
$(srcdir)/Include/internal/pycore_typeobject.h \
14341434
$(srcdir)/Include/internal/pycore_typevarobject.h \
14351435
$(srcdir)/Include/internal/pycore_ucnhash.h \
1436+
$(srcdir)/Include/internal/pycore_unicodectype.h \
14361437
$(srcdir)/Include/internal/pycore_unicodeobject.h \
14371438
$(srcdir)/Include/internal/pycore_unicodeobject_generated.h \
14381439
$(srcdir)/Include/internal/pycore_unionobject.h \
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
:mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and
2+
:func:`~unicodedata.isxidcontinue` functions to check whether a character can
3+
start or continue a `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.

Modules/clinic/unicodedata.c.h

Lines changed: 73 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/unicodedata.c

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "Python.h"
2020
#include "pycore_object.h" // _PyObject_VisitType()
2121
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
22+
#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart()
2223

2324
#include <stdbool.h>
2425
#include <stddef.h> // offsetof()
@@ -1525,6 +1526,58 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
15251526
return PyUnicode_FromString(name);
15261527
}
15271528

1529+
/*[clinic input]
1530+
unicodedata.UCD.isxidstart
1531+
1532+
self: self
1533+
chr: int(accept={str})
1534+
/
1535+
1536+
Return True if the character has the XID_Start property, else False.
1537+
1538+
[clinic start generated code]*/
1539+
1540+
static PyObject *
1541+
unicodedata_UCD_isxidstart_impl(PyObject *self, int chr)
1542+
/*[clinic end generated code: output=944005823c72c3ef input=9353f88d709c21fb]*/
1543+
{
1544+
if (UCD_Check(self)) {
1545+
const change_record *old = get_old_record(self, chr);
1546+
if (old->category_changed == 0) {
1547+
/* unassigned */
1548+
Py_RETURN_FALSE;
1549+
}
1550+
}
1551+
1552+
return PyBool_FromLong(_PyUnicode_IsXidStart(chr));
1553+
}
1554+
1555+
/*[clinic input]
1556+
unicodedata.UCD.isxidcontinue
1557+
1558+
self: self
1559+
chr: int(accept={str})
1560+
/
1561+
1562+
Return True if the character has the XID_Continue property, else False.
1563+
1564+
[clinic start generated code]*/
1565+
1566+
static PyObject *
1567+
unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr)
1568+
/*[clinic end generated code: output=9438dcbff5ca3e41 input=bbb8dd3ac0d2d709]*/
1569+
{
1570+
if (UCD_Check(self)) {
1571+
const change_record *old = get_old_record(self, chr);
1572+
if (old->category_changed == 0) {
1573+
/* unassigned */
1574+
Py_RETURN_FALSE;
1575+
}
1576+
}
1577+
1578+
return PyBool_FromLong(_PyUnicode_IsXidContinue(chr));
1579+
}
1580+
15281581
/*[clinic input]
15291582
unicodedata.UCD.lookup
15301583
@@ -1590,6 +1643,8 @@ static PyMethodDef unicodedata_functions[] = {
15901643
UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
15911644
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
15921645
UNICODEDATA_UCD_NAME_METHODDEF
1646+
UNICODEDATA_UCD_ISXIDSTART_METHODDEF
1647+
UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF
15931648
UNICODEDATA_UCD_LOOKUP_METHODDEF
15941649
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
15951650
UNICODEDATA_UCD_NORMALIZE_METHODDEF

Objects/unicodectype.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
*/
1010

1111
#include "Python.h"
12+
#include "pycore_unicodectype.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue()
1213

1314
#define ALPHA_MASK 0x01
1415
#define DECIMAL_MASK 0x02

0 commit comments

Comments
 (0)