From dbe3950a76cce176c6c185b873f9552503d87043 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Thu, 30 Oct 2025 10:18:12 +0000 Subject: [PATCH 01/13] gh-129117: Add unicodedata.isxidstart() function (#140269) Expose `_PyUnicode_IsXidContinue/Start` in `unicodedata`: add isxidstart() and isxidcontinue() functions. Co-authored-by: Victor Stinner --- Doc/library/unicodedata.rst | 30 ++++++++ Doc/whatsnew/3.15.rst | 5 ++ Include/internal/pycore_unicodectype.h | 25 +++++++ Include/internal/pycore_unicodeobject.h | 12 --- Lib/test/test_unicodedata.py | 27 +++++++ Makefile.pre.in | 1 + ...-10-17-20-42-38.gh-issue-129117.X9jr4p.rst | 3 + Modules/clinic/unicodedata.c.h | 74 ++++++++++++++++++- Modules/unicodedata.c | 55 ++++++++++++++ Objects/unicodectype.c | 1 + Objects/unicodeobject.c | 1 + PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 + 13 files changed, 225 insertions(+), 13 deletions(-) create mode 100644 Include/internal/pycore_unicodectype.h create mode 100644 Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 0369cd99c47c18..c49bf641704616 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -144,6 +144,36 @@ following functions: 1 +.. function:: isxidstart(chr, /) + + Return ``True`` if *chr* is a valid identifier start per the + `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_, + that is, it has the ``XID_Start`` property. Return ``False`` otherwise. + For example:: + + >>> unicodedata.isxidstart('S') + True + >>> unicodedata.isxidstart('0') + False + + .. versionadded:: next + + +.. function:: isxidcontinue(chr, /) + + Return ``True`` if *chr* is a valid identifier character per the + `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_, + that is, it has the ``XID_Continue`` property. Return ``False`` otherwise. 
+ For example:: + + >>> unicodedata.isxidcontinue('S') + True + >>> unicodedata.isxidcontinue(' ') + False + + .. versionadded:: next + + .. function:: decomposition(chr) Returns the character decomposition mapping assigned to the character diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 85b4c12544a0c9..fe9adfe9f730ec 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -794,6 +794,11 @@ unicodedata * The Unicode database has been updated to Unicode 17.0.0. +* Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue` + functions to check whether a character can start or continue a + `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier. + (Contributed by Stan Ulbrych in :gh:`129117`.) + wave ---- diff --git a/Include/internal/pycore_unicodectype.h b/Include/internal/pycore_unicodectype.h new file mode 100644 index 00000000000000..523bdb56b09cde --- /dev/null +++ b/Include/internal/pycore_unicodectype.h @@ -0,0 +1,25 @@ +#ifndef Py_INTERNAL_UNICODECTYPE_H +#define Py_INTERNAL_UNICODECTYPE_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); +extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); +extern int _PyUnicode_IsCased(Py_UCS4 ch); + +// Export for 'unicodedata' shared extension. 
+PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch); +PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch); + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_UNICODECTYPE_H */ diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index b83039c1869f23..f384fad8713adc 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -74,18 +74,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) return 0; } - -/* --- Characters Type APIs ----------------------------------------------- */ - -extern int _PyUnicode_IsXidStart(Py_UCS4 ch); -extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); -extern int _PyUnicode_IsCased(Py_UCS4 ch); - /* --- Unicode API -------------------------------------------------------- */ // Export for '_json' shared extension diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 8013eaf6e9d851..a3c22a4f27ee77 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -276,6 +276,33 @@ def test_east_asian_width_9_0_changes(self): self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') self.assertEqual(self.db.east_asian_width('\u231a'), 'W') + def test_isxidstart(self): + self.assertTrue(self.db.isxidstart('S')) + self.assertTrue(self.db.isxidstart('\u0AD0')) # GUJARATI OM + self.assertTrue(self.db.isxidstart('\u0EC6')) # LAO KO LA + self.assertTrue(self.db.isxidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA + self.assertTrue(self.db.isxidstart('\uA015')) # YI SYLLABLE WU + self.assertTrue(self.db.isxidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM + + self.assertFalse(self.db.isxidstart(' ')) + 
self.assertFalse(self.db.isxidstart('0')) + self.assertRaises(TypeError, self.db.isxidstart) + self.assertRaises(TypeError, self.db.isxidstart, 'xx') + + def test_isxidcontinue(self): + self.assertTrue(self.db.isxidcontinue('S')) + self.assertTrue(self.db.isxidcontinue('_')) + self.assertTrue(self.db.isxidcontinue('0')) + self.assertTrue(self.db.isxidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR + self.assertTrue(self.db.isxidcontinue('\u0640')) # ARABIC TATWEEL + self.assertTrue(self.db.isxidcontinue('\u0710')) # SYRIAC LETTER ALAPH + self.assertTrue(self.db.isxidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA + self.assertTrue(self.db.isxidcontinue('\u17D7')) # KHMER SIGN LEK TOO + + self.assertFalse(self.db.isxidcontinue(' ')) + self.assertRaises(TypeError, self.db.isxidcontinue) + self.assertRaises(TypeError, self.db.isxidcontinue, 'xx') + class UnicodeMiscTest(UnicodeDatabaseTest): @cpython_only diff --git a/Makefile.pre.in b/Makefile.pre.in index 19423c11545c19..0a1b8d028addad 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1433,6 +1433,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_typeobject.h \ $(srcdir)/Include/internal/pycore_typevarobject.h \ $(srcdir)/Include/internal/pycore_ucnhash.h \ + $(srcdir)/Include/internal/pycore_unicodectype.h \ $(srcdir)/Include/internal/pycore_unicodeobject.h \ $(srcdir)/Include/internal/pycore_unicodeobject_generated.h \ $(srcdir)/Include/internal/pycore_unionobject.h \ diff --git a/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst new file mode 100644 index 00000000000000..8767b1bb4837ad --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-17-20-42-38.gh-issue-129117.X9jr4p.rst @@ -0,0 +1,3 @@ +:mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and +:func:`~unicodedata.isxidcontinue` functions to check whether a character can +start or continue a `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier. 
diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h index 345440eeee89a6..5fcba083c2f4ce 100644 --- a/Modules/clinic/unicodedata.c.h +++ b/Modules/clinic/unicodedata.c.h @@ -518,6 +518,78 @@ unicodedata_UCD_name(PyObject *self, PyObject *const *args, Py_ssize_t nargs) return return_value; } +PyDoc_STRVAR(unicodedata_UCD_isxidstart__doc__, +"isxidstart($self, chr, /)\n" +"--\n" +"\n" +"Return True if the character has the XID_Start property, else False."); + +#define UNICODEDATA_UCD_ISXIDSTART_METHODDEF \ + {"isxidstart", (PyCFunction)unicodedata_UCD_isxidstart, METH_O, unicodedata_UCD_isxidstart__doc__}, + +static PyObject * +unicodedata_UCD_isxidstart_impl(PyObject *self, int chr); + +static PyObject * +unicodedata_UCD_isxidstart(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + int chr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("isxidstart", "argument", "a unicode character", arg); + goto exit; + } + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_Format(PyExc_TypeError, + "isxidstart(): argument must be a unicode character, " + "not a string of length %zd", + PyUnicode_GET_LENGTH(arg)); + goto exit; + } + chr = PyUnicode_READ_CHAR(arg, 0); + return_value = unicodedata_UCD_isxidstart_impl(self, chr); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_UCD_isxidcontinue__doc__, +"isxidcontinue($self, chr, /)\n" +"--\n" +"\n" +"Return True if the character has the XID_Continue property, else False."); + +#define UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF \ + {"isxidcontinue", (PyCFunction)unicodedata_UCD_isxidcontinue, METH_O, unicodedata_UCD_isxidcontinue__doc__}, + +static PyObject * +unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr); + +static PyObject * +unicodedata_UCD_isxidcontinue(PyObject *self, PyObject *arg) +{ + PyObject *return_value = NULL; + int chr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("isxidcontinue", "argument", "a unicode character", arg); + goto exit; + } + 
if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_Format(PyExc_TypeError, + "isxidcontinue(): argument must be a unicode character, " + "not a string of length %zd", + PyUnicode_GET_LENGTH(arg)); + goto exit; + } + chr = PyUnicode_READ_CHAR(arg, 0); + return_value = unicodedata_UCD_isxidcontinue_impl(self, chr); + +exit: + return return_value; +} + PyDoc_STRVAR(unicodedata_UCD_lookup__doc__, "lookup($self, name, /)\n" "--\n" @@ -549,4 +621,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=8a59d430cee41058 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=c5e56c8f6bb80f93 input=a9049054013a1b77]*/ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index a3699beff7da01..a6094676d4194c 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,6 +19,7 @@ #include "Python.h" #include "pycore_object.h" // _PyObject_VisitType() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart() #include #include // offsetof() @@ -1525,6 +1526,58 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) return PyUnicode_FromString(name); } +/*[clinic input] +unicodedata.UCD.isxidstart + + self: self + chr: int(accept={str}) + / + +Return True if the character has the XID_Start property, else False. + +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_isxidstart_impl(PyObject *self, int chr) +/*[clinic end generated code: output=944005823c72c3ef input=9353f88d709c21fb]*/ +{ + if (UCD_Check(self)) { + const change_record *old = get_old_record(self, chr); + if (old->category_changed == 0) { + /* unassigned */ + Py_RETURN_FALSE; + } + } + + return PyBool_FromLong(_PyUnicode_IsXidStart(chr)); +} + +/*[clinic input] +unicodedata.UCD.isxidcontinue + + self: self + chr: int(accept={str}) + / + +Return True if the character has the XID_Continue property, else False. 
+ +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr) +/*[clinic end generated code: output=9438dcbff5ca3e41 input=bbb8dd3ac0d2d709]*/ +{ + if (UCD_Check(self)) { + const change_record *old = get_old_record(self, chr); + if (old->category_changed == 0) { + /* unassigned */ + Py_RETURN_FALSE; + } + } + + return PyBool_FromLong(_PyUnicode_IsXidContinue(chr)); +} + /*[clinic input] unicodedata.UCD.lookup @@ -1590,6 +1643,8 @@ static PyMethodDef unicodedata_functions[] = { UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF UNICODEDATA_UCD_DECOMPOSITION_METHODDEF UNICODEDATA_UCD_NAME_METHODDEF + UNICODEDATA_UCD_ISXIDSTART_METHODDEF + UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF UNICODEDATA_UCD_LOOKUP_METHODDEF UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF UNICODEDATA_UCD_NORMALIZE_METHODDEF diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..fdd380190ac1ec 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -9,6 +9,7 @@ */ #include "Python.h" +#include "pycore_unicodectype.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue() #define ALPHA_MASK 0x01 #define DECIMAL_MASK 0x02 diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f60f7dd2d13604..8a5638ac1406ab 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -57,6 +57,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding() #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI +#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart #include "pycore_unicodeobject.h" // struct _Py_unicode_state #include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings() diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 2657ee5c444e60..a101c1b45cf25c 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -328,6 +328,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 9c12be6e9356a6..e3f261c2b92ab9 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -528,6 +528,9 @@ Include\cpython + + Include\internal + Include\internal From c45e6e1bb3b3045b413574d25ebb2a473f6f5a1e Mon Sep 17 00:00:00 2001 From: Donghee Na Date: Thu, 30 Oct 2025 19:32:08 +0900 Subject: [PATCH 02/13] gh-137821: Improve Argument Clinic definitions in the ``_json`` module (#140780) --- Lib/test/test_json/test_scanstring.py | 2 +- ...-10-30-15-33-07.gh-issue-137821.8_Iavt.rst | 1 - ...-10-30-15-33-07.gh-issue-137821.8_Iavt.rst | 2 + Modules/_json.c | 55 ++++--------------- Modules/clinic/_json.c.h | 46 +++++++++++++++- 5 files changed, 59 insertions(+), 47 deletions(-) delete mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-10-30-15-33-07.gh-issue-137821.8_Iavt.rst create mode 100644 Misc/NEWS.d/next/Library/2025-10-30-15-33-07.gh-issue-137821.8_Iavt.rst diff --git a/Lib/test/test_json/test_scanstring.py b/Lib/test/test_json/test_scanstring.py index cca556a3b95bab..9a6cdfe12d266c 100644 --- a/Lib/test/test_json/test_scanstring.py +++ b/Lib/test/test_json/test_scanstring.py @@ -144,7 +144,7 @@ def test_bad_escapes(self): def test_overflow(self): with self.assertRaises(OverflowError): - self.json.decoder.scanstring(b"xxx", sys.maxsize+1) + self.json.decoder.scanstring("xxx", 
sys.maxsize+1) class TestPyScanstring(TestScanstring, PyTest): pass diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-30-15-33-07.gh-issue-137821.8_Iavt.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-30-15-33-07.gh-issue-137821.8_Iavt.rst deleted file mode 100644 index 124ea3f9993814..00000000000000 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-30-15-33-07.gh-issue-137821.8_Iavt.rst +++ /dev/null @@ -1 +0,0 @@ -Convert ``_json`` module to use Argument Clinic diff --git a/Misc/NEWS.d/next/Library/2025-10-30-15-33-07.gh-issue-137821.8_Iavt.rst b/Misc/NEWS.d/next/Library/2025-10-30-15-33-07.gh-issue-137821.8_Iavt.rst new file mode 100644 index 00000000000000..7ccbfc3cb950bf --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-30-15-33-07.gh-issue-137821.8_Iavt.rst @@ -0,0 +1,2 @@ +Convert ``_json`` module to use Argument Clinic. +Patched by Yoonho Hann. diff --git a/Modules/_json.c b/Modules/_json.c index 6a84661a243ea4..14714d4b346546 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -645,7 +645,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next /*[clinic input] _json.scanstring as py_scanstring - pystr: object + pystr: unicode end: Py_ssize_t strict: bool = True / @@ -664,74 +664,41 @@ after the end quote. 
static PyObject * py_scanstring_impl(PyObject *module, PyObject *pystr, Py_ssize_t end, int strict) -/*[clinic end generated code: output=961740cfae07cdb3 input=9d46d7df7ac749b0]*/ +/*[clinic end generated code: output=961740cfae07cdb3 input=cff59e47498f4d8e]*/ { - PyObject *rval; Py_ssize_t next_end = -1; - if (PyUnicode_Check(pystr)) { - rval = scanstring_unicode(pystr, end, strict, &next_end); - } - else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string, not %.80s", - Py_TYPE(pystr)->tp_name); - return NULL; - } + PyObject *rval = scanstring_unicode(pystr, end, strict, &next_end); return _build_rval_index_tuple(rval, next_end); } /*[clinic input] _json.encode_basestring_ascii as py_encode_basestring_ascii - pystr: object + pystr: unicode / Return an ASCII-only JSON representation of a Python string [clinic start generated code]*/ static PyObject * -py_encode_basestring_ascii(PyObject *module, PyObject *pystr) -/*[clinic end generated code: output=a8afcd88eba0b572 input=f4085ccd5928ea55]*/ +py_encode_basestring_ascii_impl(PyObject *module, PyObject *pystr) +/*[clinic end generated code: output=7b3841287cf211df input=4f3609498aff2de5]*/ { - PyObject *rval; - /* Return an ASCII-only JSON representation of a Python string */ - /* METH_O */ - if (PyUnicode_Check(pystr)) { - rval = ascii_escape_unicode(pystr); - } - else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string, not %.80s", - Py_TYPE(pystr)->tp_name); - return NULL; - } - return rval; + return ascii_escape_unicode(pystr); } /*[clinic input] _json.encode_basestring as py_encode_basestring - pystr: object + pystr: unicode / Return a JSON representation of a Python string [clinic start generated code]*/ static PyObject * -py_encode_basestring(PyObject *module, PyObject *pystr) -/*[clinic end generated code: output=c87752300776d3b1 input=c3c7ef6e72624f6e]*/ +py_encode_basestring_impl(PyObject *module, PyObject *pystr) +/*[clinic end generated code: output=900950f95df3f1c9 
input=d42ef714b2c07386]*/ { - PyObject *rval; - /* Return a JSON representation of a Python string */ - /* METH_O */ - if (PyUnicode_Check(pystr)) { - rval = escape_unicode(pystr); - } - else { - PyErr_Format(PyExc_TypeError, - "first argument must be a string, not %.80s", - Py_TYPE(pystr)->tp_name); - return NULL; - } - return rval; + return escape_unicode(pystr); } static void diff --git a/Modules/clinic/_json.c.h b/Modules/clinic/_json.c.h index b80e72ad00a62a..cd37a236c7611a 100644 --- a/Modules/clinic/_json.c.h +++ b/Modules/clinic/_json.c.h @@ -37,6 +37,10 @@ py_scanstring(PyObject *module, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("scanstring", nargs, 2, 3)) { goto exit; } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("scanstring", "argument 1", "str", args[0]); + goto exit; + } pystr = args[0]; { Py_ssize_t ival = -1; @@ -73,6 +77,26 @@ PyDoc_STRVAR(py_encode_basestring_ascii__doc__, #define PY_ENCODE_BASESTRING_ASCII_METHODDEF \ {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii, METH_O, py_encode_basestring_ascii__doc__}, +static PyObject * +py_encode_basestring_ascii_impl(PyObject *module, PyObject *pystr); + +static PyObject * +py_encode_basestring_ascii(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + PyObject *pystr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("encode_basestring_ascii", "argument", "str", arg); + goto exit; + } + pystr = arg; + return_value = py_encode_basestring_ascii_impl(module, pystr); + +exit: + return return_value; +} + PyDoc_STRVAR(py_encode_basestring__doc__, "encode_basestring($module, pystr, /)\n" "--\n" @@ -81,4 +105,24 @@ PyDoc_STRVAR(py_encode_basestring__doc__, #define PY_ENCODE_BASESTRING_METHODDEF \ {"encode_basestring", (PyCFunction)py_encode_basestring, METH_O, py_encode_basestring__doc__}, -/*[clinic end generated code: output=d3aa505efc0acb3f input=a9049054013a1b77]*/ + +static PyObject * +py_encode_basestring_impl(PyObject 
*module, PyObject *pystr); + +static PyObject * +py_encode_basestring(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + PyObject *pystr; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("encode_basestring", "argument", "str", arg); + goto exit; + } + pystr = arg; + return_value = py_encode_basestring_impl(module, pystr); + +exit: + return return_value; +} +/*[clinic end generated code: output=5bdd16375c95a4d9 input=a9049054013a1b77]*/ From 622d97b8bbeb9ebdaa1061adf99a8b240d715e2f Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Thu, 30 Oct 2025 10:50:16 +0000 Subject: [PATCH 03/13] gh-139198: Remove `Tools/scripts/checkpip.py` script (GH-139199) Commit --- ...-09-21-10-30-08.gh-issue-139198.Fm7NfU.rst | 1 + Tools/scripts/README | 2 -- Tools/scripts/checkpip.py | 32 ------------------- 3 files changed, 1 insertion(+), 34 deletions(-) create mode 100644 Misc/NEWS.d/next/Tools-Demos/2025-09-21-10-30-08.gh-issue-139198.Fm7NfU.rst delete mode 100755 Tools/scripts/checkpip.py diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-09-21-10-30-08.gh-issue-139198.Fm7NfU.rst b/Misc/NEWS.d/next/Tools-Demos/2025-09-21-10-30-08.gh-issue-139198.Fm7NfU.rst new file mode 100644 index 00000000000000..0dc589c3986ad6 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2025-09-21-10-30-08.gh-issue-139198.Fm7NfU.rst @@ -0,0 +1 @@ +Remove ``Tools/scripts/checkpip.py`` script. diff --git a/Tools/scripts/README b/Tools/scripts/README index a078bfbf662a37..4e52cda38e8d88 100644 --- a/Tools/scripts/README +++ b/Tools/scripts/README @@ -1,8 +1,6 @@ This directory contains a collection of executable Python scripts that are useful while building, extending or managing Python. 
-checkpip.py Checks the version of the projects bundled in ensurepip - are the latest available combinerefs.py A helper for analyzing PYTHONDUMPREFS output divmod_threshold.py Determine threshold for switching from longobject.c divmod to _pylong.int_divmod() diff --git a/Tools/scripts/checkpip.py b/Tools/scripts/checkpip.py deleted file mode 100755 index a4a9ddfa6f324a..00000000000000 --- a/Tools/scripts/checkpip.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 -""" -Checks that the version of the projects bundled in ensurepip are the latest -versions available. -""" -import ensurepip -import json -import urllib.request -import sys - - -def main(): - outofdate = False - - for project, version in ensurepip._PROJECTS: - data = json.loads(urllib.request.urlopen( - "https://pypi.org/pypi/{}/json".format(project), - cadefault=True, - ).read().decode("utf8")) - upstream_version = data["info"]["version"] - - if version != upstream_version: - outofdate = True - print("The latest version of {} on PyPI is {}, but ensurepip " - "has {}".format(project, upstream_version, version)) - - if outofdate: - sys.exit(1) - - -if __name__ == "__main__": - main() From 327dbbedffa3f2c95e70129a11974b83e27864f9 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 30 Oct 2025 12:52:02 +0200 Subject: [PATCH 04/13] gh-138162: Fix logging.LoggerAdapter with merge_extra=True and without the extra argument (GH-140511) --- Doc/library/logging.rst | 15 ++++++--- Lib/logging/__init__.py | 11 ++++--- Lib/test/test_logging.py | 33 ++++++++++++++++++- ...-10-23-19-39-16.gh-issue-138162.Znw5DN.rst | 2 ++ 4 files changed, 50 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-10-23-19-39-16.gh-issue-138162.Znw5DN.rst diff --git a/Doc/library/logging.rst b/Doc/library/logging.rst index 425025931d9835..0cf5b1c0d9bc3e 100644 --- a/Doc/library/logging.rst +++ b/Doc/library/logging.rst @@ -1082,12 +1082,13 @@ LoggerAdapter Objects information into logging calls. 
For a usage example, see the section on :ref:`adding contextual information to your logging output `. -.. class:: LoggerAdapter(logger, extra, merge_extra=False) +.. class:: LoggerAdapter(logger, extra=None, merge_extra=False) Returns an instance of :class:`LoggerAdapter` initialized with an - underlying :class:`Logger` instance, a dict-like object (*extra*), and a - boolean (*merge_extra*) indicating whether or not the *extra* argument of - individual log calls should be merged with the :class:`LoggerAdapter` extra. + underlying :class:`Logger` instance, an optional dict-like object (*extra*), + and an optional boolean (*merge_extra*) indicating whether or not + the *extra* argument of individual log calls should be merged with + the :class:`LoggerAdapter` extra. The default behavior is to ignore the *extra* argument of individual log calls and only use the one of the :class:`LoggerAdapter` instance @@ -1127,9 +1128,13 @@ information into logging calls. For a usage example, see the section on Attribute :attr:`!manager` and method :meth:`!_log` were added, which delegate to the underlying logger and allow adapters to be nested. + .. versionchanged:: 3.10 + + The *extra* argument is now optional. + .. versionchanged:: 3.13 - The *merge_extra* argument was added. + The *merge_extra* parameter was added. Thread Safety diff --git a/Lib/logging/__init__.py b/Lib/logging/__init__.py index 431ff41b352048..39689a57e6ecd6 100644 --- a/Lib/logging/__init__.py +++ b/Lib/logging/__init__.py @@ -1849,9 +1849,9 @@ class LoggerAdapter(object): def __init__(self, logger, extra=None, merge_extra=False): """ - Initialize the adapter with a logger and a dict-like object which - provides contextual information. This constructor signature allows - easy stacking of LoggerAdapters, if so desired. + Initialize the adapter with a logger and an optional dict-like object + which provides contextual information. 
This constructor signature + allows easy stacking of LoggerAdapters, if so desired. You can effectively pass keyword arguments as shown in the following example: @@ -1882,8 +1882,9 @@ def process(self, msg, kwargs): Normally, you'll only need to override this one method in a LoggerAdapter subclass for your specific needs. """ - if self.merge_extra and "extra" in kwargs: - kwargs["extra"] = {**self.extra, **kwargs["extra"]} + if self.merge_extra and kwargs.get("extra") is not None: + if self.extra is not None: + kwargs["extra"] = {**self.extra, **kwargs["extra"]} else: kwargs["extra"] = self.extra return msg, kwargs diff --git a/Lib/test/test_logging.py b/Lib/test/test_logging.py index 1f7a4d9e197f9c..8815426fc99c39 100644 --- a/Lib/test/test_logging.py +++ b/Lib/test/test_logging.py @@ -5826,7 +5826,7 @@ def cleanup(): self.addCleanup(cleanup) self.addCleanup(logging.shutdown) - self.adapter = logging.LoggerAdapter(logger=self.logger, extra=None) + self.adapter = logging.LoggerAdapter(logger=self.logger) def test_exception(self): msg = 'testing exception: %r' @@ -5997,6 +5997,18 @@ def test_extra_merged(self): self.assertEqual(record.foo, '1') self.assertEqual(record.bar, '2') + self.adapter.critical('no extra') # should not fail + self.assertEqual(len(self.recording.records), 2) + record = self.recording.records[-1] + self.assertEqual(record.foo, '1') + self.assertNotHasAttr(record, 'bar') + + self.adapter.critical('none extra', extra=None) # should not fail + self.assertEqual(len(self.recording.records), 3) + record = self.recording.records[-1] + self.assertEqual(record.foo, '1') + self.assertNotHasAttr(record, 'bar') + def test_extra_merged_log_call_has_precedence(self): self.adapter = logging.LoggerAdapter(logger=self.logger, extra={'foo': '1'}, @@ -6008,6 +6020,25 @@ def test_extra_merged_log_call_has_precedence(self): self.assertHasAttr(record, 'foo') self.assertEqual(record.foo, '2') + def test_extra_merged_without_extra(self): + self.adapter = 
logging.LoggerAdapter(logger=self.logger, + merge_extra=True) + + self.adapter.critical('foo should be here', extra={'foo': '1'}) + self.assertEqual(len(self.recording.records), 1) + record = self.recording.records[-1] + self.assertEqual(record.foo, '1') + + self.adapter.critical('no extra') # should not fail + self.assertEqual(len(self.recording.records), 2) + record = self.recording.records[-1] + self.assertNotHasAttr(record, 'foo') + + self.adapter.critical('none extra', extra=None) # should not fail + self.assertEqual(len(self.recording.records), 3) + record = self.recording.records[-1] + self.assertNotHasAttr(record, 'foo') + class PrefixAdapter(logging.LoggerAdapter): prefix = 'Adapter' diff --git a/Misc/NEWS.d/next/Library/2025-10-23-19-39-16.gh-issue-138162.Znw5DN.rst b/Misc/NEWS.d/next/Library/2025-10-23-19-39-16.gh-issue-138162.Znw5DN.rst new file mode 100644 index 00000000000000..ef7a90bc37e650 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-23-19-39-16.gh-issue-138162.Znw5DN.rst @@ -0,0 +1,2 @@ +Fix :class:`logging.LoggerAdapter` with ``merge_extra=True`` and without the +*extra* argument. From 2a904263aa0bc7c4a13beb4d8baa8cbc060a45ee Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Thu, 30 Oct 2025 10:54:03 +0000 Subject: [PATCH 05/13] gh-131885: Update unicodedata function signatures to use `/` (#140270) --- Doc/library/unicodedata.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index c49bf641704616..fd5f56bd7eaaeb 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -31,7 +31,7 @@ following functions: this module. -.. function:: lookup(name) +.. function:: lookup(name, /) Look up character by name. If a character with the given name is found, return the corresponding character. If not found, :exc:`KeyError` is raised. @@ -94,7 +94,7 @@ following functions: 0.5 -.. 
function:: category(chr) +.. function:: category(chr, /) Returns the general category assigned to the character *chr* as string. General category names consist of two letters. @@ -106,7 +106,7 @@ following functions: 'Lu' -.. function:: bidirectional(chr) +.. function:: bidirectional(chr, /) Returns the bidirectional class assigned to the character *chr* as string. If no such value is defined, an empty string is returned. @@ -118,7 +118,7 @@ following functions: 'AN' -.. function:: combining(chr) +.. function:: combining(chr, /) Returns the canonical combining class assigned to the character *chr* as integer. Returns ``0`` if no combining class is defined. @@ -127,14 +127,14 @@ following functions: for more information. -.. function:: east_asian_width(chr) +.. function:: east_asian_width(chr, /) Returns the east asian width assigned to the character *chr* as string. For a list of widths and or more information, see the `Unicode Standard Annex #11 <https://www.unicode.org/reports/tr11/>`_. -.. function:: mirrored(chr) +.. function:: mirrored(chr, /) Returns the mirrored property assigned to the character *chr* as integer. Returns ``1`` if the character has been identified as a "mirrored" @@ -174,7 +174,7 @@ following functions: .. versionadded:: next -.. function:: decomposition(chr) +.. function:: decomposition(chr, /) Returns the character decomposition mapping assigned to the character *chr* as string. An empty string is returned in case no such mapping is @@ -184,7 +184,7 @@ following functions: '0041 0303' -.. function:: normalize(form, unistr) +.. function:: normalize(form, unistr, /) Return the normal form *form* for the Unicode string *unistr*. Valid values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'. @@ -217,7 +217,7 @@ following functions: doesn't, they may not compare equal. -.. function:: is_normalized(form, unistr) +.. function:: is_normalized(form, unistr, /) Return whether the Unicode string *unistr* is in the normal form *form*. 
Valid values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'. From ad0a3f733b23e7fc69aff13055c7fac8ab9dcd66 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 30 Oct 2025 13:00:42 +0200 Subject: [PATCH 06/13] gh-131927: Do not emit PEP 765 warnings in ast.parse() (GH-139642) ast.parse() no longer emits syntax warnings for return/break/continue in finally (see PEP-765) -- they are only emitted during compilation. --- Include/internal/pycore_compile.h | 3 +- Lib/test/test_ast/test_ast.py | 55 ----------------- Lib/test/test_compile.py | 60 +++++++++++++++++++ Lib/test/test_pyrepl/test_interact.py | 26 ++++++++ ...10-06-10-03-37.gh-issue-139640.gY5oTb2.rst | 3 + Python/ast_preprocess.c | 8 ++- Python/compile.c | 4 +- 7 files changed, 98 insertions(+), 61 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-10-06-10-03-37.gh-issue-139640.gY5oTb2.rst diff --git a/Include/internal/pycore_compile.h b/Include/internal/pycore_compile.h index c18e04bf67a5df..1c60834fa2058c 100644 --- a/Include/internal/pycore_compile.h +++ b/Include/internal/pycore_compile.h @@ -49,7 +49,8 @@ extern int _PyAST_Preprocess( PyObject *filename, int optimize, int ff_features, - int syntax_check_only); + int syntax_check_only, + int enable_warnings); typedef struct { diff --git a/Lib/test/test_ast/test_ast.py b/Lib/test/test_ast/test_ast.py index 5fdb3a458ae999..a979a4b1da1ad1 100644 --- a/Lib/test/test_ast/test_ast.py +++ b/Lib/test/test_ast/test_ast.py @@ -1057,61 +1057,6 @@ def test_repr_large_input_crash(self): r"Exceeds the limit \(\d+ digits\)"): repr(ast.Constant(value=eval(source))) - def test_pep_765_warnings(self): - srcs = [ - textwrap.dedent(""" - def f(): - try: - pass - finally: - return 42 - """), - textwrap.dedent(""" - for x in y: - try: - pass - finally: - break - """), - textwrap.dedent(""" - for x in y: - try: - pass - finally: - continue - """), - ] - for src in srcs: - with self.assertWarnsRegex(SyntaxWarning, 'finally'): - ast.parse(src) - - def 
test_pep_765_no_warnings(self): - srcs = [ - textwrap.dedent(""" - try: - pass - finally: - def f(): - return 42 - """), - textwrap.dedent(""" - try: - pass - finally: - for x in y: - break - """), - textwrap.dedent(""" - try: - pass - finally: - for x in y: - continue - """), - ] - for src in srcs: - ast.parse(src) - def test_tstring(self): # Test AST structure for simple t-string tree = ast.parse('t"Hello"') diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py index bc8ef93cb8f9de..846d38ae561fc5 100644 --- a/Lib/test/test_compile.py +++ b/Lib/test/test_compile.py @@ -1745,6 +1745,66 @@ def test_compile_warning_in_finally(self): self.assertEqual(wm.category, SyntaxWarning) self.assertIn("\"is\" with 'int' literal", str(wm.message)) + @support.subTests('src', [ + textwrap.dedent(""" + def f(): + try: + pass + finally: + return 42 + """), + textwrap.dedent(""" + for x in y: + try: + pass + finally: + break + """), + textwrap.dedent(""" + for x in y: + try: + pass + finally: + continue + """), + ]) + def test_pep_765_warnings(self, src): + with self.assertWarnsRegex(SyntaxWarning, 'finally'): + compile(src, '', 'exec') + with warnings.catch_warnings(): + warnings.simplefilter("error") + tree = ast.parse(src) + with self.assertWarnsRegex(SyntaxWarning, 'finally'): + compile(tree, '', 'exec') + + @support.subTests('src', [ + textwrap.dedent(""" + try: + pass + finally: + def f(): + return 42 + """), + textwrap.dedent(""" + try: + pass + finally: + for x in y: + break + """), + textwrap.dedent(""" + try: + pass + finally: + for x in y: + continue + """), + ]) + def test_pep_765_no_warnings(self, src): + with warnings.catch_warnings(): + warnings.simplefilter("error") + compile(src, '', 'exec') + class TestBooleanExpression(unittest.TestCase): class Value: diff --git a/Lib/test/test_pyrepl/test_interact.py b/Lib/test/test_pyrepl/test_interact.py index 1a3146da8eadc8..fd4530ebc004aa 100644 --- a/Lib/test/test_pyrepl/test_interact.py +++ 
b/Lib/test/test_pyrepl/test_interact.py @@ -1,5 +1,6 @@ import contextlib import io +import warnings import unittest from unittest.mock import patch from textwrap import dedent @@ -273,3 +274,28 @@ def test_incomplete_statement(self): code = "if foo:" console = InteractiveColoredConsole(namespace, filename="") self.assertTrue(_more_lines(console, code)) + + +class TestWarnings(unittest.TestCase): + def test_pep_765_warning(self): + """ + Test that a SyntaxWarning emitted from the + AST optimizer is only shown once in the REPL. + """ + # gh-131927 + console = InteractiveColoredConsole() + code = dedent("""\ + def f(): + try: + return 1 + finally: + return 2 + """) + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + console.runsource(code) + + count = sum("'return' in a 'finally' block" in str(w.message) + for w in caught) + self.assertEqual(count, 1) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-06-10-03-37.gh-issue-139640.gY5oTb2.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-06-10-03-37.gh-issue-139640.gY5oTb2.rst new file mode 100644 index 00000000000000..b147b430ccccf5 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-06-10-03-37.gh-issue-139640.gY5oTb2.rst @@ -0,0 +1,3 @@ +:func:`ast.parse` no longer emits syntax warnings for +``return``/``break``/``continue`` in ``finally`` (see :pep:`765`) -- they are +only emitted during compilation. 
diff --git a/Python/ast_preprocess.c b/Python/ast_preprocess.c index 44d3075098be75..fe6fd9479d1531 100644 --- a/Python/ast_preprocess.c +++ b/Python/ast_preprocess.c @@ -19,6 +19,7 @@ typedef struct { int optimize; int ff_features; int syntax_check_only; + int enable_warnings; _Py_c_array_t cf_finally; /* context for PEP 765 check */ int cf_finally_used; @@ -78,7 +79,7 @@ control_flow_in_finally_warning(const char *kw, stmt_ty n, _PyASTPreprocessState static int before_return(_PyASTPreprocessState *state, stmt_ty node_) { - if (state->cf_finally_used > 0) { + if (state->enable_warnings && state->cf_finally_used > 0) { ControlFlowInFinallyContext *ctx = get_cf_finally_top(state); if (ctx->in_finally && ! ctx->in_funcdef) { if (!control_flow_in_finally_warning("return", node_, state)) { @@ -92,7 +93,7 @@ before_return(_PyASTPreprocessState *state, stmt_ty node_) static int before_loop_exit(_PyASTPreprocessState *state, stmt_ty node_, const char *kw) { - if (state->cf_finally_used > 0) { + if (state->enable_warnings && state->cf_finally_used > 0) { ControlFlowInFinallyContext *ctx = get_cf_finally_top(state); if (ctx->in_finally && ! 
ctx->in_loop) { if (!control_flow_in_finally_warning(kw, node_, state)) { @@ -968,7 +969,7 @@ astfold_type_param(type_param_ty node_, PyArena *ctx_, _PyASTPreprocessState *st int _PyAST_Preprocess(mod_ty mod, PyArena *arena, PyObject *filename, int optimize, - int ff_features, int syntax_check_only) + int ff_features, int syntax_check_only, int enable_warnings) { _PyASTPreprocessState state; memset(&state, 0, sizeof(_PyASTPreprocessState)); @@ -976,6 +977,7 @@ _PyAST_Preprocess(mod_ty mod, PyArena *arena, PyObject *filename, int optimize, state.optimize = optimize; state.ff_features = ff_features; state.syntax_check_only = syntax_check_only; + state.enable_warnings = enable_warnings; if (_Py_CArray_Init(&state.cf_finally, sizeof(ControlFlowInFinallyContext), 20) < 0) { return -1; } diff --git a/Python/compile.c b/Python/compile.c index 8070d3f03760ef..e2f1c7e8eb5bce 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -136,7 +136,7 @@ compiler_setup(compiler *c, mod_ty mod, PyObject *filename, c->c_optimize = (optimize == -1) ? 
_Py_GetConfig()->optimization_level : optimize; c->c_save_nested_seqs = false; - if (!_PyAST_Preprocess(mod, arena, filename, c->c_optimize, merged, 0)) { + if (!_PyAST_Preprocess(mod, arena, filename, c->c_optimize, merged, 0, 1)) { return ERROR; } c->c_st = _PySymtable_Build(mod, filename, &c->c_future); @@ -1502,7 +1502,7 @@ _PyCompile_AstPreprocess(mod_ty mod, PyObject *filename, PyCompilerFlags *cf, if (optimize == -1) { optimize = _Py_GetConfig()->optimization_level; } - if (!_PyAST_Preprocess(mod, arena, filename, optimize, flags, no_const_folding)) { + if (!_PyAST_Preprocess(mod, arena, filename, optimize, flags, no_const_folding, 0)) { return -1; } return 0; From 09b1f10ef7b1183d40fe08e56d42dc6152d31f9a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 30 Oct 2025 13:11:56 +0200 Subject: [PATCH 07/13] gh-140481: Improve error message when trying to iterate a Tk widget, image or font (GH-140501) --- Lib/test/test_tkinter/test_font.py | 11 ++++ Lib/test/test_tkinter/test_images.py | 52 +++++++++++-------- Lib/test/test_tkinter/test_misc.py | 12 +++++ Lib/tkinter/__init__.py | 3 ++ Lib/tkinter/font.py | 2 + ...-10-23-13-42-15.gh-issue-140481.XKxWpq.rst | 1 + 6 files changed, 59 insertions(+), 22 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-10-23-13-42-15.gh-issue-140481.XKxWpq.rst diff --git a/Lib/test/test_tkinter/test_font.py b/Lib/test/test_tkinter/test_font.py index 3616da54cf7075..fc50f9fdbb588c 100644 --- a/Lib/test/test_tkinter/test_font.py +++ b/Lib/test/test_tkinter/test_font.py @@ -1,3 +1,4 @@ +import collections.abc import unittest import tkinter from tkinter import font @@ -118,6 +119,16 @@ def test_repr(self): repr(self.font), f'' ) + def test_iterable_protocol(self): + self.assertNotIsSubclass(font.Font, collections.abc.Iterable) + self.assertNotIsSubclass(font.Font, collections.abc.Container) + self.assertNotIsInstance(self.font, collections.abc.Iterable) + self.assertNotIsInstance(self.font, collections.abc.Container) 
+ with self.assertRaisesRegex(TypeError, 'is not iterable'): + iter(self.font) + with self.assertRaisesRegex(TypeError, 'is not a container or iterable'): + self.font in self.font + class DefaultRootTest(AbstractDefaultRootTest, unittest.TestCase): diff --git a/Lib/test/test_tkinter/test_images.py b/Lib/test/test_tkinter/test_images.py index 38371fe00d6eb5..358a18beee2571 100644 --- a/Lib/test/test_tkinter/test_images.py +++ b/Lib/test/test_tkinter/test_images.py @@ -1,3 +1,4 @@ +import collections.abc import unittest import tkinter from test import support @@ -61,7 +62,33 @@ def test_image_create_photo(self): self.assertRaises(RuntimeError, tkinter.PhotoImage) -class BitmapImageTest(AbstractTkTest, unittest.TestCase): +class BaseImageTest: + def create(self): + return self.image_class('::img::test', master=self.root, + file=self.testfile) + + def test_bug_100814(self): + # gh-100814: Passing a callable option value causes AttributeError. + with self.assertRaises(tkinter.TclError): + self.image_class('::img::test', master=self.root, spam=print) + image = self.image_class('::img::test', master=self.root) + with self.assertRaises(tkinter.TclError): + image.configure(spam=print) + + def test_iterable_protocol(self): + image = self.create() + self.assertNotIsSubclass(self.image_class, collections.abc.Iterable) + self.assertNotIsSubclass(self.image_class, collections.abc.Container) + self.assertNotIsInstance(image, collections.abc.Iterable) + self.assertNotIsInstance(image, collections.abc.Container) + with self.assertRaisesRegex(TypeError, 'is not iterable'): + iter(image) + with self.assertRaisesRegex(TypeError, 'is not a container or iterable'): + image in image + + +class BitmapImageTest(BaseImageTest, AbstractTkTest, unittest.TestCase): + image_class = tkinter.BitmapImage @classmethod def setUpClass(cls): @@ -144,26 +171,15 @@ def test_configure_foreground(self): self.assertEqual(image['foreground'], '-foreground {} {} #000000 yellow') - def test_bug_100814(self): 
- # gh-100814: Passing a callable option value causes AttributeError. - with self.assertRaises(tkinter.TclError): - tkinter.BitmapImage('::img::test', master=self.root, spam=print) - image = tkinter.BitmapImage('::img::test', master=self.root) - with self.assertRaises(tkinter.TclError): - image.configure(spam=print) - -class PhotoImageTest(AbstractTkTest, unittest.TestCase): +class PhotoImageTest(BaseImageTest, AbstractTkTest, unittest.TestCase): + image_class = tkinter.PhotoImage @classmethod def setUpClass(cls): AbstractTkTest.setUpClass.__func__(cls) cls.testfile = support.findfile('python.gif', subdir='tkinterdata') - def create(self): - return tkinter.PhotoImage('::img::test', master=self.root, - file=self.testfile) - def colorlist(self, *args): if tkinter.TkVersion >= 8.6 and self.wantobjects: return args @@ -282,14 +298,6 @@ def test_configure_palette(self): image.configure(palette='3/4/2') self.assertEqual(image['palette'], '3/4/2') - def test_bug_100814(self): - # gh-100814: Passing a callable option value causes AttributeError. 
- with self.assertRaises(tkinter.TclError): - tkinter.PhotoImage('::img::test', master=self.root, spam=print) - image = tkinter.PhotoImage('::img::test', master=self.root) - with self.assertRaises(tkinter.TclError): - image.configure(spam=print) - def test_blank(self): image = self.create() image.blank() diff --git a/Lib/test/test_tkinter/test_misc.py b/Lib/test/test_tkinter/test_misc.py index 0c76e07066f8a8..32e2329506e7ff 100644 --- a/Lib/test/test_tkinter/test_misc.py +++ b/Lib/test/test_tkinter/test_misc.py @@ -1,3 +1,4 @@ +import collections.abc import functools import unittest import tkinter @@ -508,6 +509,17 @@ def test_embedded_null(self): widget.selection_range(0, 'end') self.assertEqual(widget.selection_get(), '\u20ac\0abc\x00def') + def test_iterable_protocol(self): + widget = tkinter.Entry(self.root) + self.assertNotIsSubclass(tkinter.Entry, collections.abc.Iterable) + self.assertNotIsSubclass(tkinter.Entry, collections.abc.Container) + self.assertNotIsInstance(widget, collections.abc.Iterable) + self.assertNotIsInstance(widget, collections.abc.Container) + with self.assertRaisesRegex(TypeError, 'is not iterable'): + iter(widget) + with self.assertRaisesRegex(TypeError, 'is not a container or iterable'): + widget in widget + class WmTest(AbstractTkTest, unittest.TestCase): diff --git a/Lib/tkinter/__init__.py b/Lib/tkinter/__init__.py index 9526d8b949fa3b..c54530740395f7 100644 --- a/Lib/tkinter/__init__.py +++ b/Lib/tkinter/__init__.py @@ -1848,6 +1848,7 @@ def cget(self, key): return self.tk.call(self._w, 'cget', '-' + key) __getitem__ = cget + __iter__ = None # prevent using __getitem__ for iteration def __setitem__(self, key, value): self.configure({key: value}) @@ -4280,6 +4281,8 @@ def __setitem__(self, key, value): def __getitem__(self, key): return self.tk.call(self.name, 'configure', '-'+key) + __iter__ = None # prevent using __getitem__ for iteration + def configure(self, **kw): """Configure the image.""" res = () diff --git 
a/Lib/tkinter/font.py b/Lib/tkinter/font.py index 7aed523cce3784..896e910d69f6f3 100644 --- a/Lib/tkinter/font.py +++ b/Lib/tkinter/font.py @@ -114,6 +114,8 @@ def __getitem__(self, key): def __setitem__(self, key, value): self.configure(**{key: value}) + __iter__ = None # prevent using __getitem__ for iteration + def __del__(self): try: if self.delete_font: diff --git a/Misc/NEWS.d/next/Library/2025-10-23-13-42-15.gh-issue-140481.XKxWpq.rst b/Misc/NEWS.d/next/Library/2025-10-23-13-42-15.gh-issue-140481.XKxWpq.rst new file mode 100644 index 00000000000000..1f511c3b9d0583 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-23-13-42-15.gh-issue-140481.XKxWpq.rst @@ -0,0 +1 @@ +Improve error message when trying to iterate a Tk widget, image or font. From 134adb32e86c5c4f1335c5884483adf9c56120b0 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Thu, 30 Oct 2025 11:12:45 +0000 Subject: [PATCH 08/13] gh-139188: Remove `Tools/tz/zdump.py` script (GH-139189) --- ...-09-20-20-31-54.gh-issue-139188.zfcxkW.rst | 1 + Tools/README | 2 - Tools/tz/zdump.py | 81 ------------------- 3 files changed, 1 insertion(+), 83 deletions(-) create mode 100644 Misc/NEWS.d/next/Tools-Demos/2025-09-20-20-31-54.gh-issue-139188.zfcxkW.rst delete mode 100644 Tools/tz/zdump.py diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-09-20-20-31-54.gh-issue-139188.zfcxkW.rst b/Misc/NEWS.d/next/Tools-Demos/2025-09-20-20-31-54.gh-issue-139188.zfcxkW.rst new file mode 100644 index 00000000000000..9f52d0163ab038 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2025-09-20-20-31-54.gh-issue-139188.zfcxkW.rst @@ -0,0 +1 @@ +Remove ``Tools/tz/zdump.py`` script. diff --git a/Tools/README b/Tools/README index c8a34d82206672..22d76dfdbcf4a4 100644 --- a/Tools/README +++ b/Tools/README @@ -51,8 +51,6 @@ ssl Scripts to generate ssl_data.h from OpenSSL sources, and run tsan Utilities for building CPython with thread-sanitizer. 
-tz A script to dump timezone from /usr/share/zoneinfo. - unicode Tools for generating unicodedata and codecs from unicode.org and other mapping files (by Fredrik Lundh, Marc-Andre Lemburg and Martin von Loewis). diff --git a/Tools/tz/zdump.py b/Tools/tz/zdump.py deleted file mode 100644 index 39de0a416d0251..00000000000000 --- a/Tools/tz/zdump.py +++ /dev/null @@ -1,81 +0,0 @@ -import sys -import os -import struct -from array import array -from collections import namedtuple -from datetime import datetime - -ttinfo = namedtuple('ttinfo', ['tt_gmtoff', 'tt_isdst', 'tt_abbrind']) - -class TZInfo: - def __init__(self, transitions, type_indices, ttis, abbrs): - self.transitions = transitions - self.type_indices = type_indices - self.ttis = ttis - self.abbrs = abbrs - - @classmethod - def fromfile(cls, fileobj): - if fileobj.read(4).decode() != "TZif": - raise ValueError("not a zoneinfo file") - fileobj.seek(20) - header = fileobj.read(24) - tzh = (tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt, - tzh_timecnt, tzh_typecnt, tzh_charcnt) = struct.unpack(">6l", header) - transitions = array('i') - transitions.fromfile(fileobj, tzh_timecnt) - if sys.byteorder != 'big': - transitions.byteswap() - - type_indices = array('B') - type_indices.fromfile(fileobj, tzh_timecnt) - - ttis = [] - for i in range(tzh_typecnt): - ttis.append(ttinfo._make(struct.unpack(">lbb", fileobj.read(6)))) - - abbrs = fileobj.read(tzh_charcnt) - - self = cls(transitions, type_indices, ttis, abbrs) - self.tzh = tzh - - return self - - def dump(self, stream, start=None, end=None): - for j, (trans, i) in enumerate(zip(self.transitions, self.type_indices)): - utc = datetime.utcfromtimestamp(trans) - tti = self.ttis[i] - lmt = datetime.utcfromtimestamp(trans + tti.tt_gmtoff) - abbrind = tti.tt_abbrind - abbr = self.abbrs[abbrind:self.abbrs.find(0, abbrind)].decode() - if j > 0: - prev_tti = self.ttis[self.type_indices[j - 1]] - shift = " %+g" % ((tti.tt_gmtoff - prev_tti.tt_gmtoff) / 3600) - else: - shift = 
'' - print("%s UTC = %s %-5s isdst=%d" % (utc, lmt, abbr, tti[1]) + shift, file=stream) - - @classmethod - def zonelist(cls, zonedir='/usr/share/zoneinfo'): - zones = [] - for root, _, files in os.walk(zonedir): - for f in files: - p = os.path.join(root, f) - with open(p, 'rb') as o: - magic = o.read(4) - if magic == b'TZif': - zones.append(p[len(zonedir) + 1:]) - return zones - -if __name__ == '__main__': - if len(sys.argv) < 2: - zones = TZInfo.zonelist() - for z in zones: - print(z) - sys.exit() - filepath = sys.argv[1] - if not filepath.startswith('/'): - filepath = os.path.join('/usr/share/zoneinfo', filepath) - with open(filepath, 'rb') as fileobj: - tzi = TZInfo.fromfile(fileobj) - tzi.dump(sys.stdout) From 75a1cbdd38b142b359849eae16a2ecc12f6b3881 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 30 Oct 2025 12:14:17 +0100 Subject: [PATCH 09/13] gh-140748: socket_helper.transient_internet: Unwrap UrlError.__cause__ (GH-140749) --- Lib/test/support/socket_helper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/support/socket_helper.py b/Lib/test/support/socket_helper.py index 87941ee1791b4e..a41e487f3e4bc5 100644 --- a/Lib/test/support/socket_helper.py +++ b/Lib/test/support/socket_helper.py @@ -259,6 +259,10 @@ def filter_error(err): # raise OSError('socket error', msg) from msg elif len(a) >= 2 and isinstance(a[1], OSError): err = a[1] + # The error can also be wrapped as __cause__: + # raise URLError(f"ftp error: {exp}") from exp + elif isinstance(err, urllib.error.URLError) and err.__cause__: + err = err.__cause__ else: break filter_error(err) From efc37ba49eef07dad83698cf8e68820c955aacf9 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 30 Oct 2025 14:36:15 +0100 Subject: [PATCH 10/13] gh-139353: Add Objects/unicode_writer.c file (#139911) Move the public PyUnicodeWriter API and the private _PyUnicodeWriter API to a new Objects/unicode_writer.c file. 
Rename a few helper functions to share them between unicodeobject.c and unicode_writer.c, such as resize_compact() or unicode_result(). --- Include/internal/pycore_unicodeobject.h | 40 ++ Makefile.pre.in | 1 + Objects/unicode_writer.c | 639 +++++++++++++++++++++++ Objects/unicodeobject.c | 667 ++---------------------- PCbuild/_freeze_module.vcxproj | 1 + PCbuild/_freeze_module.vcxproj.filters | 3 + PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 + 8 files changed, 717 insertions(+), 638 deletions(-) create mode 100644 Objects/unicode_writer.c diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index f384fad8713adc..e7ca65a56b6ec3 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -17,6 +17,46 @@ extern "C" { extern int _PyUnicode_IsModifiable(PyObject *unicode); +extern void _PyUnicodeWriter_InitWithBuffer( + _PyUnicodeWriter *writer, + PyObject *buffer); +extern PyObject* _PyUnicode_Result(PyObject *unicode); +extern int _PyUnicode_DecodeUTF8Writer( + _PyUnicodeWriter *writer, + const char *s, + Py_ssize_t size, + _Py_error_handler error_handler, + const char *errors, + Py_ssize_t *consumed); +extern PyObject* _PyUnicode_ResizeCompact( + PyObject *unicode, + Py_ssize_t length); +extern PyObject* _PyUnicode_GetEmpty(void); + + +/* Generic helper macro to convert characters of different types. + from_type and to_type have to be valid type names, begin and end + are pointers to the source characters which should be of type + "from_type *". to is a pointer of type "to_type *" and points to the + buffer where the result characters are written to. 
*/ +#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ + do { \ + to_type *_to = (to_type *)(to); \ + const from_type *_iter = (const from_type *)(begin);\ + const from_type *_end = (const from_type *)(end);\ + Py_ssize_t n = (_end) - (_iter); \ + const from_type *_unrolled_end = \ + _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ + while (_iter < (_unrolled_end)) { \ + _to[0] = (to_type) _iter[0]; \ + _to[1] = (to_type) _iter[1]; \ + _to[2] = (to_type) _iter[2]; \ + _to[3] = (to_type) _iter[3]; \ + _iter += 4; _to += 4; \ + } \ + while (_iter < (_end)) \ + *_to++ = (to_type) *_iter++; \ + } while (0) static inline void diff --git a/Makefile.pre.in b/Makefile.pre.in index 0a1b8d028addad..656d9dacd962e3 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -559,6 +559,7 @@ OBJECT_OBJS= \ Objects/typevarobject.o \ Objects/unicode_format.o \ Objects/unicode_formatter.o \ + Objects/unicode_writer.o \ Objects/unicodectype.o \ Objects/unicodeobject.o \ Objects/unionobject.o \ diff --git a/Objects/unicode_writer.c b/Objects/unicode_writer.c new file mode 100644 index 00000000000000..2b944bf1ea8cde --- /dev/null +++ b/Objects/unicode_writer.c @@ -0,0 +1,639 @@ +/* + +Unicode implementation based on original code by Fredrik Lundh, +modified by Marc-Andre Lemburg . + +Major speed upgrades to the method implementations at the Reykjavik +NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. + +Copyright (c) Corporation for National Research Initiatives. 
+ +-------------------------------------------------------------------- +The original string type implementation is: + + Copyright (c) 1999 by Secret Labs AB + Copyright (c) 1999 by Fredrik Lundh + +By obtaining, using, and/or copying this software and/or its +associated documentation, you agree that you have read, understood, +and will comply with the following terms and conditions: + +Permission to use, copy, modify, and distribute this software and its +associated documentation for any purpose and without fee is hereby +granted, provided that the above copyright notice appears in all +copies, and that both that copyright notice and this permission notice +appear in supporting documentation, and that the name of Secret Labs +AB or the author not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior +permission. + +SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO +THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR +ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+-------------------------------------------------------------------- + +*/ + +#include "Python.h" +#include "pycore_freelist.h" // _Py_FREELIST_FREE() +#include "pycore_long.h" // _PyLong_FormatWriter() +#include "pycore_unicodeobject.h" // _PyUnicode_Result() + + +#ifdef MS_WINDOWS + /* On Windows, overallocate by 50% is the best factor */ +# define OVERALLOCATE_FACTOR 2 +#else + /* On Linux, overallocate by 25% is the best factor */ +# define OVERALLOCATE_FACTOR 4 +#endif + + +/* Compilation of templated routines */ + +#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty() + +#include "stringlib/ucs1lib.h" +#include "stringlib/find_max_char.h" +#include "stringlib/undef.h" + + +/* Copy an ASCII or latin1 char* string into a Python Unicode string. + + WARNING: The function doesn't copy the terminating null character and + doesn't check the maximum character (may write a latin1 character in an + ASCII string). */ +static void +unicode_write_cstr(PyObject *unicode, Py_ssize_t index, + const char *str, Py_ssize_t len) +{ + int kind = PyUnicode_KIND(unicode); + const void *data = PyUnicode_DATA(unicode); + const char *end = str + len; + + assert(index + len <= PyUnicode_GET_LENGTH(unicode)); + switch (kind) { + case PyUnicode_1BYTE_KIND: { +#ifdef Py_DEBUG + if (PyUnicode_IS_ASCII(unicode)) { + Py_UCS4 maxchar = ucs1lib_find_max_char( + (const Py_UCS1*)str, + (const Py_UCS1*)str + len); + assert(maxchar < 128); + } +#endif + memcpy((char *) data + index, str, len); + break; + } + case PyUnicode_2BYTE_KIND: { + Py_UCS2 *start = (Py_UCS2 *)data + index; + Py_UCS2 *ucs2 = start; + + for (; str < end; ++ucs2, ++str) + *ucs2 = (Py_UCS2)*str; + + assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); + break; + } + case PyUnicode_4BYTE_KIND: { + Py_UCS4 *start = (Py_UCS4 *)data + index; + Py_UCS4 *ucs4 = start; + + for (; str < end; ++ucs4, ++str) + *ucs4 = (Py_UCS4)*str; + + assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); + break; + } + default: + 
Py_UNREACHABLE(); + } +} + + +static inline void +_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) +{ + writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); + writer->data = PyUnicode_DATA(writer->buffer); + + if (!writer->readonly) { + writer->kind = PyUnicode_KIND(writer->buffer); + writer->size = PyUnicode_GET_LENGTH(writer->buffer); + } + else { + /* use a value smaller than PyUnicode_1BYTE_KIND() so + _PyUnicodeWriter_PrepareKind() will copy the buffer. */ + writer->kind = 0; + assert(writer->kind <= PyUnicode_1BYTE_KIND); + + /* Copy-on-write mode: set buffer size to 0 so + * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on + * next write. */ + writer->size = 0; + } +} + + +void +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) +{ + memset(writer, 0, sizeof(*writer)); + + /* ASCII is the bare minimum */ + writer->min_char = 127; + + /* use a kind value smaller than PyUnicode_1BYTE_KIND so + _PyUnicodeWriter_PrepareKind() will copy the buffer. */ + assert(writer->kind == 0); + assert(writer->kind < PyUnicode_1BYTE_KIND); +} + + +PyUnicodeWriter* +PyUnicodeWriter_Create(Py_ssize_t length) +{ + if (length < 0) { + PyErr_SetString(PyExc_ValueError, + "length must be positive"); + return NULL; + } + + const size_t size = sizeof(_PyUnicodeWriter); + PyUnicodeWriter *pub_writer; + pub_writer = _Py_FREELIST_POP_MEM(unicode_writers); + if (pub_writer == NULL) { + pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size); + if (pub_writer == NULL) { + return (PyUnicodeWriter *)PyErr_NoMemory(); + } + } + _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; + + _PyUnicodeWriter_Init(writer); + if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) { + PyUnicodeWriter_Discard(pub_writer); + return NULL; + } + writer->overallocate = 1; + + return pub_writer; +} + + +void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) +{ + if (writer == NULL) { + return; + } + _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer); + _Py_FREELIST_FREE(unicode_writers, 
writer, PyMem_Free); +} + + +// Initialize _PyUnicodeWriter with initial buffer +void +_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) +{ + memset(writer, 0, sizeof(*writer)); + writer->buffer = buffer; + _PyUnicodeWriter_Update(writer); + writer->min_length = writer->size; +} + + +int +_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, + Py_ssize_t length, Py_UCS4 maxchar) +{ + Py_ssize_t newlen; + PyObject *newbuffer; + + assert(length >= 0); + assert(maxchar <= _Py_MAX_UNICODE); + + /* ensure that the _PyUnicodeWriter_Prepare macro was used */ + assert((maxchar > writer->maxchar && length >= 0) + || length > 0); + + if (length > PY_SSIZE_T_MAX - writer->pos) { + PyErr_NoMemory(); + return -1; + } + newlen = writer->pos + length; + + maxchar = Py_MAX(maxchar, writer->min_char); + + if (writer->buffer == NULL) { + assert(!writer->readonly); + if (writer->overallocate + && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { + /* overallocate to limit the number of realloc() */ + newlen += newlen / OVERALLOCATE_FACTOR; + } + if (newlen < writer->min_length) + newlen = writer->min_length; + + writer->buffer = PyUnicode_New(newlen, maxchar); + if (writer->buffer == NULL) + return -1; + } + else if (newlen > writer->size) { + if (writer->overallocate + && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { + /* overallocate to limit the number of realloc() */ + newlen += newlen / OVERALLOCATE_FACTOR; + } + if (newlen < writer->min_length) + newlen = writer->min_length; + + if (maxchar > writer->maxchar || writer->readonly) { + /* resize + widen */ + maxchar = Py_MAX(maxchar, writer->maxchar); + newbuffer = PyUnicode_New(newlen, maxchar); + if (newbuffer == NULL) + return -1; + _PyUnicode_FastCopyCharacters(newbuffer, 0, + writer->buffer, 0, writer->pos); + Py_DECREF(writer->buffer); + writer->readonly = 0; + } + else { + newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen); + if (newbuffer == NULL) + return 
-1; + } + writer->buffer = newbuffer; + } + else if (maxchar > writer->maxchar) { + assert(!writer->readonly); + newbuffer = PyUnicode_New(writer->size, maxchar); + if (newbuffer == NULL) + return -1; + _PyUnicode_FastCopyCharacters(newbuffer, 0, + writer->buffer, 0, writer->pos); + Py_SETREF(writer->buffer, newbuffer); + } + _PyUnicodeWriter_Update(writer); + return 0; + +#undef OVERALLOCATE_FACTOR +} + +int +_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, + int kind) +{ + Py_UCS4 maxchar; + + /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ + assert(writer->kind < kind); + + switch (kind) + { + case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; + case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; + case PyUnicode_4BYTE_KIND: maxchar = _Py_MAX_UNICODE; break; + default: + Py_UNREACHABLE(); + } + + return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); +} + + +int +_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) +{ + return _PyUnicodeWriter_WriteCharInline(writer, ch); +} + + +int +PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) +{ + if (ch > _Py_MAX_UNICODE) { + PyErr_SetString(PyExc_ValueError, + "character must be in range(0x110000)"); + return -1; + } + + return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch); +} + + +int +_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) +{ + assert(PyUnicode_Check(str)); + + Py_UCS4 maxchar; + Py_ssize_t len; + + len = PyUnicode_GET_LENGTH(str); + if (len == 0) + return 0; + maxchar = PyUnicode_MAX_CHAR_VALUE(str); + if (maxchar > writer->maxchar || len > writer->size - writer->pos) { + if (writer->buffer == NULL && !writer->overallocate) { + assert(_PyUnicode_CheckConsistency(str, 1)); + writer->readonly = 1; + writer->buffer = Py_NewRef(str); + _PyUnicodeWriter_Update(writer); + writer->pos += len; + return 0; + } + if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) + return -1; + } + 
_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, + str, 0, len); + writer->pos += len; + return 0; +} + + +int +PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) +{ + PyTypeObject *type = Py_TYPE(obj); + if (type == &PyUnicode_Type) { + return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj); + } + + if (type == &PyLong_Type) { + return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0); + } + + PyObject *str = PyObject_Str(obj); + if (str == NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} + + +int +PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) +{ + if (Py_TYPE(obj) == &PyLong_Type) { + return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0); + } + + PyObject *repr = PyObject_Repr(obj); + if (repr == NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr); + Py_DECREF(repr); + return res; +} + + +int +_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, + Py_ssize_t start, Py_ssize_t end) +{ + assert(0 <= start); + assert(end <= PyUnicode_GET_LENGTH(str)); + assert(start <= end); + + if (start == 0 && end == PyUnicode_GET_LENGTH(str)) + return _PyUnicodeWriter_WriteStr(writer, str); + + Py_ssize_t len = end - start; + if (len == 0) { + return 0; + } + + Py_UCS4 maxchar; + if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) { + maxchar = _PyUnicode_FindMaxChar(str, start, end); + } + else { + maxchar = writer->maxchar; + } + if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) { + return -1; + } + + _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, + str, start, len); + writer->pos += len; + return 0; +} + + +int +PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, + Py_ssize_t start, Py_ssize_t end) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str, not %T", str); + return -1; + } + if 
(start < 0 || start > end) { + PyErr_Format(PyExc_ValueError, "invalid start argument"); + return -1; + } + if (end > PyUnicode_GET_LENGTH(str)) { + PyErr_Format(PyExc_ValueError, "invalid end argument"); + return -1; + } + + return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str, + start, end); +} + + +int +_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, + const char *ascii, Py_ssize_t len) +{ + if (len == -1) + len = strlen(ascii); + + assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128); + + if (writer->buffer == NULL && !writer->overallocate) { + PyObject *str; + + str = _PyUnicode_FromASCII(ascii, len); + if (str == NULL) + return -1; + + writer->readonly = 1; + writer->buffer = str; + _PyUnicodeWriter_Update(writer); + writer->pos += len; + return 0; + } + + if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) + return -1; + + switch (writer->kind) + { + case PyUnicode_1BYTE_KIND: + { + const Py_UCS1 *str = (const Py_UCS1 *)ascii; + Py_UCS1 *data = writer->data; + + memcpy(data + writer->pos, str, len); + break; + } + case PyUnicode_2BYTE_KIND: + { + _PyUnicode_CONVERT_BYTES( + Py_UCS1, Py_UCS2, + ascii, ascii + len, + (Py_UCS2 *)writer->data + writer->pos); + break; + } + case PyUnicode_4BYTE_KIND: + { + _PyUnicode_CONVERT_BYTES( + Py_UCS1, Py_UCS4, + ascii, ascii + len, + (Py_UCS4 *)writer->data + writer->pos); + break; + } + default: + Py_UNREACHABLE(); + } + + writer->pos += len; + return 0; +} + + +int +PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer, + const char *str, + Py_ssize_t size) +{ + assert(writer != NULL); + _Py_AssertHoldsTstate(); + + _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer; + return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size); +} + + +int +PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, + const char *str, + Py_ssize_t size) +{ + if (size < 0) { + size = strlen(str); + } + + _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; + Py_ssize_t 
old_pos = _writer->pos; + int res = _PyUnicode_DecodeUTF8Writer(_writer, str, size, + _Py_ERROR_STRICT, NULL, NULL); + if (res < 0) { + _writer->pos = old_pos; + } + return res; +} + + +int +PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, + const char *string, + Py_ssize_t length, + const char *errors, + Py_ssize_t *consumed) +{ + if (length < 0) { + length = strlen(string); + } + + _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; + Py_ssize_t old_pos = _writer->pos; + int res = _PyUnicode_DecodeUTF8Writer(_writer, string, length, + _Py_ERROR_UNKNOWN, errors, + consumed); + if (res < 0) { + _writer->pos = old_pos; + if (consumed) { + *consumed = 0; + } + } + return res; +} + + +int +_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, + const char *str, Py_ssize_t len) +{ + Py_UCS4 maxchar; + + maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len); + if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) + return -1; + unicode_write_cstr(writer->buffer, writer->pos, str, len); + writer->pos += len; + return 0; +} + + +PyObject * +_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) +{ + PyObject *str; + + if (writer->pos == 0) { + Py_CLEAR(writer->buffer); + return _PyUnicode_GetEmpty(); + } + + str = writer->buffer; + writer->buffer = NULL; + + if (writer->readonly) { + assert(PyUnicode_GET_LENGTH(str) == writer->pos); + return str; + } + + if (PyUnicode_GET_LENGTH(str) != writer->pos) { + PyObject *str2; + str2 = _PyUnicode_ResizeCompact(str, writer->pos); + if (str2 == NULL) { + Py_DECREF(str); + return NULL; + } + str = str2; + } + + assert(_PyUnicode_CheckConsistency(str, 1)); + return _PyUnicode_Result(str); +} + + +PyObject* +PyUnicodeWriter_Finish(PyUnicodeWriter *writer) +{ + PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer); + assert(((_PyUnicodeWriter*)writer)->buffer == NULL); + _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free); + return str; +} + + +void 
+_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) +{ + Py_CLEAR(writer->buffer); +} diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8a5638ac1406ab..1c443e88e05029 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -46,7 +46,6 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "pycore_codecs.h" // _PyCodec_Lookup() #include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST #include "pycore_format.h" // F_LJUST -#include "pycore_freelist.h" // _Py_FREELIST_FREE(), _Py_FREELIST_POP() #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_interp.h" // PyInterpreterState.fs_codec #include "pycore_long.h" // _PyLong_FormatWriter() @@ -184,45 +183,9 @@ static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op) } -/* Generic helper macro to convert characters of different types. - from_type and to_type have to be valid type names, begin and end - are pointers to the source characters which should be of type - "from_type *". to is a pointer of type "to_type *" and points to the - buffer where the result characters are written to. 
*/ -#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ - do { \ - to_type *_to = (to_type *)(to); \ - const from_type *_iter = (const from_type *)(begin);\ - const from_type *_end = (const from_type *)(end);\ - Py_ssize_t n = (_end) - (_iter); \ - const from_type *_unrolled_end = \ - _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ - while (_iter < (_unrolled_end)) { \ - _to[0] = (to_type) _iter[0]; \ - _to[1] = (to_type) _iter[1]; \ - _to[2] = (to_type) _iter[2]; \ - _to[3] = (to_type) _iter[3]; \ - _iter += 4; _to += 4; \ - } \ - while (_iter < (_end)) \ - *_to++ = (to_type) *_iter++; \ - } while (0) - #define LATIN1 _Py_LATIN1_CHR -#ifdef MS_WINDOWS - /* On Windows, overallocate by 50% is the best factor */ -# define OVERALLOCATE_FACTOR 2 -#else - /* On Linux, overallocate by 25% is the best factor */ -# define OVERALLOCATE_FACTOR 4 -#endif - /* Forward declaration */ -static inline int -_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); -static inline void -_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer); static PyObject * unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, const char *errors); @@ -230,11 +193,6 @@ static PyObject * unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed); -static int -unicode_decode_utf8_writer(_PyUnicodeWriter *writer, - const char *s, Py_ssize_t size, - _Py_error_handler error_handler, const char *errors, - Py_ssize_t *consumed); #ifdef Py_DEBUG static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); @@ -242,7 +200,8 @@ static int unicode_is_singleton(PyObject *unicode); // Return a reference to the immortal empty string singleton. 
-static inline PyObject* unicode_get_empty(void) +PyObject* +_PyUnicode_GetEmpty(void) { _Py_DECLARE_STR(empty, ""); return &_Py_STR(empty); @@ -416,7 +375,7 @@ static void clear_global_interned_strings(void) #define _Py_RETURN_UNICODE_EMPTY() \ do { \ - return unicode_get_empty(); \ + return _PyUnicode_GetEmpty();\ } while (0) @@ -748,14 +707,14 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) #undef CHECK } -static PyObject* -unicode_result(PyObject *unicode) +PyObject* +_PyUnicode_Result(PyObject *unicode) { assert(_PyUnicode_CHECK(unicode)); Py_ssize_t length = PyUnicode_GET_LENGTH(unicode); if (length == 0) { - PyObject *empty = unicode_get_empty(); + PyObject *empty = _PyUnicode_GetEmpty(); if (unicode != empty) { Py_DECREF(unicode); } @@ -778,6 +737,7 @@ unicode_result(PyObject *unicode) assert(_PyUnicode_CheckConsistency(unicode, 1)); return unicode; } +#define unicode_result _PyUnicode_Result static PyObject* unicode_result_unchanged(PyObject *unicode) @@ -985,7 +945,7 @@ make_bloom_mask(int kind, const void* ptr, Py_ssize_t len) /* Compilation of templated routines */ -#define STRINGLIB_GET_EMPTY() unicode_get_empty() +#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty() #include "stringlib/asciilib.h" #include "stringlib/fastsearch.h" @@ -1097,8 +1057,8 @@ resize_copy(PyObject *unicode, Py_ssize_t length) return copy; } -static PyObject* -resize_compact(PyObject *unicode, Py_ssize_t length) +PyObject* +_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length) { Py_ssize_t char_size; Py_ssize_t struct_size; @@ -1306,7 +1266,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) { /* Optimization for empty strings */ if (size == 0) { - return unicode_get_empty(); + return _PyUnicode_GetEmpty(); } PyObject *obj; @@ -1799,7 +1759,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) return 0; if (length == 0) { - PyObject *empty = unicode_get_empty(); + PyObject *empty = _PyUnicode_GetEmpty(); Py_SETREF(*p_unicode, empty); return 0; } 
@@ -1813,7 +1773,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) } if (PyUnicode_IS_COMPACT(unicode)) { - PyObject *new_unicode = resize_compact(unicode, length); + PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length); if (new_unicode == NULL) return -1; *p_unicode = new_unicode; @@ -1839,58 +1799,6 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) return unicode_resize(p_unicode, length); } -/* Copy an ASCII or latin1 char* string into a Python Unicode string. - - WARNING: The function doesn't copy the terminating null character and - doesn't check the maximum character (may write a latin1 character in an - ASCII string). */ -static void -unicode_write_cstr(PyObject *unicode, Py_ssize_t index, - const char *str, Py_ssize_t len) -{ - int kind = PyUnicode_KIND(unicode); - const void *data = PyUnicode_DATA(unicode); - const char *end = str + len; - - assert(index + len <= PyUnicode_GET_LENGTH(unicode)); - switch (kind) { - case PyUnicode_1BYTE_KIND: { -#ifdef Py_DEBUG - if (PyUnicode_IS_ASCII(unicode)) { - Py_UCS4 maxchar = ucs1lib_find_max_char( - (const Py_UCS1*)str, - (const Py_UCS1*)str + len); - assert(maxchar < 128); - } -#endif - memcpy((char *) data + index, str, len); - break; - } - case PyUnicode_2BYTE_KIND: { - Py_UCS2 *start = (Py_UCS2 *)data + index; - Py_UCS2 *ucs2 = start; - - for (; str < end; ++ucs2, ++str) - *ucs2 = (Py_UCS2)*str; - - assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); - break; - } - case PyUnicode_4BYTE_KIND: { - Py_UCS4 *start = (Py_UCS4 *)data + index; - Py_UCS4 *ucs4 = start; - - for (; str < end; ++ucs4, ++str) - *ucs4 = (Py_UCS4)*str; - - assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); - break; - } - default: - Py_UNREACHABLE(); - } -} - static PyObject* get_latin1_char(Py_UCS1 ch) { @@ -2105,7 +2013,7 @@ PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize"); return NULL; } - return 
unicode_get_empty(); + return _PyUnicode_GetEmpty(); } PyObject * @@ -2672,8 +2580,8 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, } if (width < 0) { - return unicode_decode_utf8_writer(writer, str, length, - _Py_ERROR_REPLACE, "replace", pconsumed); + return _PyUnicode_DecodeUTF8Writer(writer, str, length, + _Py_ERROR_REPLACE, "replace", pconsumed); } PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length, @@ -5424,11 +5332,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, // Used by PyUnicodeWriter_WriteUTF8() implementation -static int -unicode_decode_utf8_writer(_PyUnicodeWriter *writer, - const char *s, Py_ssize_t size, - _Py_error_handler error_handler, const char *errors, - Py_ssize_t *consumed) +int +_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer, + const char *s, Py_ssize_t size, + _Py_error_handler error_handler, const char *errors, + Py_ssize_t *consumed) { if (size == 0) { if (consumed) { @@ -10766,7 +10674,7 @@ replace(PyObject *self, PyObject *str1, } new_size = slen + n * (len2 - len1); if (new_size == 0) { - u = unicode_get_empty(); + u = _PyUnicode_GetEmpty(); goto done; } if (new_size > (PY_SSIZE_T_MAX / rkind)) { @@ -11439,7 +11347,7 @@ PyUnicode_Concat(PyObject *left, PyObject *right) } /* Shortcuts */ - PyObject *empty = unicode_get_empty(); // Borrowed reference + PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference if (left == empty) { return PyUnicode_FromObject(right); } @@ -11491,7 +11399,7 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) } /* Shortcuts */ - PyObject *empty = unicode_get_empty(); // Borrowed reference + PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference if (left == empty) { Py_DECREF(left); *p_left = Py_NewRef(right); @@ -12987,7 +12895,7 @@ PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); if (kind1 < kind2 || len1 < len2) { - PyObject *empty = 
unicode_get_empty(); // Borrowed reference + PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference return PyTuple_Pack(3, str_obj, empty, empty); } buf1 = PyUnicode_DATA(str_obj); @@ -13039,7 +12947,7 @@ PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); if (kind1 < kind2 || len1 < len2) { - PyObject *empty = unicode_get_empty(); // Borrowed reference + PyObject *empty = _PyUnicode_GetEmpty(); // Borrowed reference return PyTuple_Pack(3, empty, empty, str_obj); } buf1 = PyUnicode_DATA(str_obj); @@ -13518,523 +13426,6 @@ unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start, } -static inline void -_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) -{ - writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); - writer->data = PyUnicode_DATA(writer->buffer); - - if (!writer->readonly) { - writer->kind = PyUnicode_KIND(writer->buffer); - writer->size = PyUnicode_GET_LENGTH(writer->buffer); - } - else { - /* use a value smaller than PyUnicode_1BYTE_KIND() so - _PyUnicodeWriter_PrepareKind() will copy the buffer. */ - writer->kind = 0; - assert(writer->kind <= PyUnicode_1BYTE_KIND); - - /* Copy-on-write mode: set buffer size to 0 so - * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on - * next write. */ - writer->size = 0; - } -} - - -void -_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) -{ - memset(writer, 0, sizeof(*writer)); - - /* ASCII is the bare minimum */ - writer->min_char = 127; - - /* use a kind value smaller than PyUnicode_1BYTE_KIND so - _PyUnicodeWriter_PrepareKind() will copy the buffer. 
*/ - assert(writer->kind == 0); - assert(writer->kind < PyUnicode_1BYTE_KIND); -} - - -PyUnicodeWriter* -PyUnicodeWriter_Create(Py_ssize_t length) -{ - if (length < 0) { - PyErr_SetString(PyExc_ValueError, - "length must be positive"); - return NULL; - } - - const size_t size = sizeof(_PyUnicodeWriter); - PyUnicodeWriter *pub_writer; - pub_writer = _Py_FREELIST_POP_MEM(unicode_writers); - if (pub_writer == NULL) { - pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size); - if (pub_writer == NULL) { - return (PyUnicodeWriter *)PyErr_NoMemory(); - } - } - _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; - - _PyUnicodeWriter_Init(writer); - if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) { - PyUnicodeWriter_Discard(pub_writer); - return NULL; - } - writer->overallocate = 1; - - return pub_writer; -} - - -void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) -{ - if (writer == NULL) { - return; - } - _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer); - _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free); -} - - -// Initialize _PyUnicodeWriter with initial buffer -static inline void -_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) -{ - memset(writer, 0, sizeof(*writer)); - writer->buffer = buffer; - _PyUnicodeWriter_Update(writer); - writer->min_length = writer->size; -} - - -int -_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, - Py_ssize_t length, Py_UCS4 maxchar) -{ - Py_ssize_t newlen; - PyObject *newbuffer; - - assert(length >= 0); - assert(maxchar <= MAX_UNICODE); - - /* ensure that the _PyUnicodeWriter_Prepare macro was used */ - assert((maxchar > writer->maxchar && length >= 0) - || length > 0); - - if (length > PY_SSIZE_T_MAX - writer->pos) { - PyErr_NoMemory(); - return -1; - } - newlen = writer->pos + length; - - maxchar = Py_MAX(maxchar, writer->min_char); - - if (writer->buffer == NULL) { - assert(!writer->readonly); - if (writer->overallocate - && newlen <= (PY_SSIZE_T_MAX - newlen / 
OVERALLOCATE_FACTOR)) { - /* overallocate to limit the number of realloc() */ - newlen += newlen / OVERALLOCATE_FACTOR; - } - if (newlen < writer->min_length) - newlen = writer->min_length; - - writer->buffer = PyUnicode_New(newlen, maxchar); - if (writer->buffer == NULL) - return -1; - } - else if (newlen > writer->size) { - if (writer->overallocate - && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { - /* overallocate to limit the number of realloc() */ - newlen += newlen / OVERALLOCATE_FACTOR; - } - if (newlen < writer->min_length) - newlen = writer->min_length; - - if (maxchar > writer->maxchar || writer->readonly) { - /* resize + widen */ - maxchar = Py_MAX(maxchar, writer->maxchar); - newbuffer = PyUnicode_New(newlen, maxchar); - if (newbuffer == NULL) - return -1; - _PyUnicode_FastCopyCharacters(newbuffer, 0, - writer->buffer, 0, writer->pos); - Py_DECREF(writer->buffer); - writer->readonly = 0; - } - else { - newbuffer = resize_compact(writer->buffer, newlen); - if (newbuffer == NULL) - return -1; - } - writer->buffer = newbuffer; - } - else if (maxchar > writer->maxchar) { - assert(!writer->readonly); - newbuffer = PyUnicode_New(writer->size, maxchar); - if (newbuffer == NULL) - return -1; - _PyUnicode_FastCopyCharacters(newbuffer, 0, - writer->buffer, 0, writer->pos); - Py_SETREF(writer->buffer, newbuffer); - } - _PyUnicodeWriter_Update(writer); - return 0; - -#undef OVERALLOCATE_FACTOR -} - -int -_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, - int kind) -{ - Py_UCS4 maxchar; - - /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ - assert(writer->kind < kind); - - switch (kind) - { - case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; - case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; - case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break; - default: - Py_UNREACHABLE(); - } - - return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); -} - -int -_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, 
Py_UCS4 ch) -{ - return _PyUnicodeWriter_WriteCharInline(writer, ch); -} - -int -PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) -{ - if (ch > MAX_UNICODE) { - PyErr_SetString(PyExc_ValueError, - "character must be in range(0x110000)"); - return -1; - } - - return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch); -} - -int -_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) -{ - assert(PyUnicode_Check(str)); - - Py_UCS4 maxchar; - Py_ssize_t len; - - len = PyUnicode_GET_LENGTH(str); - if (len == 0) - return 0; - maxchar = PyUnicode_MAX_CHAR_VALUE(str); - if (maxchar > writer->maxchar || len > writer->size - writer->pos) { - if (writer->buffer == NULL && !writer->overallocate) { - assert(_PyUnicode_CheckConsistency(str, 1)); - writer->readonly = 1; - writer->buffer = Py_NewRef(str); - _PyUnicodeWriter_Update(writer); - writer->pos += len; - return 0; - } - if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) - return -1; - } - _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, - str, 0, len); - writer->pos += len; - return 0; -} - -int -PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) -{ - PyTypeObject *type = Py_TYPE(obj); - if (type == &PyUnicode_Type) { - return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj); - } - - if (type == &PyLong_Type) { - return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0); - } - - PyObject *str = PyObject_Str(obj); - if (str == NULL) { - return -1; - } - - int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); - Py_DECREF(str); - return res; -} - - -int -PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) -{ - if (Py_TYPE(obj) == &PyLong_Type) { - return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0); - } - - PyObject *repr = PyObject_Repr(obj); - if (repr == NULL) { - return -1; - } - - int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr); - Py_DECREF(repr); - return res; -} - - 
-int -_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, - Py_ssize_t start, Py_ssize_t end) -{ - assert(0 <= start); - assert(end <= PyUnicode_GET_LENGTH(str)); - assert(start <= end); - - if (start == 0 && end == PyUnicode_GET_LENGTH(str)) - return _PyUnicodeWriter_WriteStr(writer, str); - - Py_ssize_t len = end - start; - if (len == 0) { - return 0; - } - - Py_UCS4 maxchar; - if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) { - maxchar = _PyUnicode_FindMaxChar(str, start, end); - } - else { - maxchar = writer->maxchar; - } - if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) { - return -1; - } - - _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, - str, start, len); - writer->pos += len; - return 0; -} - - -int -PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, - Py_ssize_t start, Py_ssize_t end) -{ - if (!PyUnicode_Check(str)) { - PyErr_Format(PyExc_TypeError, "expect str, not %T", str); - return -1; - } - if (start < 0 || start > end) { - PyErr_Format(PyExc_ValueError, "invalid start argument"); - return -1; - } - if (end > PyUnicode_GET_LENGTH(str)) { - PyErr_Format(PyExc_ValueError, "invalid end argument"); - return -1; - } - - return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str, - start, end); -} - - -int -_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, - const char *ascii, Py_ssize_t len) -{ - if (len == -1) - len = strlen(ascii); - - assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128); - - if (writer->buffer == NULL && !writer->overallocate) { - PyObject *str; - - str = _PyUnicode_FromASCII(ascii, len); - if (str == NULL) - return -1; - - writer->readonly = 1; - writer->buffer = str; - _PyUnicodeWriter_Update(writer); - writer->pos += len; - return 0; - } - - if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) - return -1; - - switch (writer->kind) - { - case PyUnicode_1BYTE_KIND: - { - const Py_UCS1 *str = (const Py_UCS1 
*)ascii; - Py_UCS1 *data = writer->data; - - memcpy(data + writer->pos, str, len); - break; - } - case PyUnicode_2BYTE_KIND: - { - _PyUnicode_CONVERT_BYTES( - Py_UCS1, Py_UCS2, - ascii, ascii + len, - (Py_UCS2 *)writer->data + writer->pos); - break; - } - case PyUnicode_4BYTE_KIND: - { - _PyUnicode_CONVERT_BYTES( - Py_UCS1, Py_UCS4, - ascii, ascii + len, - (Py_UCS4 *)writer->data + writer->pos); - break; - } - default: - Py_UNREACHABLE(); - } - - writer->pos += len; - return 0; -} - - -int -PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer, - const char *str, - Py_ssize_t size) -{ - assert(writer != NULL); - _Py_AssertHoldsTstate(); - - _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer; - return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size); -} - - -int -PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, - const char *str, - Py_ssize_t size) -{ - if (size < 0) { - size = strlen(str); - } - - _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; - Py_ssize_t old_pos = _writer->pos; - int res = unicode_decode_utf8_writer(_writer, str, size, - _Py_ERROR_STRICT, NULL, NULL); - if (res < 0) { - _writer->pos = old_pos; - } - return res; -} - - -int -PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, - const char *string, - Py_ssize_t length, - const char *errors, - Py_ssize_t *consumed) -{ - if (length < 0) { - length = strlen(string); - } - - _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer; - Py_ssize_t old_pos = _writer->pos; - int res = unicode_decode_utf8_writer(_writer, string, length, - _Py_ERROR_UNKNOWN, errors, consumed); - if (res < 0) { - _writer->pos = old_pos; - if (consumed) { - *consumed = 0; - } - } - return res; -} - - -int -_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, - const char *str, Py_ssize_t len) -{ - Py_UCS4 maxchar; - - maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len); - if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) - return -1; - 
unicode_write_cstr(writer->buffer, writer->pos, str, len); - writer->pos += len; - return 0; -} - -PyObject * -_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) -{ - PyObject *str; - - if (writer->pos == 0) { - Py_CLEAR(writer->buffer); - _Py_RETURN_UNICODE_EMPTY(); - } - - str = writer->buffer; - writer->buffer = NULL; - - if (writer->readonly) { - assert(PyUnicode_GET_LENGTH(str) == writer->pos); - return str; - } - - if (PyUnicode_GET_LENGTH(str) != writer->pos) { - PyObject *str2; - str2 = resize_compact(str, writer->pos); - if (str2 == NULL) { - Py_DECREF(str); - return NULL; - } - str = str2; - } - - assert(_PyUnicode_CheckConsistency(str, 1)); - return unicode_result(str); -} - - -PyObject* -PyUnicodeWriter_Finish(PyUnicodeWriter *writer) -{ - PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer); - assert(((_PyUnicodeWriter*)writer)->buffer == NULL); - _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free); - return str; -} - - -void -_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) -{ - Py_CLEAR(writer->buffer); -} - #include "stringlib/unicode_format.h" PyDoc_STRVAR(format__doc__, @@ -14456,7 +13847,7 @@ unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding, { PyObject *unicode; if (x == NULL) { - unicode = unicode_get_empty(); + unicode = _PyUnicode_GetEmpty(); } else if (encoding == NULL && errors == NULL) { unicode = PyObject_Str(x); @@ -14510,7 +13901,7 @@ unicode_vectorcall(PyObject *type, PyObject *const *args, return NULL; } if (nargs == 0) { - return unicode_get_empty(); + return _PyUnicode_GetEmpty(); } PyObject *object = args[0]; if (nargs == 1) { @@ -15186,7 +14577,7 @@ unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored)) if (it->it_seq != NULL) { return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index); } else { - PyObject *u = unicode_get_empty(); + PyObject *u = _PyUnicode_GetEmpty(); if (u == NULL) { Py_XDECREF(iter); return NULL; diff --git a/PCbuild/_freeze_module.vcxproj 
b/PCbuild/_freeze_module.vcxproj index 02b6f35798f845..e65f201623fbbe 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -167,6 +167,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 39462a6380cd21..a9fb6f2328ad95 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -490,6 +490,9 @@ Source Files + + Source Files + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index a101c1b45cf25c..5e101ee1d0e697 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -562,6 +562,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index e3f261c2b92ab9..3d20ea41cd2476 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -1283,6 +1283,9 @@ Objects + + Objects + Objects From 6826166280d6518441a729b444173db205c4ab20 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 30 Oct 2025 15:55:39 +0200 Subject: [PATCH 11/13] gh-135801: Improve filtering by module in warn_explicit() without module argument (GH-140151) * Try to match the module name pattern with module names constructed starting from different parent directories of the filename. E.g., for "/path/to/package/module" try to match with "path.to.package.module", "to.package.module", "package.module" and "module". * Ignore trailing "/__init__.py". * Ignore trailing ".pyw" on Windows. * Keep matching with the full filename (without optional ".py" extension) for compatibility. * Only ignore the case of the ".py" extension on Windows. 
--- Doc/library/warnings.rst | 13 +++- Doc/whatsnew/3.15.rst | 12 +++ Lib/_py_warnings.py | 46 ++++++++++-- Lib/test/test_ast/test_ast.py | 14 ++++ Lib/test/test_builtin.py | 22 ++++++ Lib/test/test_cmd_line_script.py | 13 ++++ Lib/test/test_compile.py | 14 ++++ Lib/test/test_import/__init__.py | 34 ++++++++- Lib/test/test_import/data/syntax_warnings.py | 21 ++++++ Lib/test/test_symtable.py | 15 ++++ Lib/test/test_warnings/__init__.py | 31 +++++--- ...-10-16-17-17-20.gh-issue-135801.faH3fa.rst | 6 ++ Python/_warnings.c | 75 ++++++------------- 13 files changed, 243 insertions(+), 73 deletions(-) create mode 100644 Lib/test/test_import/data/syntax_warnings.py create mode 100644 Misc/NEWS.d/next/Library/2025-10-16-17-17-20.gh-issue-135801.faH3fa.rst diff --git a/Doc/library/warnings.rst b/Doc/library/warnings.rst index 03b7a8dc378ef3..2f3cf6008f58e2 100644 --- a/Doc/library/warnings.rst +++ b/Doc/library/warnings.rst @@ -487,7 +487,14 @@ Available Functions ignored. *module*, if supplied, should be the module name. - If no module is passed, the filename with ``.py`` stripped is used. + If no module is passed, the module regular expression in + :ref:`warnings filter <warning-filter>` will be tested against the module + names constructed from the path components starting from all parent + directories (with ``/__init__.py``, ``.py`` and, on Windows, ``.pyw`` + stripped) and against the filename with ``.py`` stripped. + For example, when the filename is ``'/path/to/package/module.py'``, it will + be tested against ``'path.to.package.module'``, ``'to.package.module'``, + ``'package.module'``, ``'module'``, and ``'/path/to/package/module'``. *registry*, if supplied, should be the ``__warningregistry__`` dictionary of the module. @@ -506,6 +513,10 @@ Available Functions .. versionchanged:: 3.6 Add the *source* parameter. + .. versionchanged:: next + If no module is passed, test the filter regular expression against + module names created from the path, not only the path itself. + .. 
function:: showwarning(message, category, filename, lineno, file=None, line=None) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index fe9adfe9f730ec..903645fb40bb14 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -611,6 +611,18 @@ unittest (Contributed by Garry Cairns in :gh:`134567`.) +warnings +-------- + +* Improve filtering by module in :func:`warnings.warn_explicit` if no *module* + argument is passed. + It now tests the module regular expression in the warnings filter not only + against the filename with ``.py`` stripped, but also against module names + constructed starting from different parent directories of the filename + (with ``/__init__.py``, ``.py`` and, on Windows, ``.pyw`` stripped). + (Contributed by Serhiy Storchaka in :gh:`135801`.) + + venv ---- diff --git a/Lib/_py_warnings.py b/Lib/_py_warnings.py index 576a17ea7b8501..91a9f44b201733 100644 --- a/Lib/_py_warnings.py +++ b/Lib/_py_warnings.py @@ -520,20 +520,50 @@ def warn(message, category=None, stacklevel=1, source=None, ) +def _match_filename(pattern, filename, *, MS_WINDOWS=(sys.platform == 'win32')): + if not filename: + return pattern.match('') is not None + if filename[0] == '<' and filename[-1] == '>': + return pattern.match(filename) is not None + + is_py = (filename[-3:].lower() == '.py' + if MS_WINDOWS else + filename.endswith('.py')) + if is_py: + filename = filename[:-3] + if pattern.match(filename): # for backward compatibility + return True + if MS_WINDOWS: + if not is_py and filename[-4:].lower() == '.pyw': + filename = filename[:-4] + is_py = True + if is_py and filename[-9:].lower() in (r'\__init__', '/__init__'): + filename = filename[:-9] + filename = filename.replace('\\', '/') + else: + if is_py and filename.endswith('/__init__'): + filename = filename[:-9] + filename = filename.replace('/', '.') + i = 0 + while True: + if pattern.match(filename, i): + return True + i = filename.find('.', i) + 1 + if not i: + return False + + def 
warn_explicit(message, category, filename, lineno, module=None, registry=None, module_globals=None, source=None): lineno = int(lineno) - if module is None: - module = filename or "" - if module[-3:].lower() == ".py": - module = module[:-3] # XXX What about leading pathname? if isinstance(message, Warning): text = str(message) category = message.__class__ else: text = message message = category(message) + modules = None key = (text, category, lineno) with _wm._lock: if registry is None: @@ -549,9 +579,11 @@ def warn_explicit(message, category, filename, lineno, action, msg, cat, mod, ln = item if ((msg is None or msg.match(text)) and issubclass(category, cat) and - (mod is None or mod.match(module)) and - (ln == 0 or lineno == ln)): - break + (ln == 0 or lineno == ln) and + (mod is None or (_match_filename(mod, filename) + if module is None else + mod.match(module)))): + break else: action = _wm.defaultaction # Early exit actions diff --git a/Lib/test/test_ast/test_ast.py b/Lib/test/test_ast/test_ast.py index a979a4b1da1ad1..551de5851daace 100644 --- a/Lib/test/test_ast/test_ast.py +++ b/Lib/test/test_ast/test_ast.py @@ -13,6 +13,7 @@ import textwrap import types import unittest +import warnings import weakref from io import StringIO from pathlib import Path @@ -1069,6 +1070,19 @@ def test_tstring(self): self.assertIsInstance(tree.body[0].value.values[0], ast.Constant) self.assertIsInstance(tree.body[0].value.values[1], ast.Interpolation) + def test_filter_syntax_warnings_by_module(self): + filename = support.findfile('test_import/data/syntax_warnings.py') + with open(filename, 'rb') as f: + source = f.read() + with warnings.catch_warnings(record=True) as wlog: + warnings.simplefilter('error') + warnings.filterwarnings('always', module=r'\z') + ast.parse(source) + self.assertEqual(sorted(wm.lineno for wm in wlog), [4, 7, 10]) + for wm in wlog: + self.assertEqual(wm.filename, '') + self.assertIs(wm.category, SyntaxWarning) + class CopyTests(unittest.TestCase): 
"""Test copying and pickling AST nodes.""" diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index fe3e391a7f5ba1..fba46af6617640 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -1088,6 +1088,28 @@ def four_freevars(): three_freevars.__globals__, closure=my_closure) + def test_exec_filter_syntax_warnings_by_module(self): + filename = support.findfile('test_import/data/syntax_warnings.py') + with open(filename, 'rb') as f: + source = f.read() + with warnings.catch_warnings(record=True) as wlog: + warnings.simplefilter('error') + warnings.filterwarnings('always', module=r'\z') + exec(source, {}) + self.assertEqual(sorted(wm.lineno for wm in wlog), [4, 7, 10, 13, 14, 21]) + for wm in wlog: + self.assertEqual(wm.filename, '') + self.assertIs(wm.category, SyntaxWarning) + + with warnings.catch_warnings(record=True) as wlog: + warnings.simplefilter('error') + warnings.filterwarnings('always', module=r'\z') + exec(source, {'__name__': 'package.module', '__file__': filename}) + self.assertEqual(sorted(wm.lineno for wm in wlog), [4, 7, 10, 13, 14, 21]) + for wm in wlog: + self.assertEqual(wm.filename, '') + self.assertIs(wm.category, SyntaxWarning) + def test_filter(self): self.assertEqual(list(filter(lambda c: 'a' <= c <= 'z', 'Hello World')), list('elloorld')) diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py index 784c45aa96f8a7..f8115cc8300df7 100644 --- a/Lib/test/test_cmd_line_script.py +++ b/Lib/test/test_cmd_line_script.py @@ -810,6 +810,19 @@ def test_script_as_dev_fd(self): out, err = p.communicate() self.assertEqual(out, b"12345678912345678912345\n") + def test_filter_syntax_warnings_by_module(self): + filename = support.findfile('test_import/data/syntax_warnings.py') + rc, out, err = assert_python_ok( + '-Werror', + '-Walways:::test.test_import.data.syntax_warnings', + filename) + self.assertEqual(err.count(b': SyntaxWarning: '), 6) + + rc, out, err = assert_python_ok( + '-Werror', + 
'-Walways:::syntax_warnings', + filename) + self.assertEqual(err.count(b': SyntaxWarning: '), 6) def tearDownModule(): diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py index 846d38ae561fc5..9c2364491fe08d 100644 --- a/Lib/test/test_compile.py +++ b/Lib/test/test_compile.py @@ -1745,6 +1745,20 @@ def test_compile_warning_in_finally(self): self.assertEqual(wm.category, SyntaxWarning) self.assertIn("\"is\" with 'int' literal", str(wm.message)) + def test_filter_syntax_warnings_by_module(self): + filename = support.findfile('test_import/data/syntax_warnings.py') + with open(filename, 'rb') as f: + source = f.read() + module_re = r'test\.test_import\.data\.syntax_warnings\z' + with warnings.catch_warnings(record=True) as wlog: + warnings.simplefilter('error') + warnings.filterwarnings('always', module=module_re) + compile(source, filename, 'exec') + self.assertEqual(sorted(wm.lineno for wm in wlog), [4, 7, 10, 13, 14, 21]) + for wm in wlog: + self.assertEqual(wm.filename, filename) + self.assertIs(wm.category, SyntaxWarning) + @support.subTests('src', [ textwrap.dedent(""" def f(): diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py index 95121debbbfa74..072021e595975a 100644 --- a/Lib/test/test_import/__init__.py +++ b/Lib/test/test_import/__init__.py @@ -15,6 +15,7 @@ import os import py_compile import random +import re import shutil import stat import subprocess @@ -23,6 +24,7 @@ import threading import time import types +import warnings import unittest from unittest import mock import _imp @@ -51,7 +53,7 @@ TESTFN, rmtree, temp_umask, TESTFN_UNENCODABLE) from test.support import script_helper from test.support import threading_helper -from test.test_importlib.util import uncache +from test.test_importlib.util import uncache, temporary_pycache_prefix from types import ModuleType try: import _testsinglephase @@ -412,7 +414,6 @@ def test_from_import_missing_attr_path_is_canonical(self): self.assertIsNotNone(cm.exception) def 
test_from_import_star_invalid_type(self): - import re with ready_to_import() as (name, path): with open(path, 'w', encoding='utf-8') as f: f.write("__all__ = [b'invalid_type']") @@ -1250,6 +1251,35 @@ class Spec2: origin = "a\x00b" _imp.create_dynamic(Spec2()) + def test_filter_syntax_warnings_by_module(self): + module_re = r'test\.test_import\.data\.syntax_warnings\z' + unload('test.test_import.data.syntax_warnings') + with (os_helper.temp_dir() as tmpdir, + temporary_pycache_prefix(tmpdir), + warnings.catch_warnings(record=True) as wlog): + warnings.simplefilter('error') + warnings.filterwarnings('always', module=module_re) + import test.test_import.data.syntax_warnings + self.assertEqual(sorted(wm.lineno for wm in wlog), [4, 7, 10, 13, 14, 21]) + filename = test.test_import.data.syntax_warnings.__file__ + for wm in wlog: + self.assertEqual(wm.filename, filename) + self.assertIs(wm.category, SyntaxWarning) + + module_re = r'syntax_warnings\z' + unload('test.test_import.data.syntax_warnings') + with (os_helper.temp_dir() as tmpdir, + temporary_pycache_prefix(tmpdir), + warnings.catch_warnings(record=True) as wlog): + warnings.simplefilter('error') + warnings.filterwarnings('always', module=module_re) + import test.test_import.data.syntax_warnings + self.assertEqual(sorted(wm.lineno for wm in wlog), [4, 7, 10, 13, 14, 21]) + filename = test.test_import.data.syntax_warnings.__file__ + for wm in wlog: + self.assertEqual(wm.filename, filename) + self.assertIs(wm.category, SyntaxWarning) + @skip_if_dont_write_bytecode class FilePermissionTests(unittest.TestCase): diff --git a/Lib/test/test_import/data/syntax_warnings.py b/Lib/test/test_import/data/syntax_warnings.py new file mode 100644 index 00000000000000..103f07b6187603 --- /dev/null +++ b/Lib/test/test_import/data/syntax_warnings.py @@ -0,0 +1,21 @@ +# Syntax warnings emitted in different parts of the Python compiler. 
+ +# Parser/lexer/lexer.c +x = 1or 0 # line 4 + +# Parser/tokenizer/helpers.c +'\z' # line 7 + +# Parser/string_parser.c +'\400' # line 10 + +# _PyCompile_Warn() in Python/codegen.c +assert(x, 'message') # line 13 +x is 1 # line 14 + +# _PyErr_EmitSyntaxWarning() in Python/ast_preprocess.c +def f(): + try: + pass + finally: + return 42 # line 21 diff --git a/Lib/test/test_symtable.py b/Lib/test/test_symtable.py index 943e63fc13c921..ef2c00e04b820c 100644 --- a/Lib/test/test_symtable.py +++ b/Lib/test/test_symtable.py @@ -5,6 +5,7 @@ import re import textwrap import symtable +import warnings import unittest from test import support @@ -586,6 +587,20 @@ def test__symtable_refleak(self): # check error path when 'compile_type' AC conversion failed self.assertRaises(TypeError, symtable.symtable, '', mortal_str, 1) + def test_filter_syntax_warnings_by_module(self): + filename = support.findfile('test_import/data/syntax_warnings.py') + with open(filename, 'rb') as f: + source = f.read() + module_re = r'test\.test_import\.data\.syntax_warnings\z' + with warnings.catch_warnings(record=True) as wlog: + warnings.simplefilter('error') + warnings.filterwarnings('always', module=module_re) + symtable.symtable(source, filename, 'exec') + self.assertEqual(sorted(wm.lineno for wm in wlog), [4, 7, 10]) + for wm in wlog: + self.assertEqual(wm.filename, filename) + self.assertIs(wm.category, SyntaxWarning) + class ComprehensionTests(unittest.TestCase): def get_identifiers_recursive(self, st, res): diff --git a/Lib/test/test_warnings/__init__.py b/Lib/test/test_warnings/__init__.py index 157852cfa91007..e6666ddc638037 100644 --- a/Lib/test/test_warnings/__init__.py +++ b/Lib/test/test_warnings/__init__.py @@ -249,10 +249,23 @@ def test_filter_module(self): self.module.warn_explicit('msg', UserWarning, 'filename', 42, module='package.module') self.assertEqual(len(w), 1) + self.module.warn_explicit('msg', UserWarning, '/path/to/package/module', 42) + self.assertEqual(len(w), 2) + 
self.module.warn_explicit('msg', UserWarning, '/path/to/package/module.py', 42) + self.assertEqual(len(w), 3) + self.module.warn_explicit('msg', UserWarning, '/path/to/package/module/__init__.py', 42) + self.assertEqual(len(w), 4) with self.assertRaises(UserWarning): - self.module.warn_explicit('msg', UserWarning, '/path/to/package/module', 42) - with self.assertRaises(UserWarning): - self.module.warn_explicit('msg', UserWarning, '/path/to/package/module.py', 42) + self.module.warn_explicit('msg', UserWarning, '/path/to/package/module/__init__', 42) + if MS_WINDOWS: + self.module.warn_explicit('msg', UserWarning, r'C:\path\to\package\module.PY', 42) + self.assertEqual(len(w), 5) + self.module.warn_explicit('msg', UserWarning, r'C:\path\to\package\module\__INIT__.PY', 42) + self.assertEqual(len(w), 6) + self.module.warn_explicit('msg', UserWarning, r'C:\path\to\package\module.PYW', 42) + self.assertEqual(len(w), 7) + self.module.warn_explicit('msg', UserWarning, r'C:\path\to\package\module\__INIT__.PYW', 42) + self.assertEqual(len(w), 8) with self.module.catch_warnings(record=True) as w: self.module.simplefilter('error') @@ -276,9 +289,8 @@ def test_filter_module(self): with self.assertRaises(UserWarning): self.module.warn_explicit('msg', UserWarning, '/PATH/TO/PACKAGE/MODULE', 42) if MS_WINDOWS: - if self.module is py_warnings: - self.module.warn_explicit('msg', UserWarning, r'/path/to/package/module.PY', 42) - self.assertEqual(len(w), 3) + self.module.warn_explicit('msg', UserWarning, r'/path/to/package/module.PY', 42) + self.assertEqual(len(w), 3) with self.assertRaises(UserWarning): self.module.warn_explicit('msg', UserWarning, r'/path/to/package/module/__init__.py', 42) with self.assertRaises(UserWarning): @@ -302,9 +314,8 @@ def test_filter_module(self): self.assertEqual(len(w), 1) self.module.warn_explicit('msg', UserWarning, r'C:\path\to\package\module.py', 42) self.assertEqual(len(w), 2) - if self.module is py_warnings: - self.module.warn_explicit('msg', 
UserWarning, r'C:\path\to\package\module.PY', 42) - self.assertEqual(len(w), 3) + self.module.warn_explicit('msg', UserWarning, r'C:\path\to\package\module.PY', 42) + self.assertEqual(len(w), 3) with self.assertRaises(UserWarning): self.module.warn_explicit('msg', UserWarning, r'C:\path\to\package\module.pyw', 42) with self.assertRaises(UserWarning): @@ -399,7 +410,7 @@ def test_message_matching(self): def test_mutate_filter_list(self): class X: - def match(self, a): + def match(self, a, start=0): L[:] = [] L = [("default",X(),UserWarning,X(),0) for i in range(2)] diff --git a/Misc/NEWS.d/next/Library/2025-10-16-17-17-20.gh-issue-135801.faH3fa.rst b/Misc/NEWS.d/next/Library/2025-10-16-17-17-20.gh-issue-135801.faH3fa.rst new file mode 100644 index 00000000000000..d680312d5829fb --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-10-16-17-17-20.gh-issue-135801.faH3fa.rst @@ -0,0 +1,6 @@ +Improve filtering by module in :func:`warnings.warn_explicit` if no *module* +argument is passed. It now tests the module regular expression in the +warnings filter not only against the filename with ``.py`` stripped, but +also against module names constructed starting from different parent +directories of the filename (with ``/__init__.py``, ``.py`` and, on Windows, +``.pyw`` stripped). 
diff --git a/Python/_warnings.c b/Python/_warnings.c index 9989b623dbce3a..d44d414bc93a04 100644 --- a/Python/_warnings.c +++ b/Python/_warnings.c @@ -171,7 +171,7 @@ _PyWarnings_InitState(PyInterpreterState *interp) /*************************************************************************/ static int -check_matched(PyInterpreterState *interp, PyObject *obj, PyObject *arg) +check_matched(PyInterpreterState *interp, PyObject *obj, PyObject *arg, PyObject *arg2) { PyObject *result; int rc; @@ -182,6 +182,9 @@ check_matched(PyInterpreterState *interp, PyObject *obj, PyObject *arg) /* An internal plain text default filter must match exactly */ if (PyUnicode_CheckExact(obj)) { + if (arg == NULL) { + return 0; + } int cmp_result = PyUnicode_Compare(obj, arg); if (cmp_result == -1 && PyErr_Occurred()) { return -1; @@ -190,10 +193,19 @@ check_matched(PyInterpreterState *interp, PyObject *obj, PyObject *arg) } /* Otherwise assume a regex filter and call its match() method */ - result = PyObject_CallMethodOneArg(obj, &_Py_ID(match), arg); + if (arg != NULL) { + result = PyObject_CallMethodOneArg(obj, &_Py_ID(match), arg); + } + else { + PyObject *match = PyImport_ImportModuleAttrString("_py_warnings", "_match_filename"); + if (match == NULL) { + return -1; + } + result = PyObject_CallFunctionObjArgs(match, obj, arg2, NULL); + Py_DECREF(match); + } if (result == NULL) return -1; - rc = PyObject_IsTrue(result); Py_DECREF(result); return rc; @@ -423,7 +435,7 @@ get_default_action(PyInterpreterState *interp) static bool filter_search(PyInterpreterState *interp, PyObject *category, PyObject *text, Py_ssize_t lineno, - PyObject *module, char *list_name, PyObject *filters, + PyObject *module, PyObject *filename, char *list_name, PyObject *filters, PyObject **item, PyObject **matched_action) { bool result = true; *matched_action = NULL; @@ -459,14 +471,14 @@ filter_search(PyInterpreterState *interp, PyObject *category, break; } - good_msg = check_matched(interp, msg, text); + 
good_msg = check_matched(interp, msg, text, NULL); if (good_msg == -1) { Py_DECREF(tmp_item); result = false; break; } - good_mod = check_matched(interp, mod, module); + good_mod = check_matched(interp, mod, module, filename); if (good_mod == -1) { Py_DECREF(tmp_item); result = false; @@ -504,7 +516,7 @@ filter_search(PyInterpreterState *interp, PyObject *category, static PyObject* get_filter(PyInterpreterState *interp, PyObject *category, PyObject *text, Py_ssize_t lineno, - PyObject *module, PyObject **item) + PyObject *module, PyObject *filename, PyObject **item) { #ifdef Py_DEBUG WarningsState *st = warnings_get_state(interp); @@ -522,7 +534,7 @@ get_filter(PyInterpreterState *interp, PyObject *category, use_global_filters = true; } else { PyObject *context_action = NULL; - if (!filter_search(interp, category, text, lineno, module, "_warnings_context _filters", + if (!filter_search(interp, category, text, lineno, module, filename, "_warnings_context _filters", context_filters, item, &context_action)) { Py_DECREF(context_filters); return NULL; @@ -541,7 +553,7 @@ get_filter(PyInterpreterState *interp, PyObject *category, if (filters == NULL) { return NULL; } - if (!filter_search(interp, category, text, lineno, module, "filters", + if (!filter_search(interp, category, text, lineno, module, filename, "filters", filters, item, &action)) { return NULL; } @@ -612,39 +624,6 @@ already_warned(PyInterpreterState *interp, PyObject *registry, PyObject *key, return 0; } -/* New reference. */ -static PyObject * -normalize_module(PyObject *filename) -{ - PyObject *module; - int kind; - const void *data; - Py_ssize_t len; - - len = PyUnicode_GetLength(filename); - if (len < 0) - return NULL; - - if (len == 0) - return PyUnicode_FromString(""); - - kind = PyUnicode_KIND(filename); - data = PyUnicode_DATA(filename); - - /* if filename.endswith(".py"): */ - if (len >= 3 && - PyUnicode_READ(kind, data, len-3) == '.' 
&& - PyUnicode_READ(kind, data, len-2) == 'p' && - PyUnicode_READ(kind, data, len-1) == 'y') - { - module = PyUnicode_Substring(filename, 0, len-3); - } - else { - module = Py_NewRef(filename); - } - return module; -} - static int update_registry(PyInterpreterState *interp, PyObject *registry, PyObject *text, PyObject *category, int add_zero) @@ -812,15 +791,6 @@ warn_explicit(PyThreadState *tstate, PyObject *category, PyObject *message, return NULL; } - /* Normalize module. */ - if (module == NULL) { - module = normalize_module(filename); - if (module == NULL) - return NULL; - } - else - Py_INCREF(module); - /* Normalize message. */ Py_INCREF(message); /* DECREF'ed in cleanup. */ if (PyObject_TypeCheck(message, (PyTypeObject *)PyExc_Warning)) { @@ -858,7 +828,7 @@ warn_explicit(PyThreadState *tstate, PyObject *category, PyObject *message, /* Else this warning hasn't been generated before. */ } - action = get_filter(interp, category, text, lineno, module, &item); + action = get_filter(interp, category, text, lineno, module, filename, &item); if (action == NULL) goto cleanup; @@ -921,7 +891,6 @@ warn_explicit(PyThreadState *tstate, PyObject *category, PyObject *message, Py_XDECREF(key); Py_XDECREF(text); Py_XDECREF(lineno_obj); - Py_DECREF(module); Py_XDECREF(message); return result; /* Py_None or NULL. 
*/ } From a3ce2f77f0813c214896ec66be3a26121f52361e Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:31:47 +0000 Subject: [PATCH 12/13] gh-55531: Implement `normalize_encoding` in C (#136643) Closes gh-55531 --- Lib/encodings/__init__.py | 14 +--- ...5-07-14-09-33-17.gh-issue-55531.Gt2e12.rst | 4 ++ Modules/_codecsmodule.c | 42 ++++++++++++ Modules/clinic/_codecsmodule.c.h | 66 ++++++++++++++++++- Objects/unicodeobject.c | 15 +++-- Python/fileutils.c | 4 +- 6 files changed, 123 insertions(+), 22 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index 298177eb8003a7..e7e4ca3358e0f9 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -30,6 +30,7 @@ import codecs import sys +from _codecs import _normalize_encoding from . import aliases _cache = {} @@ -55,18 +56,7 @@ def normalize_encoding(encoding): if isinstance(encoding, bytes): encoding = str(encoding, "ascii") - chars = [] - punct = False - for c in encoding: - if c.isalnum() or c == '.': - if punct and chars: - chars.append('_') - if c.isascii(): - chars.append(c) - punct = False - else: - punct = True - return ''.join(chars) + return _normalize_encoding(encoding) def search_function(encoding): diff --git a/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst new file mode 100644 index 00000000000000..70e39a4f2c167c --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst @@ -0,0 +1,4 @@ +:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance +by implementing the function in C using the private +``_Py_normalize_encoding`` which has been modified to make lowercase +conversion optional. 
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index bdffeced7da5a9..2f2edbb05ab5c5 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -1018,6 +1018,47 @@ _codecs_lookup_error_impl(PyObject *module, const char *name) return PyCodec_LookupError(name); } +extern int _Py_normalize_encoding(const char *, char *, size_t, int); + +/*[clinic input] +_codecs._normalize_encoding + encoding: unicode + +Normalize an encoding name *encoding*. + +Used for encodings.normalize_encoding. Does not convert to lower case. +[clinic start generated code]*/ + +static PyObject * +_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding) +/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/ +{ + Py_ssize_t len; + const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len); + if (cstr == NULL) { + return NULL; + } + + if (len > PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_OverflowError, "encoding is too large"); + return NULL; + } + + char *normalized = PyMem_Malloc(len + 1); + if (normalized == NULL) { + return PyErr_NoMemory(); + } + + if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) { + PyMem_Free(normalized); + return NULL; + } + + PyObject *result = PyUnicode_FromString(normalized); + PyMem_Free(normalized); + return result; +} + /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { @@ -1067,6 +1108,7 @@ static PyMethodDef _codecs_functions[] = { _CODECS_REGISTER_ERROR_METHODDEF _CODECS__UNREGISTER_ERROR_METHODDEF _CODECS_LOOKUP_ERROR_METHODDEF + _CODECS__NORMALIZE_ENCODING_METHODDEF {NULL, NULL} /* sentinel */ }; diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h index b0310325759326..9e2a7950ebde64 100644 --- a/Modules/clinic/_codecsmodule.c.h +++ b/Modules/clinic/_codecsmodule.c.h @@ -2779,6 +2779,70 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) return return_value; } 
+PyDoc_STRVAR(_codecs__normalize_encoding__doc__, +"_normalize_encoding($module, /, encoding)\n" +"--\n" +"\n" +"Normalize an encoding name *encoding*.\n" +"\n" +"Used for encodings.normalize_encoding. Does not convert to lower case."); + +#define _CODECS__NORMALIZE_ENCODING_METHODDEF \ + {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__}, + +static PyObject * +_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding); + +static PyObject * +_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(encoding), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"encoding", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "_normalize_encoding", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + PyObject *encoding; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!PyUnicode_Check(args[0])) { + _PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]); + goto exit; + } + encoding = args[0]; + return_value = _codecs__normalize_encoding_impl(module, encoding); + +exit: + return return_value; +} + #ifndef _CODECS_MBCS_DECODE_METHODDEF #define _CODECS_MBCS_DECODE_METHODDEF #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */ 
@@ -2802,4 +2866,4 @@ _codecs_lookup_error(PyObject *module, PyObject *arg) #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF #define _CODECS_CODE_PAGE_ENCODE_METHODDEF #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */ -/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1c443e88e05029..4e8c132327b7d0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3449,13 +3449,14 @@ PyUnicode_FromEncodedObject(PyObject *obj, return v; } -/* Normalize an encoding name: similar to encodings.normalize_encoding(), but - also convert to lowercase. Return 1 on success, or 0 on error (encoding is - longer than lower_len-1). */ +/* Normalize an encoding name like encodings.normalize_encoding(), + and additionally convert it to lowercase if *to_lower* is true. + Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */ int _Py_normalize_encoding(const char *encoding, char *lower, - size_t lower_len) + size_t lower_len, + int to_lower) { const char *e; char *l; @@ -3486,7 +3487,7 @@ _Py_normalize_encoding(const char *encoding, if (l == l_end) { return 0; } - *l++ = Py_TOLOWER(c); + *l++ = to_lower ?
Py_TOLOWER(c) : c; } else { punct = 1; @@ -3521,7 +3522,7 @@ PyUnicode_Decode(const char *s, } /* Shortcuts for common default encodings */ - if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { + if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) { char *lower = buflower; /* Fast paths */ @@ -3778,7 +3779,7 @@ PyUnicode_AsEncodedString(PyObject *unicode, } /* Shortcuts for common default encodings */ - if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) { + if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) { char *lower = buflower; /* Fast paths */ diff --git a/Python/fileutils.c b/Python/fileutils.c index b808229716fd9c..93abd70a34d420 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -178,7 +178,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs) #define USE_FORCE_ASCII -extern int _Py_normalize_encoding(const char *, char *, size_t); +extern int _Py_normalize_encoding(const char *, char *, size_t, int); /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale and POSIX locale. 
nl_langinfo(CODESET) announces an alias of the @@ -229,7 +229,7 @@ check_force_ascii(void) } char encoding[20]; /* longest name: "iso_646.irv_1991\0" */ - if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) { + if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) { goto error; } From 4e6dba0ef74523a52f66547c16b9972664b18fd4 Mon Sep 17 00:00:00 2001 From: yihong Date: Thu, 30 Oct 2025 23:14:06 +0800 Subject: [PATCH 13/13] gh-139246: zero-width word paste can be wrong in default repl (GH-139254) Signed-off-by: yihong0618 Co-authored-by: grayjk --- Lib/_pyrepl/utils.py | 6 +++++ Lib/test/test_pyrepl/test_utils.py | 23 ++++++++++++++++++- ...-09-23-09-46-46.gh-issue-139246.pzfM-w.rst | 1 + 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2025-09-23-09-46-46.gh-issue-139246.pzfM-w.rst diff --git a/Lib/_pyrepl/utils.py b/Lib/_pyrepl/utils.py index 64708e843b685b..06cddef851bb40 100644 --- a/Lib/_pyrepl/utils.py +++ b/Lib/_pyrepl/utils.py @@ -63,6 +63,12 @@ class ColorSpan(NamedTuple): def str_width(c: str) -> int: if ord(c) < 128: return 1 + # gh-139246 for zero-width joiner and combining characters + if unicodedata.combining(c): + return 0 + category = unicodedata.category(c) + if category == "Cf" and c != "\u00ad": + return 0 w = unicodedata.east_asian_width(c) if w in ("N", "Na", "H", "A"): return 1 diff --git a/Lib/test/test_pyrepl/test_utils.py b/Lib/test/test_pyrepl/test_utils.py index 05a4f329059835..656a1e441e0e47 100644 --- a/Lib/test/test_pyrepl/test_utils.py +++ b/Lib/test/test_pyrepl/test_utils.py @@ -5,10 +5,29 @@ class TestUtils(TestCase): def test_str_width(self): - characters = ['a', '1', '_', '!', '\x1a', '\u263A', '\uffb9'] + characters = [ + 'a', + '1', + '_', + '!', + '\x1a', + '\u263A', + '\uffb9', + '\N{LATIN SMALL LETTER E WITH ACUTE}', # é + '\N{LATIN SMALL LETTER E WITH CEDILLA}', # ȩ + '\u00ad', + ] for c in characters: self.assertEqual(str_width(c), 1) + 
zero_width_characters = [ + '\N{COMBINING ACUTE ACCENT}', + '\N{ZERO WIDTH JOINER}', + ] + for c in zero_width_characters: + with self.subTest(character=c): + self.assertEqual(str_width(c), 0) + characters = [chr(99989), chr(99999)] for c in characters: self.assertEqual(str_width(c), 2) @@ -25,6 +44,8 @@ def test_wlen(self): self.assertEqual(wlen('hello'), 5) self.assertEqual(wlen('hello' + '\x1a'), 7) + self.assertEqual(wlen('e\N{COMBINING ACUTE ACCENT}'), 1) + self.assertEqual(wlen('a\N{ZERO WIDTH JOINER}b'), 2) def test_prev_next_window(self): def gen_normal(): diff --git a/Misc/NEWS.d/next/Library/2025-09-23-09-46-46.gh-issue-139246.pzfM-w.rst b/Misc/NEWS.d/next/Library/2025-09-23-09-46-46.gh-issue-139246.pzfM-w.rst new file mode 100644 index 00000000000000..a816bda5cfe8e8 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-09-23-09-46-46.gh-issue-139246.pzfM-w.rst @@ -0,0 +1 @@ +Fix the display width computed for zero-width characters (combining marks and format characters such as the zero-width joiner) pasted into the default REPL.