From 3c430ec58afc2dae872e2cff26af441b0c3888e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 26 Sep 2025 17:15:00 +0200 Subject: [PATCH 1/2] [3.14] gh-90949: add Expat API to prevent XML deadly allocations (CVE-2025-59375) (GH-139234) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the XML Expat 2.7.2 mitigation APIs to disallow use of disproportional amounts of dynamic memory from within an Expat parser (see CVE-2025-59375 for instance). The exposed APIs are available on Expat parsers, that is, parsers created by `xml.parsers.expat.ParserCreate()`, as: - `parser.SetAllocTrackerActivationThreshold(threshold)`, and - `parser.SetAllocTrackerMaximumAmplification(max_factor)`. (cherry picked from commit f04bea44c37793561d753dd4ca6e7cd658137553) Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Doc/library/pyexpat.rst | 57 +++++ Include/pyexpat.h | 5 + Lib/test/test_pyexpat.py | 200 +++++++++++++++++- ...5-09-22-14-40-11.gh-issue-90949.UM35nb.rst | 5 + Modules/clinic/pyexpat.c.h | 136 +++++++++++- Modules/expat/pyexpatns.h | 2 + Modules/pyexpat.c | 193 ++++++++++++++--- 7 files changed, 569 insertions(+), 29 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst diff --git a/Doc/library/pyexpat.rst b/Doc/library/pyexpat.rst index 5506ac828e5abe..4c21bc875217b9 100644 --- a/Doc/library/pyexpat.rst +++ b/Doc/library/pyexpat.rst @@ -72,6 +72,13 @@ The :mod:`xml.parsers.expat` module contains two functions: *encoding* [1]_ is given it will override the implicit or explicit encoding of the document. + .. _xmlparser-non-root: + + Parsers created through :func:`!ParserCreate` are called "root" parsers, + in the sense that they do not have any parent parser attached. Non-root + parsers are created by :meth:`parser.ExternalEntityParserCreate + `. + Expat can optionally do XML namespace processing for you, enabled by providing a value for *namespace_separator*. The value must be a one-character string; a :exc:`ValueError` will be raised if the string has an illegal length (``None`` @@ -231,6 +238,55 @@ XMLParser Objects .. versionadded:: 3.13 +:class:`!xmlparser` objects have the following methods to mitigate some +common XML vulnerabilities. + +.. method:: xmlparser.SetAllocTrackerActivationThreshold(threshold, /) + + Sets the number of allocated bytes of dynamic memory needed to activate + protection against disproportionate use of RAM. + + By default, parser objects have an allocation activation threshold of 64 MiB, + or equivalently 67,108,864 bytes. + + An :exc:`ExpatError` is raised if this method is called on a + |xml-non-root-parser| parser. + The corresponding :attr:`~ExpatError.lineno` and :attr:`~ExpatError.offset` + should not be used as they may have no special meaning. + + .. versionadded:: next + +.. method:: xmlparser.SetAllocTrackerMaximumAmplification(max_factor, /) + + Sets the maximum amplification factor between direct input and bytes + of dynamic memory allocated. + + The amplification factor is calculated as ``allocated / direct`` + while parsing, where ``direct`` is the number of bytes read from + the primary document in parsing and ``allocated`` is the number + of bytes of dynamic memory allocated in the parser hierarchy. + + The *max_factor* value must be a non-NaN :class:`float` value greater than + or equal to 1.0. Amplification factors greater than 100.0 can be observed + near the start of parsing even with benign files in practice. In particular, + the activation threshold should be carefully chosen to avoid false positives. + + By default, parser objects have a maximum amplification factor of 100.0. + + An :exc:`ExpatError` is raised if this method is called on a + |xml-non-root-parser| parser or if *max_factor* is outside the valid range. + The corresponding :attr:`~ExpatError.lineno` and :attr:`~ExpatError.offset` + should not be used as they may have no special meaning. + + .. note:: + + The maximum amplification factor is only considered if the threshold + that can be adjusted :meth:`.SetAllocTrackerActivationThreshold` is + exceeded. + + .. versionadded:: next + + :class:`xmlparser` objects have the following attributes: @@ -954,3 +1010,4 @@ The ``errors`` module has the following attributes: not. See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl and https://www.iana.org/assignments/character-sets/character-sets.xhtml. +.. |xml-non-root-parser| replace:: :ref:`non-root ` diff --git a/Include/pyexpat.h b/Include/pyexpat.h index 9824d099c3df7d..04548b7684a2fd 100644 --- a/Include/pyexpat.h +++ b/Include/pyexpat.h @@ -52,6 +52,11 @@ struct PyExpat_CAPI int (*SetHashSalt)(XML_Parser parser, unsigned long hash_salt); /* might be NULL for expat < 2.6.0 */ XML_Bool (*SetReparseDeferralEnabled)(XML_Parser parser, XML_Bool enabled); + /* might be NULL for expat < 2.7.2 */ + XML_Bool (*SetAllocTrackerActivationThreshold)( + XML_Parser parser, unsigned long long activationThresholdBytes); + XML_Bool (*SetAllocTrackerMaximumAmplification)( + XML_Parser parser, float maxAmplificationFactor); /* always add new stuff to the end! */ }; diff --git a/Lib/test/test_pyexpat.py b/Lib/test/test_pyexpat.py index 1d56ccd71cf962..836e6ff412f4ca 100644 --- a/Lib/test/test_pyexpat.py +++ b/Lib/test/test_pyexpat.py @@ -1,14 +1,18 @@ # XXX TypeErrors on calling handlers, or on bad return values from a # handler, are obscure and unhelpful. +import abc +import functools import os +import re import sys import sysconfig +import textwrap import unittest import traceback from io import BytesIO from test import support -from test.support import os_helper +from test.support import import_helper, os_helper from xml.parsers import expat from xml.parsers.expat import errors @@ -809,5 +813,199 @@ def start_element(name, _): self.assertEqual(started, ['doc']) +class AttackProtectionTestBase(abc.ABC): + """ + Base class for testing protections against XML payloads with + disproportionate amplification. + + The protections being tested should detect and prevent attacks + that leverage disproportionate amplification from small inputs. + """ + + @staticmethod + def exponential_expansion_payload(*, nrows, ncols, text='.'): + """Create a billion laughs attack payload. + + Be careful: the number of total items is pow(n, k), thereby + requiring at least pow(ncols, nrows) * sizeof(text) memory! + """ + template = textwrap.dedent(f"""\ + + + + {{body}} + ]> + &row{nrows}; + """).rstrip() + + body = '\n'.join( + f'' + for i in range(nrows) + ) + body = textwrap.indent(body, ' ' * 4) + return template.format(body=body) + + def test_payload_generation(self): + # self-test for exponential_expansion_payload() + payload = self.exponential_expansion_payload(nrows=2, ncols=3) + self.assertEqual(payload, textwrap.dedent("""\ + + + + + + ]> + &row2; + """).rstrip()) + + def assert_root_parser_failure(self, func, /, *args, **kwargs): + """Check that func(*args, **kwargs) is invalid for a sub-parser.""" + msg = "parser must be a root parser" + self.assertRaisesRegex(expat.ExpatError, msg, func, *args, **kwargs) + + @abc.abstractmethod + def assert_rejected(self, func, /, *args, **kwargs): + """Assert that func(*args, **kwargs) triggers the attack protection. + + Note: this method must ensure that the attack protection being tested + is the one that is actually triggered at runtime, e.g., by matching + the exact error message. + """ + + @abc.abstractmethod + def set_activation_threshold(self, parser, threshold): + """Set the activation threshold for the tested protection.""" + + @abc.abstractmethod + def set_maximum_amplification(self, parser, max_factor): + """Set the maximum amplification factor for the tested protection.""" + + @abc.abstractmethod + def test_set_activation_threshold__threshold_reached(self): + """Test when the activation threshold is exceeded.""" + + @abc.abstractmethod + def test_set_activation_threshold__threshold_not_reached(self): + """Test when the activation threshold is not exceeded.""" + + def test_set_activation_threshold__invalid_threshold_type(self): + parser = expat.ParserCreate() + setter = functools.partial(self.set_activation_threshold, parser) + + self.assertRaises(TypeError, setter, 1.0) + self.assertRaises(TypeError, setter, -1.5) + self.assertRaises(ValueError, setter, -5) + + def test_set_activation_threshold__invalid_threshold_range(self): + _testcapi = import_helper.import_module("_testcapi") + parser = expat.ParserCreate() + setter = functools.partial(self.set_activation_threshold, parser) + + self.assertRaises(OverflowError, setter, _testcapi.ULLONG_MAX + 1) + + def test_set_activation_threshold__fail_for_subparser(self): + parser = expat.ParserCreate() + subparser = parser.ExternalEntityParserCreate(None) + setter = functools.partial(self.set_activation_threshold, subparser) + self.assert_root_parser_failure(setter, 12345) + + @abc.abstractmethod + def test_set_maximum_amplification__amplification_exceeded(self): + """Test when the amplification factor is exceeded.""" + + @abc.abstractmethod + def test_set_maximum_amplification__amplification_not_exceeded(self): + """Test when the amplification factor is not exceeded.""" + + def test_set_maximum_amplification__infinity(self): + inf = float('inf') # an 'inf' threshold is allowed by Expat + parser = expat.ParserCreate() + self.assertIsNone(self.set_maximum_amplification(parser, inf)) + + def test_set_maximum_amplification__invalid_max_factor_type(self): + parser = expat.ParserCreate() + setter = functools.partial(self.set_maximum_amplification, parser) + + self.assertRaises(TypeError, setter, None) + self.assertRaises(TypeError, setter, 'abc') + + def test_set_maximum_amplification__invalid_max_factor_range(self): + parser = expat.ParserCreate() + setter = functools.partial(self.set_maximum_amplification, parser) + + msg = re.escape("'max_factor' must be at least 1.0") + self.assertRaisesRegex(expat.ExpatError, msg, setter, float('nan')) + self.assertRaisesRegex(expat.ExpatError, msg, setter, 0.99) + + def test_set_maximum_amplification__fail_for_subparser(self): + parser = expat.ParserCreate() + subparser = parser.ExternalEntityParserCreate(None) + setter = functools.partial(self.set_maximum_amplification, subparser) + self.assert_root_parser_failure(setter, 123.45) + + +@unittest.skipIf(expat.version_info < (2, 7, 2), "requires Expat >= 2.7.2") +class MemoryProtectionTest(AttackProtectionTestBase, unittest.TestCase): + + # NOTE: with the default Expat configuration, the billion laughs protection + # may hit before the allocation limiter if exponential_expansion_payload() + # is not carefully parametrized. As such, the payloads should be chosen so + # that either the allocation limiter is hit before other protections are + # triggered or no protection at all is triggered. + + def assert_rejected(self, func, /, *args, **kwargs): + """Check that func(*args, **kwargs) hits the allocation limit.""" + msg = r"out of memory: line \d+, column \d+" + self.assertRaisesRegex(expat.ExpatError, msg, func, *args, **kwargs) + + def set_activation_threshold(self, parser, threshold): + return parser.SetAllocTrackerActivationThreshold(threshold) + + def set_maximum_amplification(self, parser, max_factor): + return parser.SetAllocTrackerMaximumAmplification(max_factor) + + def test_set_activation_threshold__threshold_reached(self): + parser = expat.ParserCreate() + # Choose a threshold expected to be always reached. + self.set_activation_threshold(parser, 3) + # Check that the threshold is reached by choosing a small factor + # and a payload whose peak amplification factor exceeds it. + self.assertIsNone(self.set_maximum_amplification(parser, 1.0)) + payload = self.exponential_expansion_payload(ncols=10, nrows=4) + self.assert_rejected(parser.Parse, payload, True) + + def test_set_activation_threshold__threshold_not_reached(self): + parser = expat.ParserCreate() + # Choose a threshold expected to be never reached. + self.set_activation_threshold(parser, pow(10, 5)) + # Check that the threshold is reached by choosing a small factor + # and a payload whose peak amplification factor exceeds it. + self.assertIsNone(self.set_maximum_amplification(parser, 1.0)) + payload = self.exponential_expansion_payload(ncols=10, nrows=4) + self.assertIsNotNone(parser.Parse(payload, True)) + + def test_set_maximum_amplification__amplification_exceeded(self): + parser = expat.ParserCreate() + # Unconditionally enable maximum activation factor. + self.set_activation_threshold(parser, 0) + # Choose a max amplification factor expected to always be exceeded. + self.assertIsNone(self.set_maximum_amplification(parser, 1.0)) + # Craft a payload for which the peak amplification factor is > 1.0. + payload = self.exponential_expansion_payload(ncols=1, nrows=2) + self.assert_rejected(parser.Parse, payload, True) + + def test_set_maximum_amplification__amplification_not_exceeded(self): + parser = expat.ParserCreate() + # Unconditionally enable maximum activation factor. + self.set_activation_threshold(parser, 0) + # Choose a max amplification factor expected to never be exceeded. + self.assertIsNone(self.set_maximum_amplification(parser, 1e4)) + # Craft a payload for which the peak amplification factor is < 1e4. + payload = self.exponential_expansion_payload(ncols=1, nrows=2) + self.assertIsNotNone(parser.Parse(payload, True)) + + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst b/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst new file mode 100644 index 00000000000000..0719d4353fb708 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst @@ -0,0 +1,5 @@ +Add :func:`~xml.parsers.expat.xmlparser.SetAllocTrackerActivationThreshold` +and :func:`~xml.parsers.expat.xmlparser.SetAllocTrackerMaximumAmplification` +to :ref:`xmlparser ` objects to prevent use of +disproportional amounts of dynamic memory from within an Expat parser. +Patch by Bénédikt Tran. diff --git a/Modules/clinic/pyexpat.c.h b/Modules/clinic/pyexpat.c.h index 13210e3be0f747..e178547060446e 100644 --- a/Modules/clinic/pyexpat.c.h +++ b/Modules/clinic/pyexpat.c.h @@ -6,6 +6,7 @@ preserve # include "pycore_gc.h" // PyGC_Head # include "pycore_runtime.h" // _Py_SINGLETON() #endif +#include "pycore_long.h" // _PyLong_UnsignedLongLong_Converter() #include "pycore_modsupport.h" // _PyArg_UnpackKeywords() PyDoc_STRVAR(pyexpat_xmlparser_SetReparseDeferralEnabled__doc__, @@ -408,6 +409,131 @@ pyexpat_xmlparser_UseForeignDTD(PyObject *self, PyTypeObject *cls, PyObject *con #endif /* (XML_COMBINED_VERSION >= 19505) */ +#if (XML_COMBINED_VERSION >= 20702) + +PyDoc_STRVAR(pyexpat_xmlparser_SetAllocTrackerActivationThreshold__doc__, +"SetAllocTrackerActivationThreshold($self, threshold, /)\n" +"--\n" +"\n" +"Sets the number of allocated bytes of dynamic memory needed to activate protection against disproportionate use of RAM.\n" +"\n" +"By default, parser objects have an allocation activation threshold of 64 MiB."); + +#define PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF \ + {"SetAllocTrackerActivationThreshold", _PyCFunction_CAST(pyexpat_xmlparser_SetAllocTrackerActivationThreshold), METH_METHOD|METH_FASTCALL|METH_KEYWORDS, pyexpat_xmlparser_SetAllocTrackerActivationThreshold__doc__}, + +static PyObject * +pyexpat_xmlparser_SetAllocTrackerActivationThreshold_impl(xmlparseobject *self, + PyTypeObject *cls, + unsigned long long threshold); + +static PyObject * +pyexpat_xmlparser_SetAllocTrackerActivationThreshold(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + # define KWTUPLE (PyObject *)&_Py_SINGLETON(tuple_empty) + #else + # define KWTUPLE NULL + #endif + + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "SetAllocTrackerActivationThreshold", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + unsigned long long threshold; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!_PyLong_UnsignedLongLong_Converter(args[0], &threshold)) { + goto exit; + } + return_value = pyexpat_xmlparser_SetAllocTrackerActivationThreshold_impl((xmlparseobject *)self, cls, threshold); + +exit: + return return_value; +} + +#endif /* (XML_COMBINED_VERSION >= 20702) */ + +#if (XML_COMBINED_VERSION >= 20702) + +PyDoc_STRVAR(pyexpat_xmlparser_SetAllocTrackerMaximumAmplification__doc__, +"SetAllocTrackerMaximumAmplification($self, max_factor, /)\n" +"--\n" +"\n" +"Sets the maximum amplification factor between direct input and bytes of dynamic memory allocated.\n" +"\n" +"The amplification factor is calculated as \"allocated / direct\" while parsing,\n" +"where \"direct\" is the number of bytes read from the primary document in parsing\n" +"and \"allocated\" is the number of bytes of dynamic memory allocated in the parser\n" +"hierarchy.\n" +"\n" +"The \'max_factor\' value must be a non-NaN floating point value greater than\n" +"or equal to 1.0. Amplification factors greater than 100.0 can be observed\n" +"near the start of parsing even with benign files in practice. In particular,\n" +"the activation threshold should be carefully chosen to avoid false positives.\n" +"\n" +"By default, parser objects have a maximum amplification factor of 100.0."); + +#define PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF \ + {"SetAllocTrackerMaximumAmplification", _PyCFunction_CAST(pyexpat_xmlparser_SetAllocTrackerMaximumAmplification), METH_METHOD|METH_FASTCALL|METH_KEYWORDS, pyexpat_xmlparser_SetAllocTrackerMaximumAmplification__doc__}, + +static PyObject * +pyexpat_xmlparser_SetAllocTrackerMaximumAmplification_impl(xmlparseobject *self, + PyTypeObject *cls, + float max_factor); + +static PyObject * +pyexpat_xmlparser_SetAllocTrackerMaximumAmplification(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + # define KWTUPLE (PyObject *)&_Py_SINGLETON(tuple_empty) + #else + # define KWTUPLE NULL + #endif + + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "SetAllocTrackerMaximumAmplification", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + float max_factor; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (PyFloat_CheckExact(args[0])) { + max_factor = (float) (PyFloat_AS_DOUBLE(args[0])); + } + else + { + max_factor = (float) PyFloat_AsDouble(args[0]); + if (max_factor == -1.0 && PyErr_Occurred()) { + goto exit; + } + } + return_value = pyexpat_xmlparser_SetAllocTrackerMaximumAmplification_impl((xmlparseobject *)self, cls, max_factor); + +exit: + return return_value; +} + +#endif /* (XML_COMBINED_VERSION >= 20702) */ + PyDoc_STRVAR(pyexpat_ParserCreate__doc__, "ParserCreate($module, /, encoding=None, namespace_separator=None,\n" " intern=)\n" @@ -552,4 +678,12 @@ pyexpat_ErrorString(PyObject *module, PyObject *arg) #ifndef PYEXPAT_XMLPARSER_USEFOREIGNDTD_METHODDEF #define PYEXPAT_XMLPARSER_USEFOREIGNDTD_METHODDEF #endif /* !defined(PYEXPAT_XMLPARSER_USEFOREIGNDTD_METHODDEF) */ -/*[clinic end generated code: output=4dbdc959c67dc2d5 input=a9049054013a1b77]*/ + +#ifndef PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF + #define PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF +#endif /* !defined(PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF) */ + +#ifndef PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF + #define PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF +#endif /* !defined(PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF) */ +/*[clinic end generated code: output=e73935658c04c83e input=a9049054013a1b77]*/ diff --git a/Modules/expat/pyexpatns.h b/Modules/expat/pyexpatns.h index 8ee03ef0792815..fc6b482d587e0d 100644 --- a/Modules/expat/pyexpatns.h +++ b/Modules/expat/pyexpatns.h @@ -82,6 +82,8 @@ #define XmlPrologStateInit PyExpat_XmlPrologStateInit #define XmlPrologStateInitExternalEntity PyExpat_XmlPrologStateInitExternalEntity #define XML_ResumeParser PyExpat_XML_ResumeParser +#define XML_SetAllocTrackerActivationThreshold PyExpat_XML_SetAllocTrackerActivationThreshold +#define XML_SetAllocTrackerMaximumAmplification PyExpat_XML_SetAllocTrackerMaximumAmplification #define XML_SetAttlistDeclHandler PyExpat_XML_SetAttlistDeclHandler #define XML_SetBase PyExpat_XML_SetBase #define XML_SetBillionLaughsAttackProtectionActivationThreshold PyExpat_XML_SetBillionLaughsAttackProtectionActivationThreshold diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index fa153d86543e99..3433c48bc38bef 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -9,6 +9,8 @@ #include #include // offsetof() + +#include "expat_config.h" #include "expat.h" #include "pyexpat.h" @@ -120,50 +122,99 @@ CALL_XML_HANDLER_SETTER(const struct HandlerInfo *handler_info, setter(xml_parser, xml_handler); } +static int +set_error_code(PyObject *err, enum XML_Error code) +{ + PyObject *v = PyLong_FromLong((long)code); + int ok = v != NULL && PyObject_SetAttr(err, &_Py_ID(code), v) != -1; + Py_XDECREF(v); + return ok; +} + /* Set an integer attribute on the error object; return true on success, * false on an exception. */ static int -set_error_attr(PyObject *err, const char *name, int value) +set_error_location(PyObject *err, const char *name, XML_Size value) { - PyObject *v = PyLong_FromLong(value); + PyObject *v = PyLong_FromSize_t((size_t)value); + int ok = v != NULL && PyObject_SetAttrString(err, name, v) != -1; + Py_XDECREF(v); + return ok; +} - if (v == NULL || PyObject_SetAttrString(err, name, v) == -1) { - Py_XDECREF(v); - return 0; + +static PyObject * +format_xml_error(enum XML_Error code, XML_Size lineno, XML_Size column) +{ + const char *errmsg = XML_ErrorString(code); + PyUnicodeWriter *writer = PyUnicodeWriter_Create(strlen(errmsg) + 1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_Format(writer, + "%s: line %zu, column %zu", + errmsg, (size_t)lineno, (size_t)column) < 0) + { + PyUnicodeWriter_Discard(writer); + return NULL; } - Py_DECREF(v); - return 1; + return PyUnicodeWriter_Finish(writer); } +static PyObject * +set_xml_error(pyexpat_state *state, + enum XML_Error code, XML_Size lineno, XML_Size column, + const char *errmsg) +{ + PyObject *arg = errmsg == NULL + ? format_xml_error(code, lineno, column) + : PyUnicode_FromStringAndSize(errmsg, strlen(errmsg)); + if (arg == NULL) { + return NULL; + } + PyObject *res = PyObject_CallOneArg(state->error, arg); + Py_DECREF(arg); + if ( + res != NULL + && set_error_code(res, code) + && set_error_location(res, "lineno", lineno) + && set_error_location(res, "offset", column) + ) { + PyErr_SetObject(state->error, res); + } + Py_XDECREF(res); + return NULL; +} + +#define SET_XML_ERROR(STATE, SELF, CODE, ERRMSG) \ + do { \ + XML_Parser parser = SELF->itself; \ + assert(parser != NULL); \ + XML_Size lineno = XML_GetCurrentLineNumber(parser); \ + XML_Size column = XML_GetCurrentColumnNumber(parser); \ + (void)set_xml_error(state, CODE, lineno, column, ERRMSG); \ + } while (0) + /* Build and set an Expat exception, including positioning * information. Always returns NULL. */ static PyObject * set_error(pyexpat_state *state, xmlparseobject *self, enum XML_Error code) { - PyObject *err; - PyObject *buffer; - XML_Parser parser = self->itself; - int lineno = XML_GetErrorLineNumber(parser); - int column = XML_GetErrorColumnNumber(parser); + SET_XML_ERROR(state, self, code, NULL); + return NULL; +} - buffer = PyUnicode_FromFormat("%s: line %i, column %i", - XML_ErrorString(code), lineno, column); - if (buffer == NULL) - return NULL; - err = PyObject_CallOneArg(state->error, buffer); - Py_DECREF(buffer); - if ( err != NULL - && set_error_attr(err, "code", code) - && set_error_attr(err, "offset", column) - && set_error_attr(err, "lineno", lineno)) { - PyErr_SetObject(state->error, err); - } - Py_XDECREF(err); +static PyObject * +set_invalid_arg(pyexpat_state *state, xmlparseobject *self, const char *errmsg) +{ + SET_XML_ERROR(state, self, XML_ERROR_INVALID_ARGUMENT, errmsg); return NULL; } +#undef SET_XML_ERROR + static int have_handler(xmlparseobject *self, int type) { @@ -1125,6 +1176,87 @@ pyexpat_xmlparser_UseForeignDTD_impl(xmlparseobject *self, PyTypeObject *cls, } #endif +#if XML_COMBINED_VERSION >= 20702 +/*[clinic input] +pyexpat.xmlparser.SetAllocTrackerActivationThreshold + + cls: defining_class + threshold: unsigned_long_long + / + +Sets the number of allocated bytes of dynamic memory needed to activate protection against disproportionate use of RAM. + +By default, parser objects have an allocation activation threshold of 64 MiB. +[clinic start generated code]*/ + +static PyObject * +pyexpat_xmlparser_SetAllocTrackerActivationThreshold_impl(xmlparseobject *self, + PyTypeObject *cls, + unsigned long long threshold) +/*[clinic end generated code: output=bed7e93207ba08c5 input=9c706b75c18e4ea1]*/ +{ + assert(self->itself != NULL); + if (XML_SetAllocTrackerActivationThreshold(self->itself, threshold) == XML_TRUE) { + Py_RETURN_NONE; + } + // XML_SetAllocTrackerActivationThreshold() can only fail if self->itself + // is not a root parser (currently, this is equivalent to be created + // by ExternalEntityParserCreate()). + pyexpat_state *state = PyType_GetModuleState(cls); + return set_invalid_arg(state, self, "parser must be a root parser"); +} +#endif + +#if XML_COMBINED_VERSION >= 20702 +/*[clinic input] +pyexpat.xmlparser.SetAllocTrackerMaximumAmplification + + cls: defining_class + max_factor: float + / + +Sets the maximum amplification factor between direct input and bytes of dynamic memory allocated. + +The amplification factor is calculated as "allocated / direct" while parsing, +where "direct" is the number of bytes read from the primary document in parsing +and "allocated" is the number of bytes of dynamic memory allocated in the parser +hierarchy. + +The 'max_factor' value must be a non-NaN floating point value greater than +or equal to 1.0. Amplification factors greater than 100.0 can be observed +near the start of parsing even with benign files in practice. In particular, +the activation threshold should be carefully chosen to avoid false positives. + +By default, parser objects have a maximum amplification factor of 100.0. +[clinic start generated code]*/ + +static PyObject * +pyexpat_xmlparser_SetAllocTrackerMaximumAmplification_impl(xmlparseobject *self, + PyTypeObject *cls, + float max_factor) +/*[clinic end generated code: output=6e44bd48c9b112a0 input=918b9266b490a722]*/ +{ + assert(self->itself != NULL); + if (XML_SetAllocTrackerMaximumAmplification(self->itself, max_factor) == XML_TRUE) { + Py_RETURN_NONE; + } + // XML_SetAllocTrackerMaximumAmplification() can fail if self->itself + // is not a root parser (currently, this is equivalent to be created + // by ExternalEntityParserCreate()) or if 'max_factor' is NaN or < 1.0. + // + // Expat does not provide a way to determine whether a parser is a root + // or not, nor does it provide a way to distinguish between failures in + // XML_SetAllocTrackerMaximumAmplification() (see gh-90949), we manually + // detect the factor out-of-range issue here so that users have a better + // error message. + pyexpat_state *state = PyType_GetModuleState(cls); + const char *message = (isnan(max_factor) || max_factor < 1.0f) + ? "'max_factor' must be at least 1.0" + : "parser must be a root parser"; + return set_invalid_arg(state, self, message); +} +#endif + static struct PyMethodDef xmlparse_methods[] = { PYEXPAT_XMLPARSER_PARSE_METHODDEF PYEXPAT_XMLPARSER_PARSEFILE_METHODDEF @@ -1133,9 +1265,9 @@ static struct PyMethodDef xmlparse_methods[] = { PYEXPAT_XMLPARSER_GETINPUTCONTEXT_METHODDEF PYEXPAT_XMLPARSER_EXTERNALENTITYPARSERCREATE_METHODDEF PYEXPAT_XMLPARSER_SETPARAMENTITYPARSING_METHODDEF -#if XML_COMBINED_VERSION >= 19505 PYEXPAT_XMLPARSER_USEFOREIGNDTD_METHODDEF -#endif + PYEXPAT_XMLPARSER_SETALLOCTRACKERACTIVATIONTHRESHOLD_METHODDEF + PYEXPAT_XMLPARSER_SETALLOCTRACKERMAXIMUMAMPLIFICATION_METHODDEF PYEXPAT_XMLPARSER_SETREPARSEDEFERRALENABLED_METHODDEF PYEXPAT_XMLPARSER_GETREPARSEDEFERRALENABLED_METHODDEF {NULL, NULL} /* sentinel */ @@ -2141,6 +2273,13 @@ pyexpat_exec(PyObject *mod) #else capi->SetReparseDeferralEnabled = NULL; #endif +#if XML_COMBINED_VERSION >= 20702 + capi->SetAllocTrackerActivationThreshold = XML_SetAllocTrackerActivationThreshold; + capi->SetAllocTrackerMaximumAmplification = XML_SetAllocTrackerMaximumAmplification; +#else + capi->SetAllocTrackerActivationThreshold = NULL; + capi->SetAllocTrackerMaximumAmplification = NULL; +#endif /* export using capsule */ PyObject *capi_object = PyCapsule_New(capi, PyExpat_CAPSULE_NAME, From b52fc8829759e9ae63b19b857c892665c3f29ca1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 26 Sep 2025 19:59:40 +0200 Subject: [PATCH 2/2] gh-90949: amend GH-139234 in prevision of future mitigation API (#139366) Fix some typos left in f04bea44c37793561d753dd4ca6e7cd658137553, and simplify some internal functions to ease maintenance of future mitigation APIs. --- Doc/library/pyexpat.rst | 4 +- ...5-09-22-14-40-11.gh-issue-90949.UM35nb.rst | 4 +- Modules/pyexpat.c | 121 ++++++++++-------- 3 files changed, 72 insertions(+), 57 deletions(-) diff --git a/Doc/library/pyexpat.rst b/Doc/library/pyexpat.rst index 4c21bc875217b9..0b08a9b0dedef6 100644 --- a/Doc/library/pyexpat.rst +++ b/Doc/library/pyexpat.rst @@ -281,8 +281,8 @@ common XML vulnerabilities. .. note:: The maximum amplification factor is only considered if the threshold - that can be adjusted :meth:`.SetAllocTrackerActivationThreshold` is - exceeded. + that can be adjusted by :meth:`.SetAllocTrackerActivationThreshold` + is exceeded. .. versionadded:: next diff --git a/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst b/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst index 0719d4353fb708..5611f33fb8e37b 100644 --- a/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst +++ b/Misc/NEWS.d/next/Library/2025-09-22-14-40-11.gh-issue-90949.UM35nb.rst @@ -1,5 +1,5 @@ -Add :func:`~xml.parsers.expat.xmlparser.SetAllocTrackerActivationThreshold` -and :func:`~xml.parsers.expat.xmlparser.SetAllocTrackerMaximumAmplification` +Add :meth:`~xml.parsers.expat.xmlparser.SetAllocTrackerActivationThreshold` +and :meth:`~xml.parsers.expat.xmlparser.SetAllocTrackerMaximumAmplification` to :ref:`xmlparser ` objects to prevent use of disproportional amounts of dynamic memory from within an Expat parser. Patch by Bénédikt Tran. diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 3433c48bc38bef..a0cbbc23150b7c 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -123,7 +123,7 @@ CALL_XML_HANDLER_SETTER(const struct HandlerInfo *handler_info, } static int -set_error_code(PyObject *err, enum XML_Error code) +set_xml_error_attr_code(PyObject *err, enum XML_Error code) { PyObject *v = PyLong_FromLong((long)code); int ok = v != NULL && PyObject_SetAttr(err, &_Py_ID(code), v) != -1; @@ -135,7 +135,7 @@ set_error_code(PyObject *err, enum XML_Error code) * false on an exception. */ static int -set_error_location(PyObject *err, const char *name, XML_Size value) +set_xml_error_attr_location(PyObject *err, const char *name, XML_Size value) { PyObject *v = PyLong_FromSize_t((size_t)value); int ok = v != NULL && PyObject_SetAttrString(err, name, v) != -1; @@ -144,32 +144,22 @@ set_error_location(PyObject *err, const char *name, XML_Size value) } -static PyObject * -format_xml_error(enum XML_Error code, XML_Size lineno, XML_Size column) -{ - const char *errmsg = XML_ErrorString(code); - PyUnicodeWriter *writer = PyUnicodeWriter_Create(strlen(errmsg) + 1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_Format(writer, - "%s: line %zu, column %zu", - errmsg, (size_t)lineno, (size_t)column) < 0) - { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); -} - static PyObject * set_xml_error(pyexpat_state *state, enum XML_Error code, XML_Size lineno, XML_Size column, const char *errmsg) { - PyObject *arg = errmsg == NULL - ? format_xml_error(code, lineno, column) - : PyUnicode_FromStringAndSize(errmsg, strlen(errmsg)); + PyObject *arg; + if (errmsg == NULL) { + arg = PyUnicode_FromFormat( + "%s: line %zu, column %zu", + XML_ErrorString(code), + (size_t)lineno, (size_t)column + ); + } + else { + arg = PyUnicode_FromStringAndSize(errmsg, strlen(errmsg)); + } if (arg == NULL) { return NULL; } @@ -177,9 +167,9 @@ set_xml_error(pyexpat_state *state, Py_DECREF(arg); if ( res != NULL - && set_error_code(res, code) - && set_error_location(res, "lineno", lineno) - && set_error_location(res, "offset", column) + && set_xml_error_attr_code(res, code) + && set_xml_error_attr_location(res, "lineno", lineno) + && set_xml_error_attr_location(res, "offset", column) ) { PyErr_SetObject(state->error, res); } @@ -1176,6 +1166,50 @@ pyexpat_xmlparser_UseForeignDTD_impl(xmlparseobject *self, PyTypeObject *cls, } #endif +#if XML_COMBINED_VERSION >= 20702 +static PyObject * +set_activation_threshold(xmlparseobject *self, + PyTypeObject *cls, + unsigned long long threshold, + XML_Bool (*setter)(XML_Parser, unsigned long long)) +{ + assert(self->itself != NULL); + if (setter(self->itself, threshold) == XML_TRUE) { + Py_RETURN_NONE; + } + // The setter fails if self->itself is NULL (which is not possible here) + // or is a non-root parser, which currently only happens for parsers + // created by ExternalEntityParserCreate(). + pyexpat_state *state = PyType_GetModuleState(cls); + return set_invalid_arg(state, self, "parser must be a root parser"); +} + +static PyObject * +set_maximum_amplification(xmlparseobject *self, + PyTypeObject *cls, + float max_factor, + XML_Bool (*setter)(XML_Parser, float)) +{ + assert(self->itself != NULL); + if (setter(self->itself, max_factor) == XML_TRUE) { + Py_RETURN_NONE; + } + // The setter fails if self->itself is NULL (which is not possible here), + // is a non-root parser, which currently only happens for parsers created + // by ExternalEntityParserCreate(), or if 'max_factor' is NaN or < 1.0. + pyexpat_state *state = PyType_GetModuleState(cls); + // Note: Expat has no API to determine whether a parser is a root parser, + // and since the Expat functions for defining the various maximum allowed + // amplifcation factors fail when a bad parser or an out-of-range factor + // is given without specifying which check failed, we check whether the + // factor is out-of-range to improve the error message. See also gh-90949. + const char *message = (isnan(max_factor) || max_factor < 1.0f) + ? "'max_factor' must be at least 1.0" + : "parser must be a root parser"; + return set_invalid_arg(state, self, message); +} +#endif + #if XML_COMBINED_VERSION >= 20702 /*[clinic input] pyexpat.xmlparser.SetAllocTrackerActivationThreshold @@ -1195,15 +1229,10 @@ pyexpat_xmlparser_SetAllocTrackerActivationThreshold_impl(xmlparseobject *self, unsigned long long threshold) /*[clinic end generated code: output=bed7e93207ba08c5 input=9c706b75c18e4ea1]*/ { - assert(self->itself != NULL); - if (XML_SetAllocTrackerActivationThreshold(self->itself, threshold) == XML_TRUE) { - Py_RETURN_NONE; - } - // XML_SetAllocTrackerActivationThreshold() can only fail if self->itself - // is not a root parser (currently, this is equivalent to be created - // by ExternalEntityParserCreate()). - pyexpat_state *state = PyType_GetModuleState(cls); - return set_invalid_arg(state, self, "parser must be a root parser"); + return set_activation_threshold( + self, cls, threshold, + XML_SetAllocTrackerActivationThreshold + ); } #endif @@ -1236,24 +1265,10 @@ pyexpat_xmlparser_SetAllocTrackerMaximumAmplification_impl(xmlparseobject *self, float max_factor) /*[clinic end generated code: output=6e44bd48c9b112a0 input=918b9266b490a722]*/ { - assert(self->itself != NULL); - if (XML_SetAllocTrackerMaximumAmplification(self->itself, max_factor) == XML_TRUE) { - Py_RETURN_NONE; - } - // XML_SetAllocTrackerMaximumAmplification() can fail if self->itself - // is not a root parser (currently, this is equivalent to be created - // by ExternalEntityParserCreate()) or if 'max_factor' is NaN or < 1.0. - // - // Expat does not provide a way to determine whether a parser is a root - // or not, nor does it provide a way to distinguish between failures in - // XML_SetAllocTrackerMaximumAmplification() (see gh-90949), we manually - // detect the factor out-of-range issue here so that users have a better - // error message. - pyexpat_state *state = PyType_GetModuleState(cls); - const char *message = (isnan(max_factor) || max_factor < 1.0f) - ? "'max_factor' must be at least 1.0" - : "parser must be a root parser"; - return set_invalid_arg(state, self, message); + return set_maximum_amplification( + self, cls, max_factor, + XML_SetAllocTrackerMaximumAmplification + ); } #endif