Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions Lib/test/test_pyexpat.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,48 @@ def resolve_entity(context, base, system_id, public_id):
self.assertEqual(handler_call_args, [("bar", "baz")])


class ParentParserLifetimeTest(unittest.TestCase):
"""
Subparsers make use of their parent XML_Parser inside of Expat.
As a result, parent parsers need to outlive subparsers.

See https://github.com/python/cpython/issues/139400.
"""

def test_parent_parser_outlives_its_subparsers__single(self):
# Scenario: one parent parser with exactly one subparser.
# There are no assertions on purpose; the test passes as long as
# dropping the parent before the subparser completes normally.
# NOTE(review): presumably the failure mode without the fix is an
# interpreter crash rather than an exception -- confirm against the
# issue linked in the class docstring.
parser = expat.ParserCreate()
# Only the local name `subparser` keeps the child alive from here on;
# the child itself is never used for parsing in this test.
subparser = parser.ExternalEntityParserCreate(None)

# Now try to cause garbage collection of the parent parser
# while it's still being referenced by a related subparser
del parser
# `subparser` is released last, when the method returns.

def test_parent_parser_outlives_its_subparsers__multiple(self):
# Scenario: one parent parser shared by two sibling subparsers.
# There are no assertions on purpose; the test passes as long as
# dropping the parent while both children exist completes normally.
parser = expat.ParserCreate()
# Both children are created from the same parent and are kept alive
# only by these two local names.
subparser_one = parser.ExternalEntityParserCreate(None)
subparser_two = parser.ExternalEntityParserCreate(None)

# Now try to cause garbage collection of the parent parser
# while it's still being referenced by a related subparser
del parser
# Both subparsers are released afterwards, when the method returns.

def test_parent_parser_outlives_its_subparsers__chain(self):
# Scenario: a three-level chain parser -> subparser -> subsubparser,
# where the middle parser is both a child and a parent.
# There are no assertions on purpose; the test passes as long as
# dropping both ancestors before the innermost parser completes
# normally.
parser = expat.ParserCreate()
subparser = parser.ExternalEntityParserCreate(None)
# The grandchild is created from the child, not from the root parser.
subsubparser = subparser.ExternalEntityParserCreate(None)

# Now try to cause garbage collection of the parent parsers
# while they are still being referenced by a related subparser
# The root is dropped first, then the middle parser, so that only
# `subsubparser` is still bound to a local name afterwards.
del parser
del subparser

def test_cycle(self):
parser = expat.ParserCreate()
subparser = parser.ExternalEntityParserCreate(None)
parser.StartElementHandler = lambda _1, _2: subparser
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
parser.StartElementHandler = lambda _1, _2: subparser
handler = lambda _1, _2: None
handler.subparser = subparser
parser.StartElementHandler = handler

I don't know if it's sufficient for the handler to have a strong reference to the subparser if it's only returned.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@picnixz I have an idea how to maybe find out. Will try in the next hour.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want me to wait for you to find out or can I merge the PR?

Copy link
Contributor Author

@hartwork hartwork Oct 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@picnixz That depends on what the goal is: to merge only once we are sure that the test creates a cycle, or to merge now and potentially improve that test in a follow-up pull request (by you or me)? The two ideas I had for finding out were:

  • your pointer to -X showrefcount
  • gc.get_referrers and gc.get_referents

I'm not getting anywhere with the latter yet; my current impression is that the test does not yet actually create a cycle. How would you like to move forward?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@picnixz update: I extended the test_cycle test now, please let me know what you think 🍻

Copy link
Member

@picnixz picnixz Oct 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My intent was just to create a test where cycles are possible. The simplest way to create a cycle is to have something that references itself. Since it's not possible to mutate the type itself, I suggested that the handler create the cycle itself. The handlers are callables, and when we call Py_VISIT, the default visitor function would visit their attributes, and one of these attributes would be the parser that we stored.

But I'm not entirely sure that my reasoning here is accurate. At least, that's how I always dealt with reference cycles and how I detected crashes, mostly by adding attributes that have references to the object or its type I'm trying to free.

Possibility one: use the following snippet:

import pyexpat as expat

p = expat.ParserCreate()
s = p.ExternalEntityParserCreate(None)
func = lambda *a, **kw: None
func.s = s
func.p = p
func.t = type(p)
p.StartElementHandler = func
p.Parse("<xml></xml>", True)

I think this should be enough to create cycles. But since I don't know how to check this (using gc in tests is not advised in general), the second possibility is to remove this test and investigate more later (I would appreciate if we had a debug option to see the cyclic isolates that the GC finds).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@picnixz I believe we want to limit ourselves to code we are certain about and also merge this pull request today. Let's drop the test then until we really know. Pushing…

parser.Parse('<doc/>', True)


class ReparseDeferralTest(unittest.TestCase):
def test_getter_setter_round_trip(self):
parser = expat.ParserCreate()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
:mod:`xml.parsers.expat`: Make sure that parent Expat parsers are only
garbage-collected once they are no longer referenced by subparsers created
by :meth:`~xml.parsers.expat.xmlparser.ExternalEntityParserCreate`.
Patch by Sebastian Pipping.
21 changes: 21 additions & 0 deletions Modules/pyexpat.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@ typedef struct {
PyObject_HEAD

XML_Parser itself;
/*
* Strong reference to a parent `xmlparseobject` if this parser
* is a child parser. Set to NULL if this parser is a root parser.
* This is needed to keep the parent parser alive as long as it has
* at least one child parser.
*
* See https://github.com/python/cpython/issues/139400 for details.
*/
PyObject *parent;
int ordered_attributes; /* Return attributes as a list. */
int specified_attributes; /* Report only specified attributes. */
int in_callback; /* Is a callback active? */
Expand Down Expand Up @@ -1065,6 +1074,11 @@ pyexpat_xmlparser_ExternalEntityParserCreate_impl(xmlparseobject *self,
return NULL;
}

// The new subparser will make use of the parent XML_Parser inside of Expat.
// So we need to take subparsers into account with the reference counting
// of their parent parser.
Py_INCREF(self);

new_parser->buffer_size = self->buffer_size;
new_parser->buffer_used = 0;
new_parser->buffer = NULL;
Expand All @@ -1074,18 +1088,21 @@ pyexpat_xmlparser_ExternalEntityParserCreate_impl(xmlparseobject *self,
new_parser->ns_prefixes = self->ns_prefixes;
new_parser->itself = XML_ExternalEntityParserCreate(self->itself, context,
encoding);
new_parser->parent = (PyObject *)self;
new_parser->handlers = 0;
new_parser->intern = Py_XNewRef(self->intern);

if (self->buffer != NULL) {
new_parser->buffer = PyMem_Malloc(new_parser->buffer_size);
if (new_parser->buffer == NULL) {
Py_DECREF(new_parser);
Py_DECREF(self);
return PyErr_NoMemory();
}
}
if (!new_parser->itself) {
Py_DECREF(new_parser);
Py_DECREF(self);
return PyErr_NoMemory();
}

Expand All @@ -1099,6 +1116,7 @@ pyexpat_xmlparser_ExternalEntityParserCreate_impl(xmlparseobject *self,
new_parser->handlers = PyMem_New(PyObject *, i);
if (!new_parser->handlers) {
Py_DECREF(new_parser);
Py_DECREF(self);
return PyErr_NoMemory();
}
clear_handlers(new_parser, 1);
Expand Down Expand Up @@ -1479,6 +1497,7 @@ newxmlparseobject(pyexpat_state *state, const char *encoding,
/* namespace_separator is either NULL or contains one char + \0 */
self->itself = XML_ParserCreate_MM(encoding, &ExpatMemoryHandler,
namespace_separator);
self->parent = NULL;
if (self->itself == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"XML_ParserCreate failed");
Expand Down Expand Up @@ -1515,6 +1534,7 @@ xmlparse_traverse(PyObject *op, visitproc visit, void *arg)
for (size_t i = 0; handler_info[i].name != NULL; i++) {
Py_VISIT(self->handlers[i]);
}
Py_VISIT(self->parent);
Py_VISIT(Py_TYPE(op));
return 0;
}
Expand All @@ -1538,6 +1558,7 @@ xmlparse_dealloc(PyObject *op)
XML_ParserFree(self->itself);
}
self->itself = NULL;
Py_CLEAR(self->parent);

if (self->handlers != NULL) {
PyMem_Free(self->handlers);
Expand Down
Loading