Skip to content

Commit 89b2b3d

Browse files
Improve memory efficiency by discarding references to objects immediately
When "killing" or "removing" elements from a tree, it's important not to keep references to them in the lists where they're collected. Previously, memory consumption was quadratic: each dropped element's tail was combined into the text of its parent or previous element, but the original tails could not be freed because the _kill and _remove lists still held references to the dropped elements. For more info see: https://bugs.launchpad.net/lxml/+bug/1889653 Co-authored-by: Miro Hrončok <[email protected]>
1 parent da9d66c commit 89b2b3d

File tree

4 files changed

+45
-11
lines changed

4 files changed

+45
-11
lines changed

lxml_html_clean/clean.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import copy
1010
import re
11+
from collections import deque
1112
from urllib.parse import urlsplit, unquote_plus
1213

1314
from lxml import etree
@@ -383,8 +384,8 @@ def __call__(self, doc):
383384
if self.annoying_tags:
384385
remove_tags.update(('blink', 'marquee'))
385386

386-
_remove = []
387-
_kill = []
387+
_remove = deque()
388+
_kill = deque()
388389
for el in doc.iter():
389390
if el.tag in kill_tags:
390391
if self.allow_element(el):
@@ -398,22 +399,22 @@ def __call__(self, doc):
398399
if _remove and _remove[0] == doc:
399400
# We have to drop the parent-most tag, which we can't
400401
# do. Instead we'll rewrite it:
401-
el = _remove.pop(0)
402+
el = _remove.popleft()
402403
el.tag = 'div'
403404
el.attrib.clear()
404405
elif _kill and _kill[0] == doc:
405406
# We have to drop the parent-most element, which we can't
406407
# do. Instead we'll clear it:
407-
el = _kill.pop(0)
408+
el = _kill.popleft()
408409
if el.tag != 'html':
409410
el.tag = 'div'
410411
el.clear()
411412

412-
_kill.reverse() # start with innermost tags
413-
for el in _kill:
414-
el.drop_tree()
415-
for el in _remove:
416-
el.drop_tag()
413+
while _kill:
414+
_kill.popleft().drop_tree() # popleft to start with innermost elements
415+
416+
while _remove:
417+
_remove.pop().drop_tag()
417418

418419
if self.remove_unknown_tags:
419420
if allow_tags:

tests/test_clean.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import lxml.html
77
from lxml_html_clean import Cleaner, clean_html
8+
from .utils import peak_memory_usage
89

910

1011
class CleanerTest(unittest.TestCase):
@@ -333,3 +334,15 @@ def test_ascii_control_chars_removed(self):
333334
expected = """<a href="">Link</a>"""
334335
cleaner = Cleaner()
335336
self.assertEqual(expected, cleaner.clean_html(html))
337+
338+
def test_memory_usage_many_elements_with_long_tails(self):
    """Check that cleaning many comments with long tails stays under 10 MiB.

    The document is 5,000 repetitions of an HTML comment followed by ten
    whitespace-only lines, so every comment carries a long tail.
    """
    # One whitespace-only line is ten tabs plus a newline; ten of them
    # follow each comment.
    tail = ("\t" * 10 + "\n") * 10
    unit = "<!-- foo bar baz -->\n" + tail
    html = f"<html>{unit * 5_000}</html>"

    # Measure the memory growth of a single clean_html() call on a
    # default-configured Cleaner.
    mem = peak_memory_usage(Cleaner().clean_html, html)

    self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")

tests/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import unittest
2+
3+
4+
def peak_memory_usage(func, *args, **kwargs):
    """
    Run ``func(*args, **kwargs)`` under memory-profiler and report its memory growth.

    The call is sampled via ``memory_profiler.memory_usage`` (``interval=0.1``,
    no timeout). The result is the difference between the highest and the
    lowest sample, i.e. the peak memory used by the call, in MiB.

    Returns ``float("inf")`` if the profiled call raises ``MemoryError``.

    Raises ``unittest.SkipTest`` if memory-profiler cannot be imported, so a
    calling test is skipped instead of failing.
    """
    try:
        # Imported lazily: memory-profiler is an optional test dependency.
        from memory_profiler import memory_usage as sample_memory  # type: ignore
    except ImportError:
        raise unittest.SkipTest("memory-profiler is not available")

    # memory_usage() accepts the callable as a (func, args, kwargs) tuple.
    target = (func, args, kwargs)
    try:
        samples = sample_memory(target, interval=0.1, timeout=None)
    except MemoryError:
        # Running out of memory counts as unbounded usage.
        return float("inf")

    # Subtract the lowest sample so only the growth during the call is reported.
    return max(samples) - min(samples)

tox.ini

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@ skipsdist = True
44

55
[testenv]
66
commands =
7-
python -m unittest tests.test_clean
7+
python -m unittest -v tests.test_clean
88
python -m doctest tests/test_clean_embed.txt tests/test_clean.txt tests/test_autolink.txt
9-
deps = lxml
9+
deps =
10+
lxml
11+
memory_profiler
1012

1113
[testenv:mypy]
1214
commands =

0 commit comments

Comments
 (0)