Skip to content
This repository was archived by the owner on Mar 26, 2024. It is now read-only.

Commit fa13080

Browse files
reivilibrerichvdh
andauthored
Merge pull request from GHSA-22p3-qrh9-cx32
* Make _iterate_over_text easier to read by using simple data structures * Prefer a set of tags to ignore In my tests, it's 4x faster to check for containment in a set of this size * Add a stack size limit to _iterate_over_text * Continue accepting the case where there is no body element * Use an early return instead for None Co-authored-by: Richard van der Hoff <[email protected]>
1 parent 21e6c0e commit fa13080

File tree

2 files changed

+56
-24
lines changed

2 files changed

+56
-24
lines changed

synapse/rest/media/v1/preview_html.py

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,9 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
import codecs
15-
import itertools
1615
import logging
1716
import re
18-
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
17+
from typing import TYPE_CHECKING, Dict, Generator, Iterable, List, Optional, Set, Union
1918

2019
if TYPE_CHECKING:
2120
from lxml import etree
@@ -276,7 +275,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
276275

277276
from lxml import etree
278277

279-
TAGS_TO_REMOVE = (
278+
TAGS_TO_REMOVE = {
280279
"header",
281280
"nav",
282281
"aside",
@@ -291,31 +290,42 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
291290
"img",
292291
"picture",
293292
etree.Comment,
294-
)
293+
}
295294

296295
# Split all the text nodes into paragraphs (by splitting on new
297296
# lines)
298297
text_nodes = (
299298
re.sub(r"\s+", "\n", el).strip()
300-
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
299+
for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
301300
)
302301
return summarize_paragraphs(text_nodes)
303302

304303

305304
def _iterate_over_text(
306-
tree: "etree.Element", *tags_to_ignore: Union[str, "etree.Comment"]
305+
tree: Optional["etree.Element"],
306+
tags_to_ignore: Set[Union[str, "etree.Comment"]],
307+
stack_limit: int = 1024,
307308
) -> Generator[str, None, None]:
308309
"""Iterate over the tree returning text nodes in a depth first fashion,
309310
skipping text nodes inside certain tags.
311+
312+
Args:
313+
tree: The parent element to iterate. Can be None if there isn't one.
314+
tags_to_ignore: Set of tags to ignore
315+
stack_limit: Maximum stack size limit for depth-first traversal.
316+
Nodes will be dropped if this limit is hit, which may truncate the
317+
textual result.
318+
Intended to limit the maximum working memory when generating a preview.
310319
"""
311-
# This is basically a stack that we extend using itertools.chain.
312-
# This will either consist of an element to iterate over *or* a string
320+
321+
if tree is None:
322+
return
323+
324+
# This is a stack whose items are elements to iterate over *or* strings
313325
# to be returned.
314-
elements = iter([tree])
315-
while True:
316-
el = next(elements, None)
317-
if el is None:
318-
return
326+
elements: List[Union[str, "etree.Element"]] = [tree]
327+
while elements:
328+
el = elements.pop()
319329

320330
if isinstance(el, str):
321331
yield el
@@ -329,17 +339,22 @@ def _iterate_over_text(
329339
if el.text:
330340
yield el.text
331341

332-
# We add to the stack all the elements children, interspersed with
333-
# each child's tail text (if it exists). The tail text of a node
334-
# is text that comes *after* the node, so we always include it even
335-
# if we ignore the child node.
336-
elements = itertools.chain(
337-
itertools.chain.from_iterable( # Basically a flatmap
338-
[child, child.tail] if child.tail else [child]
339-
for child in el.iterchildren()
340-
),
341-
elements,
342-
)
342+
# We add to the stack all the element's children, interspersed with
343+
# each child's tail text (if it exists).
344+
#
345+
# We iterate in reverse order so that earlier pieces of text appear
346+
# closer to the top of the stack.
347+
for child in el.iterchildren(reversed=True):
348+
if len(elements) > stack_limit:
349+
# We've hit our limit for working memory
350+
break
351+
352+
if child.tail:
353+
# The tail text of a node is text that comes *after* the node,
354+
# so we always include it even if we ignore the child node.
355+
elements.append(child.tail)
356+
357+
elements.append(child)
343358

344359

345360
def summarize_paragraphs(

tests/rest/media/v1/test_html_preview.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,23 @@ def test_windows_1252(self) -> None:
370370
og = parse_html_to_open_graph(tree)
371371
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
372372

373+
def test_nested_nodes(self) -> None:
374+
"""A body with some nested nodes. Tests that we iterate over children
375+
in the right order (and don't reverse the order of the text)."""
376+
html = b"""
377+
<a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
378+
with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
379+
"""
380+
tree = decode_body(html, "http://example.com/test.html")
381+
og = parse_html_to_open_graph(tree)
382+
self.assertEqual(
383+
og,
384+
{
385+
"og:title": None,
386+
"og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
387+
},
388+
)
389+
373390

374391
class MediaEncodingTestCase(unittest.TestCase):
375392
def test_meta_charset(self) -> None:

0 commit comments

Comments
 (0)