Skip to content

Commit 5f027ad

Browse files
Update URL Preview code to work with lxml 6.0.0 (#18622)
1 parent e6dbbbb commit 5f027ad

File tree

2 files changed

+27
-4
lines changed

2 files changed

+27
-4
lines changed

changelog.d/18622.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Update URL Preview code to work with `lxml` 6.0.0+.

synapse/media/preview_html.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def decode_body(
133133
content_type: The Content-Type header.
134134
135135
Returns:
136-
The parsed HTML body, or None if an error occurred during processed.
136+
The parsed HTML body, or None if an error occurred during processing.
137137
"""
138138
# If there's no body, nothing useful is going to be found.
139139
if not body:
@@ -158,9 +158,31 @@ def decode_body(
158158
# Create an HTML parser.
159159
parser = etree.HTMLParser(recover=True, encoding=encoding)
160160

161-
# Attempt to parse the body. Returns None if the body was successfully
162-
# parsed, but no tree was found.
163-
return etree.fromstring(body, parser)
161+
# Attempt to parse the body. With `lxml` 6.0.0+, this will be an empty HTML
162+
# tree if the body was successfully parsed, but no tree was found. In
163+
# previous `lxml` versions, `etree.fromstring` would return `None` in that
164+
# case.
165+
html_tree = etree.fromstring(body, parser)
166+
167+
# Account for the above referenced case where `html_tree` is an HTML tree
168+
# with an empty body. If so, return None.
169+
if html_tree is not None and html_tree.tag == "html":
170+
# If the tree has only a single <body> element and it's empty, then
171+
# return None.
172+
body_el = html_tree.find("body")
173+
if body_el is not None and len(html_tree) == 1:
174+
# Extract the content of the body tag as text.
175+
body_text = "".join(cast(Iterable[str], body_el.itertext()))
176+
177+
# Strip any undecodable Unicode characters and whitespace.
178+
body_text = body_text.strip("\ufffd").strip()
179+
180+
# If there's no text left, and there were no child tags,
181+
# then we consider the <body> tag empty.
182+
if not body_text and len(body_el) == 0:
183+
return None
184+
185+
return html_tree
164186

165187

166188
def _get_meta_tags(

0 commit comments

Comments
 (0)