Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 0963d39

Browse files
authored
Handle additional errors when previewing URLs. (#9333)
* Handle the case of lxml not finding a document tree. * Parse the document encoding from the XML tag.
1 parent b0b2cac commit 0963d39

File tree

3 files changed

+145
-30
lines changed

3 files changed

+145
-30
lines changed

changelog.d/9333.bugfix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix additional errors when previewing URLs: "AttributeError 'NoneType' object has no attribute 'xpath'" and "ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.".

synapse/rest/media/v1/preview_url_resource.py

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,10 @@
5858

5959
logger = logging.getLogger(__name__)
6060

61-
_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
61+
_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
62+
_xml_encoding_match = re.compile(
63+
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
64+
)
6265
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
6366

6467
OG_TAG_NAME_MAXLEN = 50
@@ -300,24 +303,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
300303
with open(media_info["filename"], "rb") as file:
301304
body = file.read()
302305

303-
encoding = None
304-
305-
# Let's try and figure out if it has an encoding set in a meta tag.
306-
# Limit it to the first 1kb, since it ought to be in the meta tags
307-
# at the top.
308-
match = _charset_match.search(body[:1000])
309-
310-
# If we find a match, it should take precedence over the
311-
# Content-Type header, so set it here.
312-
if match:
313-
encoding = match.group(1).decode("ascii")
314-
315-
# If we don't find a match, we'll look at the HTTP Content-Type, and
316-
# if that doesn't exist, we'll fall back to UTF-8.
317-
if not encoding:
318-
content_match = _content_type_match.match(media_info["media_type"])
319-
encoding = content_match.group(1) if content_match else "utf-8"
320-
306+
encoding = get_html_media_encoding(body, media_info["media_type"])
321307
og = decode_and_calc_og(body, media_info["uri"], encoding)
322308

323309
# pre-cache the image for posterity
@@ -689,6 +675,48 @@ async def _expire_url_cache_data(self) -> None:
689675
logger.debug("No media removed from url cache")
690676

691677

678+
def get_html_media_encoding(body: bytes, content_type: str) -> str:
679+
"""
680+
Get the encoding of the body based on the (presumably) HTML body or media_type.
681+
682+
The precedence used for finding a character encoding is:
683+
684+
1. meta tag with a charset declared.
685+
2. The XML document's character encoding attribute.
686+
3. The Content-Type header.
687+
4. Fallback to UTF-8.
688+
689+
Args:
690+
body: The HTML document, as bytes.
691+
content_type: The Content-Type header.
692+
693+
Returns:
694+
The character encoding of the body, as a string.
695+
"""
696+
# Limit searches to the first 1kb, since it ought to be at the top.
697+
body_start = body[:1024]
698+
699+
# Let's try and figure out if it has an encoding set in a meta tag.
700+
match = _charset_match.search(body_start)
701+
if match:
702+
return match.group(1).decode("ascii")
703+
704+
# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
705+
706+
# If we didn't find a match, see if it an XML document with an encoding.
707+
match = _xml_encoding_match.match(body_start)
708+
if match:
709+
return match.group(1).decode("ascii")
710+
711+
# If we don't find a match, we'll look at the HTTP Content-Type, and
712+
# if that doesn't exist, we'll fall back to UTF-8.
713+
content_match = _content_type_match.match(content_type)
714+
if content_match:
715+
return content_match.group(1)
716+
717+
return "utf-8"
718+
719+
692720
def decode_and_calc_og(
693721
body: bytes, media_uri: str, request_encoding: Optional[str] = None
694722
) -> Dict[str, Optional[str]]:
@@ -725,6 +753,11 @@ def decode_and_calc_og(
725753
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
726754
# Attempt to parse the body. If this fails, log and return no metadata.
727755
tree = etree.fromstring(body_attempt, parser)
756+
757+
# The data was successfully parsed, but no tree was found.
758+
if tree is None:
759+
return {}
760+
728761
return _calc_og(tree, media_uri)
729762

730763
# Attempt to parse the body. If this fails, log and return no metadata.

tests/test_preview.py

Lines changed: 92 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from synapse.rest.media.v1.preview_url_resource import (
1717
decode_and_calc_og,
18+
get_html_media_encoding,
1819
summarize_paragraphs,
1920
)
2021

@@ -26,7 +27,7 @@
2627
lxml = None
2728

2829

29-
class PreviewTestCase(unittest.TestCase):
30+
class SummarizeTestCase(unittest.TestCase):
3031
if not lxml:
3132
skip = "url preview feature requires lxml"
3233

@@ -144,12 +145,12 @@ def test_small_then_large_summarize(self):
144145
)
145146

146147

147-
class PreviewUrlTestCase(unittest.TestCase):
148+
class CalcOgTestCase(unittest.TestCase):
148149
if not lxml:
149150
skip = "url preview feature requires lxml"
150151

151152
def test_simple(self):
152-
html = """
153+
html = b"""
153154
<html>
154155
<head><title>Foo</title></head>
155156
<body>
@@ -163,7 +164,7 @@ def test_simple(self):
163164
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
164165

165166
def test_comment(self):
166-
html = """
167+
html = b"""
167168
<html>
168169
<head><title>Foo</title></head>
169170
<body>
@@ -178,7 +179,7 @@ def test_comment(self):
178179
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
179180

180181
def test_comment2(self):
181-
html = """
182+
html = b"""
182183
<html>
183184
<head><title>Foo</title></head>
184185
<body>
@@ -202,7 +203,7 @@ def test_comment2(self):
202203
)
203204

204205
def test_script(self):
205-
html = """
206+
html = b"""
206207
<html>
207208
<head><title>Foo</title></head>
208209
<body>
@@ -217,7 +218,7 @@ def test_script(self):
217218
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
218219

219220
def test_missing_title(self):
220-
html = """
221+
html = b"""
221222
<html>
222223
<body>
223224
Some text.
@@ -230,7 +231,7 @@ def test_missing_title(self):
230231
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
231232

232233
def test_h1_as_title(self):
233-
html = """
234+
html = b"""
234235
<html>
235236
<meta property="og:description" content="Some text."/>
236237
<body>
@@ -244,7 +245,7 @@ def test_h1_as_title(self):
244245
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
245246

246247
def test_missing_title_and_broken_h1(self):
247-
html = """
248+
html = b"""
248249
<html>
249250
<body>
250251
<h1><a href="foo"/></h1>
@@ -258,13 +259,20 @@ def test_missing_title_and_broken_h1(self):
258259
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
259260

260261
def test_empty(self):
261-
html = ""
262+
"""Test a body with no data in it."""
263+
html = b""
264+
og = decode_and_calc_og(html, "http://example.com/test.html")
265+
self.assertEqual(og, {})
266+
267+
def test_no_tree(self):
268+
"""A valid body with no tree in it."""
269+
html = b"\x00"
262270
og = decode_and_calc_og(html, "http://example.com/test.html")
263271
self.assertEqual(og, {})
264272

265273
def test_invalid_encoding(self):
266274
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
267-
html = """
275+
html = b"""
268276
<html>
269277
<head><title>Foo</title></head>
270278
<body>
@@ -290,3 +298,76 @@ def test_invalid_encoding2(self):
290298
"""
291299
og = decode_and_calc_og(html, "http://example.com/test.html")
292300
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
301+
302+
303+
class MediaEncodingTestCase(unittest.TestCase):
304+
def test_meta_charset(self):
305+
"""A character encoding is found via the meta tag."""
306+
encoding = get_html_media_encoding(
307+
b"""
308+
<html>
309+
<head><meta charset="ascii">
310+
</head>
311+
</html>
312+
""",
313+
"text/html",
314+
)
315+
self.assertEqual(encoding, "ascii")
316+
317+
# A less well-formed version.
318+
encoding = get_html_media_encoding(
319+
b"""
320+
<html>
321+
<head>< meta charset = ascii>
322+
</head>
323+
</html>
324+
""",
325+
"text/html",
326+
)
327+
self.assertEqual(encoding, "ascii")
328+
329+
def test_xml_encoding(self):
330+
"""A character encoding is found via the meta tag."""
331+
encoding = get_html_media_encoding(
332+
b"""
333+
<?xml version="1.0" encoding="ascii"?>
334+
<html>
335+
</html>
336+
""",
337+
"text/html",
338+
)
339+
self.assertEqual(encoding, "ascii")
340+
341+
def test_meta_xml_encoding(self):
342+
"""Meta tags take precedence over XML encoding."""
343+
encoding = get_html_media_encoding(
344+
b"""
345+
<?xml version="1.0" encoding="ascii"?>
346+
<html>
347+
<head><meta charset="UTF-16">
348+
</head>
349+
</html>
350+
""",
351+
"text/html",
352+
)
353+
self.assertEqual(encoding, "UTF-16")
354+
355+
def test_content_type(self):
356+
"""A character encoding is found via the Content-Type header."""
357+
# Test a few variations of the header.
358+
headers = (
359+
'text/html; charset="ascii";',
360+
"text/html;charset=ascii;",
361+
'text/html; charset="ascii"',
362+
"text/html; charset=ascii",
363+
'text/html; charset="ascii;',
364+
'text/html; charset=ascii";',
365+
)
366+
for header in headers:
367+
encoding = get_html_media_encoding(b"", header)
368+
self.assertEqual(encoding, "ascii")
369+
370+
def test_fallback(self):
371+
"""A character encoding cannot be found in the body or header."""
372+
encoding = get_html_media_encoding(b"", "text/html")
373+
self.assertEqual(encoding, "utf-8")

0 commit comments

Comments
 (0)