Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 8e1febc

Browse files
authored
Support underscores (in addition to hyphens) for charset detection. (#10410)
1 parent 5b22d5e commit 8e1febc

File tree

3 files changed

+18
-2
lines changed

3 files changed

+18
-2
lines changed

changelog.d/10410.bugfix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.

synapse/rest/media/v1/preview_url_resource.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,11 @@
5858

5959
logger = logging.getLogger(__name__)
6060

61-
_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
61+
_charset_match = re.compile(
62+
br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
63+
)
6264
_xml_encoding_match = re.compile(
63-
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
65+
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
6466
)
6567
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
6668

tests/test_preview.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,19 @@ def test_meta_charset(self):
325325
)
326326
self.assertEqual(encoding, "ascii")
327327

328+
def test_meta_charset_underscores(self):
329+
"""A character encoding containing an underscore is detected."""
330+
encoding = get_html_media_encoding(
331+
b"""
332+
<html>
333+
<head><meta charset="Shift_JIS">
334+
</head>
335+
</html>
336+
""",
337+
"text/html",
338+
)
339+
self.assertEqual(encoding, "Shift_JIS")
340+
328341
def test_xml_encoding(self):
329342
"""A character encoding is found via the meta tag."""
330343
encoding = get_html_media_encoding(

0 commit comments

Comments
 (0)