Skip to content
This repository was archived by the owner on Mar 26, 2024. It is now read-only.

Commit babeeb4

Browse files
authored
Unescape HTML entities in oEmbed titles. (matrix-org#14781)
It doesn't seem valid that HTML entities should appear in the title field of oEmbed responses, but a popular WordPress plug-in seems to do it. There should not be harm in unescaping these.
1 parent 7e582a2 commit babeeb4

File tree

3 files changed

+20
-6
lines changed

3 files changed

+20
-6
lines changed

changelog.d/14781.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Unescape HTML entities in URL preview titles making use of oEmbed responses.

synapse/rest/media/v1/oembed.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import html
1415
import logging
1516
import urllib.parse
1617
from typing import TYPE_CHECKING, List, Optional
@@ -161,7 +162,9 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
161162

162163
title = oembed.get("title")
163164
if title and isinstance(title, str):
164-
open_graph_response["og:title"] = title
165+
# A common WordPress plug-in seems to incorrectly escape entities
166+
# in the oEmbed response.
167+
open_graph_response["og:title"] = html.unescape(title)
165168

166169
author_name = oembed.get("author_name")
167170
if not isinstance(author_name, str):
@@ -180,9 +183,9 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
180183
# Process each type separately.
181184
oembed_type = oembed.get("type")
182185
if oembed_type == "rich":
183-
html = oembed.get("html")
184-
if isinstance(html, str):
185-
calc_description_and_urls(open_graph_response, html)
186+
html_str = oembed.get("html")
187+
if isinstance(html_str, str):
188+
calc_description_and_urls(open_graph_response, html_str)
186189

187190
elif oembed_type == "photo":
188191
# If this is a photo, use the full image, not the thumbnail.
@@ -192,8 +195,8 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
192195

193196
elif oembed_type == "video":
194197
open_graph_response["og:type"] = "video.other"
195-
html = oembed.get("html")
196-
if html and isinstance(html, str):
198+
html_str = oembed.get("html")
199+
if html_str and isinstance(html_str, str):
197200
calc_description_and_urls(open_graph_response, oembed["html"])
198201
for size in ("width", "height"):
199202
val = oembed.get(size)

tests/rest/media/v1/test_oembed.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,13 @@ def test_link(self) -> None:
150150
result = self.parse_response({"type": "link"})
151151
self.assertIn("og:type", result.open_graph_result)
152152
self.assertEqual(result.open_graph_result["og:type"], "website")
153+
154+
def test_title_html_entities(self) -> None:
155+
"""Test HTML entities in title"""
156+
result = self.parse_response(
157+
{"title": "Why JSON isn&#8217;t a Good Configuration Language"}
158+
)
159+
self.assertEqual(
160+
result.open_graph_result["og:title"],
161+
"Why JSON isn’t a Good Configuration Language",
162+
)

0 commit comments

Comments
 (0)