Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 0fcc0ae

Browse files
authored
Improve URL previews for sites with only Twitter card information. (#13056)
Pull out `twitter:` meta tags when generating a preview and use them to augment any `og:` meta tags. Open Graph information is preferred over Twitter card information.
1 parent 7552615 commit 0fcc0ae

File tree

3 files changed

+137
-17
lines changed

3 files changed

+137
-17
lines changed

changelog.d/13056.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve URL previews for sites which only provide Twitter Card metadata, e.g. LWN.net.

synapse/rest/media/v1/preview_html.py

Lines changed: 95 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,16 @@
1515
import itertools
1616
import logging
1717
import re
18-
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
18+
from typing import (
19+
TYPE_CHECKING,
20+
Callable,
21+
Dict,
22+
Generator,
23+
Iterable,
24+
Optional,
25+
Set,
26+
Union,
27+
)
1928

2029
if TYPE_CHECKING:
2130
from lxml import etree
@@ -146,6 +155,70 @@ def decode_body(
146155
return etree.fromstring(body, parser)
147156

148157

158+
def _get_meta_tags(
    tree: "etree.Element",
    property: str,
    prefix: str,
    property_mapper: Optional[Callable[[str], Optional[str]]] = None,
) -> Dict[str, Optional[str]]:
    """
    Collect the ``<meta>`` tags whose name starts with a particular prefix.

    Only tags that carry a non-empty ``content`` attribute are considered. If
    the same (mapped) name occurs more than once, the last occurrence wins.

    Args:
        tree: The parsed HTML document.
        property: The attribute of the meta tag which holds the tag name, e.g.
            "property" for Open Graph.
        prefix: The prefix of the tag name to search for, without the trailing
            colon, e.g. "og" for Open Graph.
        property_mapper: An optional callable which converts a tag name to its
            Open Graph form. It may return None to have that tag ignored.

    Returns:
        A map of tag name to content. An empty map is returned if the page
        has too many matching tags.
    """
    # Match meta tags with a prefixed name and a present, non-empty content.
    query = (
        f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
    )

    found: Dict[str, Optional[str]] = {}
    for meta in tree.xpath(query):
        # Once 50 distinct names have been collected and there are still more
        # tags to process, give up and discard everything gathered so far.
        if len(found) >= 50:
            logger.warning(
                "Skipping parsing of Open Graph for page with too many '%s:' tags",
                prefix,
            )
            return {}

        name = meta.attrib[property]
        if property_mapper:
            name = property_mapper(name)
            # The mapper signals "ignore this tag" by returning None.
            if name is None:
                continue

        found[name] = meta.attrib["content"]

    return found
200+
201+
202+
def _map_twitter_to_open_graph(key: str) -> Optional[str]:
203+
"""
204+
Map a Twitter card property to the analogous Open Graph property.
205+
206+
Args:
207+
key: The Twitter card property (starts with "twitter:").
208+
209+
Returns:
210+
The Open Graph property (starts with "og:") or None to have this property
211+
be ignored.
212+
"""
213+
# Twitter card properties with no analogous Open Graph property.
214+
if key == "twitter:card" or key == "twitter:creator":
215+
return None
216+
if key == "twitter:site":
217+
return "og:site_name"
218+
# Otherwise, swap twitter to og.
219+
return "og" + key[7:]
220+
221+
149222
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
150223
"""
151224
Parse the HTML document into an Open Graph response.
@@ -160,10 +233,8 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
160233
The Open Graph response as a dictionary.
161234
"""
162235

163-
# if we see any image URLs in the OG response, then spider them
164-
# (although the client could choose to do this by asking for previews of those
165-
# URLs to avoid DoSing the server)
166-
236+
# Search for Open Graph (og:) meta tags, e.g.:
237+
#
167238
# "og:type" : "video",
168239
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
169240
# "og:site_name" : "YouTube",
@@ -176,26 +247,33 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
176247
# "og:video:height" : "720",
177248
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
178249

179-
og: Dict[str, Optional[str]] = {}
180-
for tag in tree.xpath(
181-
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
182-
):
183-
# if we've got more than 50 tags, someone is taking the piss
184-
if len(og) >= 50:
185-
logger.warning("Skipping OG for page with too many 'og:' tags")
186-
return {}
187-
188-
og[tag.attrib["property"]] = tag.attrib["content"]
189-
190-
# TODO: grab article: meta tags too, e.g.:
250+
og = _get_meta_tags(tree, "property", "og")
191251

252+
# TODO: Search for properties specific to the different Open Graph types,
253+
# such as article: meta tags, e.g.:
254+
#
192255
# "article:publisher" : "https://www.facebook.com/thethudonline" />
193256
# "article:author" content="https://www.facebook.com/thethudonline" />
194257
# "article:tag" content="baby" />
195258
# "article:section" content="Breaking News" />
196259
# "article:published_time" content="2016-03-31T19:58:24+00:00" />
197260
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
198261

262+
# Search for Twitter Card (twitter:) meta tags, e.g.:
263+
#
264+
# "twitter:site" : "@matrixdotorg"
265+
# "twitter:creator" : "@matrixdotorg"
266+
#
267+
# Twitter cards tags also duplicate Open Graph tags.
268+
#
269+
# See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
270+
twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph)
271+
# Merge the Twitter values with the Open Graph values, but do not overwrite
272+
# information from Open Graph tags.
273+
for key, value in twitter.items():
274+
if key not in og:
275+
og[key] = value
276+
199277
if "og:title" not in og:
200278
# Attempt to find a title from the title tag, or the biggest header on the page.
201279
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")

tests/rest/media/v1/test_html_preview.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,47 @@ def test_windows_1252(self) -> None:
370370
og = parse_html_to_open_graph(tree)
371371
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
372372

373+
def test_twitter_tag(self) -> None:
    """Twitter card tags should be used if nothing else is available."""
    # Each scenario pairs an HTML document with the Open Graph response it
    # is expected to produce.
    scenarios = [
        # Only Twitter card tags are present, so they are used.
        (
            b"""
    <html>
    <meta name="twitter:card" content="summary">
    <meta name="twitter:description" content="Description">
    <meta name="twitter:site" content="@matrixdotorg">
    </html>
    """,
            {
                "og:title": None,
                "og:description": "Description",
                "og:site_name": "@matrixdotorg",
            },
        ),
        # But they shouldn't override Open Graph values.
        (
            b"""
    <html>
    <meta name="twitter:card" content="summary">
    <meta name="twitter:description" content="Description">
    <meta property="og:description" content="Real Description">
    <meta name="twitter:site" content="@matrixdotorg">
    <meta property="og:site_name" content="matrix.org">
    </html>
    """,
            {
                "og:title": None,
                "og:description": "Real Description",
                "og:site_name": "matrix.org",
            },
        ),
    ]

    for html, expected in scenarios:
        tree = decode_body(html, "http://example.com/test.html")
        og = parse_html_to_open_graph(tree)
        self.assertEqual(og, expected)
413+
373414

374415
class MediaEncodingTestCase(unittest.TestCase):
375416
def test_meta_charset(self) -> None:

0 commit comments

Comments
 (0)