1515import itertools
1616import logging
1717import re
18- from typing import TYPE_CHECKING , Dict , Generator , Iterable , Optional , Set , Union
18+ from typing import (
19+ TYPE_CHECKING ,
20+ Callable ,
21+ Dict ,
22+ Generator ,
23+ Iterable ,
24+ Optional ,
25+ Set ,
26+ Union ,
27+ )
1928
2029if TYPE_CHECKING :
2130 from lxml import etree
@@ -146,6 +155,70 @@ def decode_body(
146155 return etree .fromstring (body , parser )
147156
148157
def _get_meta_tags(
    tree: "etree.Element",
    property: str,
    prefix: str,
    property_mapper: Optional[Callable[[str], Optional[str]]] = None,
) -> Dict[str, Optional[str]]:
    """
    Collect the non-empty meta tags whose name starts with a given prefix.

    Args:
        tree: The parsed HTML document.
        property: The attribute of the meta tag which holds the tag name, e.g.
            "property" for Open Graph.
        prefix: The prefix (without the trailing colon) which the tag name must
            start with, e.g. "og" for Open Graph.
        property_mapper: An optional callable which is handed each tag name and
            returns the key to store the value under. Returning None causes
            that tag to be skipped.

    Returns:
        A map of (possibly mapped) tag name to content. If a tag with the same
        key appears more than once, the last one wins. An empty map is returned
        when a further matching tag is found after 50 keys have been collected.
    """
    # Only meta tags carrying a non-empty content attribute are of interest.
    query = (
        f"//*/meta[starts-with(@{property}, '{prefix}:')]"
        "[@content][not(@content='')]"
    )

    found: Dict[str, Optional[str]] = {}
    for meta in tree.xpath(query):
        # A page with this many tags is not worth trusting: discard everything.
        if len(found) >= 50:
            logger.warning(
                "Skipping parsing of Open Graph for page with too many '%s:' tags",
                prefix,
            )
            return {}

        name = meta.attrib[property]
        if property_mapper:
            mapped = property_mapper(name)
            # The mapper signals "ignore this tag" by returning None.
            if mapped is None:
                continue
            name = mapped

        found[name] = meta.attrib["content"]

    return found
200+
201+
def _map_twitter_to_open_graph(key: str) -> Optional[str]:
    """
    Translate a Twitter card property name into its Open Graph equivalent.

    Args:
        key: The Twitter card property name (expected to start with "twitter:").

    Returns:
        The corresponding Open Graph property name (starting with "og:"), or
        None if the property has no Open Graph counterpart and should be
        ignored.
    """
    # These Twitter card properties have nothing comparable in Open Graph.
    if key in ("twitter:card", "twitter:creator"):
        return None

    # The site is the one property whose Open Graph name differs in its suffix.
    if key == "twitter:site":
        return "og:site_name"

    # For everything else, replace the leading "twitter" with "og" and keep
    # the remainder (including the colon) untouched.
    return "og" + key[len("twitter"):]
220+
221+
149222def parse_html_to_open_graph (tree : "etree.Element" ) -> Dict [str , Optional [str ]]:
150223 """
151224 Parse the HTML document into an Open Graph response.
@@ -160,10 +233,8 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
160233 The Open Graph response as a dictionary.
161234 """
162235
163- # if we see any image URLs in the OG response, then spider them
164- # (although the client could choose to do this by asking for previews of those
165- # URLs to avoid DoSing the server)
166-
236+ # Search for Open Graph (og:) meta tags, e.g.:
237+ #
167238 # "og:type" : "video",
168239 # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
169240 # "og:site_name" : "YouTube",
@@ -176,26 +247,33 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
176247 # "og:video:height" : "720",
177248 # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
178249
179- og : Dict [str , Optional [str ]] = {}
180- for tag in tree .xpath (
181- "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
182- ):
183- # if we've got more than 50 tags, someone is taking the piss
184- if len (og ) >= 50 :
185- logger .warning ("Skipping OG for page with too many 'og:' tags" )
186- return {}
187-
188- og [tag .attrib ["property" ]] = tag .attrib ["content" ]
189-
190- # TODO: grab article: meta tags too, e.g.:
250+ og = _get_meta_tags (tree , "property" , "og" )
191251
252+ # TODO: Search for properties specific to the different Open Graph types,
253+ # such as article: meta tags, e.g.:
254+ #
192255 # "article:publisher" : "https://www.facebook.com/thethudonline" />
193256 # "article:author" content="https://www.facebook.com/thethudonline" />
194257 # "article:tag" content="baby" />
195258 # "article:section" content="Breaking News" />
196259 # "article:published_time" content="2016-03-31T19:58:24+00:00" />
197260 # "article:modified_time" content="2016-04-01T18:31:53+00:00" />
198261
262+ # Search for Twitter Card (twitter:) meta tags, e.g.:
263+ #
264+ # "twitter:site" : "@matrixdotorg"
265+ # "twitter:creator" : "@matrixdotorg"
266+ #
267+ # Twitter cards tags also duplicate Open Graph tags.
268+ #
269+ # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
270+ twitter = _get_meta_tags (tree , "name" , "twitter" , _map_twitter_to_open_graph )
271+ # Merge the Twitter values with the Open Graph values, but do not overwrite
272+ # information from Open Graph tags.
273+ for key , value in twitter .items ():
274+ if key not in og :
275+ og [key ] = value
276+
199277 if "og:title" not in og :
200278 # Attempt to find a title from the title tag, or the biggest header on the page.
201279 title = tree .xpath ("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()" )
0 commit comments