3030)
3131_content_type_match = re .compile (r'.*; *charset="?(.*?)"?(;|$)' , flags = re .I )
3232
33+ # Certain elements aren't meant for display.
34+ ARIA_ROLES_TO_IGNORE = {"directory" , "menu" , "menubar" , "toolbar" }
35+
3336
3437def _normalise_encoding (encoding : str ) -> Optional [str ]:
3538 """Use the Python codec's name as the normalised entry."""
@@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
174177 # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
175178
176179 og : Dict [str , Optional [str ]] = {}
177- for tag in tree .xpath ("//*/meta[starts-with(@property, 'og:')]" ):
178- if "content" in tag .attrib :
179- # if we've got more than 50 tags, someone is taking the piss
180- if len (og ) >= 50 :
181- logger .warning ("Skipping OG for page with too many 'og:' tags" )
182- return {}
183- og [tag .attrib ["property" ]] = tag .attrib ["content" ]
180+ for tag in tree .xpath (
181+ "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
182+ ):
183+ # if we've got more than 50 tags, someone is taking the piss
184+ if len (og ) >= 50 :
185+ logger .warning ("Skipping OG for page with too many 'og:' tags" )
186+ return {}
187+
188+ og [tag .attrib ["property" ]] = tag .attrib ["content" ]
184189
185190 # TODO: grab article: meta tags too, e.g.:
186191
@@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
192197 # "article:modified_time" content="2016-04-01T18:31:53+00:00" />
193198
194199 if "og:title" not in og :
195- # do some basic spidering of the HTML
196- title = tree .xpath ("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]" )
197- if title and title [ 0 ]. text is not None :
198- og ["og:title" ] = title [0 ].text . strip ()
200+ # Attempt to find a title from the title tag, or the biggest header on the page.
201+ title = tree .xpath ("(( //title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text() " )
202+ if title :
203+ og ["og:title" ] = title [0 ].strip ()
199204 else :
200205 og ["og:title" ] = None
201206
202207 if "og:image" not in og :
203- # TODO: extract a favicon failing all else
204208 meta_image = tree .xpath (
205- "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
209+ "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')] /@content[1] "
206210 )
211+ # If a meta image is found, use it.
207212 if meta_image :
208213 og ["og:image" ] = meta_image [0 ]
209214 else :
215+ # Try to find images which are larger than 10px by 10px.
216+ #
210217 # TODO: consider inlined CSS styles as well as width & height attribs
211218 images = tree .xpath ("//img[@src][number(@width)>10][number(@height)>10]" )
212219 images = sorted (
@@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
215222 - 1 * float (i .attrib ["width" ]) * float (i .attrib ["height" ])
216223 ),
217224 )
225+ # If no images were found, try to find *any* images.
218226 if not images :
219- images = tree .xpath ("//img[@src]" )
227+ images = tree .xpath ("//img[@src][1] " )
220228 if images :
221229 og ["og:image" ] = images [0 ].attrib ["src" ]
222230
231+ # Finally, fallback to the favicon if nothing else.
232+ else :
233+ favicons = tree .xpath ("//link[@href][contains(@rel, 'icon')]/@href[1]" )
234+ if favicons :
235+ og ["og:image" ] = favicons [0 ]
236+
223237 if "og:description" not in og :
238+ # Check the first meta description tag for content.
224239 meta_description = tree .xpath (
225- "//*/meta"
226- "[translate(@name, 'DESCRIPTION', 'description')='description']"
227- "/@content"
240+ "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
228241 )
242+ # If a meta description is found with content, use it.
229243 if meta_description :
230244 og ["og:description" ] = meta_description [0 ]
231245 else :
@@ -306,6 +320,10 @@ def _iterate_over_text(
306320 if isinstance (el , str ):
307321 yield el
308322 elif el .tag not in tags_to_ignore :
323+ # If the element isn't meant for display, ignore it.
324+ if el .get ("role" ) in ARIA_ROLES_TO_IGNORE :
325+ continue
326+
309327 # el.text is the text before the first child, so we can immediately
310328 # return it if the text exists.
311329 if el .text :
0 commit comments