3131 TWITTER_ATTRS , METANAME_TAG , EXTRA_META , METANAME_TITLE
3232)
3333LOGGER = logging .getLogger (__name__ )
34- logging .getLogger ('htmldate' ).setLevel (logging .WARNING )
34+ # logging.getLogger('htmldate').setLevel(logging.WARNING)
3535
3636
3737def criteria_fulfilled (metadata ):
@@ -298,8 +298,12 @@ def extract_author(tree):
298298 matches = tree .re_xpath ("//*[re:match( text(), '{}' )]" .format (text_author_pattern ))
299299 if len (matches ) > 0 :
300300 match_text = matches [0 ].text
301- author = re .search (text_author_pattern , match_text ).group (0 )
302- break
301+ try :
302+ author = re .search (text_author_pattern , match_text ).group (0 )
303+ except TypeError :
304+ continue
305+ else :
306+ break
303307
304308 return author
305309
@@ -311,7 +315,9 @@ def extract_url(tree, default_url=None):
311315 url = default_url
312316 # try canonical link first
313317 element = tree .find ('.//head//link[@rel="canonical"]' )
314- if element is not None and URL_COMP_CHECK .match (element .attrib ['href' ]):
318+ if element is not None and \
319+ 'href' in element .attrib and \
320+ URL_COMP_CHECK .match (element .attrib ['href' ]):
315321 url = element .attrib ['href' ]
316322 # try default language link
317323 else :
@@ -444,7 +450,10 @@ def extract_metadata(filecontent, default_url=None, date_config=None, fastmode=F
444450 date_config ['url' ] = metadata ['url' ]
445451 metadata ['date' ] = find_date (tree , ** date_config )
446452
447- if metadata ['sitename' ] is not None :
453+ if isinstance (metadata ['sitename' ], list ):
454+ metadata ['sitename' ] = metadata ['sitename' ][0 ]
455+
456+ if isinstance (metadata ['sitename' ], str ):
448457 if metadata ['sitename' ].startswith ('@' ):
449458 # scrap Twitter ID
450459 metadata ['sitename' ] = re .sub (r'^@' , '' , metadata ['sitename' ])
0 commit comments