Skip to content

Commit 97b5d2b

Browse files
author
theblackcat102
committed
bug fixes
1 parent 3fb1a1d commit 97b5d2b

File tree

5 files changed

+31
-10
lines changed

5 files changed

+31
-10
lines changed

extractnet/metadata_extraction/json_ld.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,11 +36,13 @@ def extract_json(schema, metadata):
3636

3737
if '@type' not in content:
3838
continue
39-
if isinstance(content["@type"], list):
39+
if isinstance(content["@type"], list) and len(content["@type"]):
4040
# some websites are using ['Person'] as type
4141
content_type = content["@type"][0].lower()
42-
else:
42+
elif isinstance(content['@type'], str):
4343
content_type = content["@type"].lower()
44+
else:
45+
continue
4446

4547
if content_type in JSON_PUBLISHER_SCHEMA:
4648
for candidate in ("name", "alternateName"):

extractnet/metadata_extraction/metadata.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@
3131
TWITTER_ATTRS, METANAME_TAG, EXTRA_META, METANAME_TITLE
3232
)
3333
LOGGER = logging.getLogger(__name__)
34-
logging.getLogger('htmldate').setLevel(logging.WARNING)
34+
# logging.getLogger('htmldate').setLevel(logging.WARNING)
3535

3636

3737
def criteria_fulfilled(metadata):
@@ -298,8 +298,12 @@ def extract_author(tree):
298298
matches = tree.re_xpath("//*[re:match( text(), '{}' )]".format(text_author_pattern))
299299
if len(matches) > 0:
300300
match_text = matches[0].text
301-
author = re.search(text_author_pattern, match_text).group(0)
302-
break
301+
try:
302+
author = re.search(text_author_pattern, match_text).group(0)
303+
except TypeError:
304+
continue
305+
else:
306+
break
303307

304308
return author
305309

@@ -311,7 +315,9 @@ def extract_url(tree, default_url=None):
311315
url = default_url
312316
# try canonical link first
313317
element = tree.find('.//head//link[@rel="canonical"]')
314-
if element is not None and URL_COMP_CHECK.match(element.attrib['href']):
318+
if element is not None and \
319+
'href' in element.attrib and \
320+
URL_COMP_CHECK.match(element.attrib['href']):
315321
url = element.attrib['href']
316322
# try default language link
317323
else:
@@ -444,7 +450,10 @@ def extract_metadata(filecontent, default_url=None, date_config=None, fastmode=F
444450
date_config['url'] = metadata['url']
445451
metadata['date'] = find_date(tree, **date_config)
446452

447-
if metadata['sitename'] is not None:
453+
if isinstance(metadata['sitename'], list):
454+
metadata['sitename'] = metadata['sitename'][0]
455+
456+
if isinstance(metadata['sitename'], str):
448457
if metadata['sitename'].startswith('@'):
449458
# scrap Twitter ID
450459
metadata['sitename'] = re.sub(r'^@', '', metadata['sitename'])

extractnet/metadata_extraction/url_utils.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from multiprocessing.sharedctypes import Value
12
import re
23
from urllib.parse import ParseResult, parse_qs, urlencode, urlparse
34
from tld import get_tld
@@ -57,7 +58,10 @@ def date_updater(url_date_token, date):
5758

5859
month = url_date_token[1]
5960
if month > 0 and month < 13 and date.month != month:
60-
date = date.replace(month = month)
61+
try:
62+
date = date.replace(month=month)
63+
except ValueError: # when month=2
64+
pass
6165

6266
day = url_date_token[2]
6367
if day > 0 and day < 32 and day != date.day:

extractnet/metadata_extraction/video.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,10 @@ def get_advance_fields(raw_html):
7777
if speechkit.find('iframe'):
7878
if audio_urls == None:
7979
audio_urls = []
80-
audio_urls.append(speechkit_audio(speechkit.find('iframe').get('src')))
80+
try:
81+
audio_urls.append(speechkit_audio(speechkit.find('iframe').get('src')))
82+
except ValueError:
83+
pass
8184

8285
'''
8386
Video extraction

extractnet/util.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,10 @@ def fix_encoding(text):
177177
if isinstance(text, str):
178178
text = ftfy.fix_text(ftfy.fix_encoding(text))
179179
if '\\u' in text:
180-
text = text.encode().decode('unicode_escape')
180+
try:
181+
text = text.encode().decode('unicode_escape')
182+
except UnicodeDecodeError as e:
183+
return text
181184
return text
182185
elif isinstance(text, list):
183186
return [ ftfy.fix_text(ftfy.fix_encoding(t)) for t in text ]

0 commit comments

Comments
 (0)