__author__ = 'Ran Geva'

import re
import json

from dateutil.parser import parse
import dateparser
from datetime import datetime
from webhose_metrics import count as metrics_count
import pytz
from logger import Logger

# Regex alternation of attribute-value fragments that usually mark a
# publication-date element (e.g. "pubdate", "article:published_time", "timestamp").
datetime_html_attributes_formats = "pub+|article+|date+|time+|tms+|mod+"

logger_handler = Logger(name="article_date_extractor_logger",
                        path="/var/log/webhose/articleDateExtractor_logs",
                        level="DEBUG").get_logger()

# try except for different urllib under python3 and python2
try:
    import urllib.request as urllib
except ImportError:
    import urllib2 as urllib

try:
    from bs4 import BeautifulSoup, Tag
except ImportError:
    from BeautifulSoup import BeautifulSoup, Tag


def parse_date_by_daetutil(dateString):
    """Parse dateString with dateutil; return a datetime, or None on failure.

    NOTE(review): "daetutil" is a typo for "dateutil"; the misspelled name is
    kept so any existing callers keep working. Prefer the alias below.
    """
    try:
        return parse(dateString)
    except Exception:
        # dateutil raises ValueError/OverflowError/TypeError on bad input;
        # any failure simply means "could not parse".
        return None


# Correctly spelled, backward-compatible alias for the helper above.
parse_date_by_dateutil = parse_date_by_daetutil


def parse_date_by_dateparser(dateString):
    """Parse dateString with the dateparser library; return None on failure."""
    try:
        return dateparser.parse(dateString)
    except Exception:
        return None


def parseStrDate(dateString):
    """Best-effort conversion of a date string to a datetime object.

    Tries dateutil first and falls back to dateparser. Returns None for
    None or unparseable input.
    """
    if dateString is None:
        return None
    dateString = dateString.strip()
    dateTimeObj = parse_date_by_daetutil(dateString)
    # BUG FIX: the previous condition was `dateTimeObj is None or ""` -- the
    # dangling "" is always falsy, so it was dead code; only the None check
    # was ever effective. Stated explicitly here.
    if dateTimeObj is None:
        dateTimeObj = parse_date_by_dateparser(dateString)
    return dateTimeObj


# Try to extract from the article URL - simple but might work as a fallback
def _extractFromURL(url):
    """Look for a date embedded in the URL path (e.g. /2019/05/21/)."""
    # Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
    m = re.search(
        r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?',
        url)
    if m:
        return parseStrDate(m.group(0))
    return None


def _extract_by_tag(tag, parsedHTML, attr):
    """Return the first parseable date found in <tag> elements whose `attr`
    value matches the date-like attribute patterns; None when nothing parses.
    """
    pattern = re.compile(datetime_html_attributes_formats, re.IGNORECASE)
    for tag_span in parsedHTML.find_all(tag, **{attr: pattern}):
        dateText = tag_span.string or tag_span.text
        parsed = parseStrDate(dateText)
        # ROBUSTNESS FIX: previously the first matching tag was returned even
        # when its text did not parse, discarding later usable candidates.
        if parsed is not None:
            return parsed
    return None


def _extractFromLDJson(parsedHTML):
    """Extract dateCreated/datePublished from an application/ld+json script.

    Returns a datetime or None; all failures are logged and swallowed because
    this is one of several best-effort extraction strategies.
    """
    try:
        script = parsedHTML.find('script', type='application/ld+json')
        if script is None:
            logger_handler.debug("ERROR: [_extractFromLDJson] - script none")
            return None
        # BUG FIX: `script_data` could be referenced while unbound when both
        # .text and .string were empty, and `len(script.string)` raised
        # TypeError when .string was None. Prefer .text, fall back to .string.
        raw = script.text or script.string
        if not raw:
            return None
        script_data = json.loads(raw)
        # ld+json may be a single object or a list of objects; normalise.
        if isinstance(script_data, dict):
            script_data = [script_data]
        for data in script_data:
            jsonDate = (parseStrDate(data.get('dateCreated', None)) or
                        parseStrDate(data.get('datePublished', None)))
            if jsonDate:
                return jsonDate
    except Exception as err:
        logger_handler.debug("ERROR: [_extractFromLDJson] - {err}".format(err=err))
    return None


def _extractFromMeta(parsedHTML):
    """Scan <meta> tags for a known publication-date attribute value.

    Returns a parsed datetime, or None when no tag yields a usable date.
    """
    # BUG FIX: metaDate must be initialised -- without it, pages that have no
    # matching <meta> tag raised NameError at the final check below.
    metaDate = None

    # <meta name="..."> values that carry the publication date.
    date_meta_names = (
        'pubdate', 'publishdate', 'timestamp', 'dc.date.issued', 'date',
        'sailthru.date', 'article.published', 'published-date',
        'article.created', 'article_date_original',
        'cxenseparse:recs:publishtime', 'date_published',
    )
    # <meta property="..."> values that carry the publication date.
    date_meta_properties = ('article:published_time', 'bt:pubdate')
    # <meta itemprop="..."> values that carry the publication date.
    date_item_props = ('datepublished', 'datecreated')

    for meta in parsedHTML.findAll("meta"):
        metaName = meta.get('name', '').lower()
        itemProp = meta.get('itemprop', '').lower()
        httpEquiv = meta.get('http-equiv', '').lower()
        metaProperty = meta.get('property', '').lower()

        if (metaName in date_meta_names
                or metaProperty in date_meta_properties
                or itemProp in date_item_props
                or httpEquiv == 'date'):
            metaDate = meta['content'].strip()
            break

        # Fall back to a date embedded in the page's main image URL.
        if 'og:image' == metaProperty or "image" == itemProp:
            url = meta['content'].strip()
            possibleDate = _extractFromURL(url)
            if possibleDate is not None:
                return possibleDate

        logger_handler.debug(
            "ERROR-INFO- [_extractFromMeta] - not found properties for meta: {metadata}".format(metadata=meta))

    if metaDate is not None:
        return parseStrDate(metaDate)
    logger_handler.debug("ERROR: [_extractFromMeta] - Failed to parse from meta properties")
    return None


def _extractFromHTMLTag(parsedHTML):
    # NOTE(review): this function's body lies beyond the visible portion of
    # this source chunk (the diff is truncated here); left as a stub rather
    # than guessed at. Restore the real implementation from the full file.
    pass