diff --git a/articleDateExtractor/__init__.py b/articleDateExtractor/__init__.py
index 201133a..49474c4 100644
--- a/articleDateExtractor/__init__.py
+++ b/articleDateExtractor/__init__.py
@@ -1,69 +1,97 @@
__author__ = 'Ran Geva'
-import re,json
+import re, json
from dateutil.parser import parse
+import dateparser
+from datetime import datetime
+from webhose_metrics import count as metrics_count
+import pytz
+from logger import Logger
-#try except for different urllib under python3 and python2
+datetime_html_attributes_formats = "pub|article|date|time|tms|mod"
+
+logger_handler = Logger(name="article_date_extractor_logger", path="/var/log/webhose/articleDateExtractor_logs",
+ level="DEBUG").get_logger()
+
+# try except for different urllib under python3 and python2
try:
import urllib.request as urllib
except ImportError:
import urllib2 as urllib
-
try:
- from bs4 import BeautifulSoup
+ from bs4 import BeautifulSoup, Tag
except ImportError:
- from BeautifulSoup import BeautifulSoup
-
+ from BeautifulSoup import BeautifulSoup, Tag
-def parseStrDate(dateString):
+def parse_date_by_dateutil(dateString):
     try:
         dateTimeObj = parse(dateString)
         return dateTimeObj
-    except:
+    except Exception as err:
         return None
+
+def parse_date_by_dateparser(dateString):
+    try:
+        dateTimeObj = dateparser.parse(dateString)
+        return dateTimeObj
+    except Exception as err:
+        return None
+
+
+def parseStrDate(dateString):
+    dateTimeObj = None
+    if dateString is not None:
+        dateString = dateString.rstrip().lstrip()
+        dateTimeObj = parse_date_by_dateutil(dateString)
+    if dateTimeObj is None:
+        dateTimeObj = parse_date_by_dateparser(dateString)
+    return dateTimeObj
+
+
# Try to extract from the article URL - simple but might work as a fallback
def _extractFromURL(url):
-
- #Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
- m = re.search(r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?', url)
+ # Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
+ m = re.search(
+ r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?',
+ url)
if m:
return parseStrDate(m.group(0))
- return None
+ return None
+
+
+def _extract_by_tag(tag, parsedHTML, attr):
+ for tag_span in parsedHTML.find_all(tag, **{attr: re.compile(datetime_html_attributes_formats, re.IGNORECASE)}):
+ dateText = tag_span.string or tag_span.text
+ return parseStrDate(dateText)
+
def _extractFromLDJson(parsedHTML):
- jsonDate = None
try:
script = parsedHTML.find('script', type='application/ld+json')
if script is None:
+ logger_handler.debug("ERROR: [_extractFromLDJson] - script none")
return None
+        payload = script.text or script.string or ""
+        if not payload:
+            return None
+        script_data = json.loads(payload)
+ if isinstance(script_data, dict):
+ script_data = [script_data]
+ for data in script_data:
+ jsonDate = parseStrDate(data.get('dateCreated', None)) or parseStrDate(data.get('datePublished', None))
+ if jsonDate:
+ return jsonDate
+ except Exception as err:
+ logger_handler.debug("ERROR: [_extractFromLDJson] - {err}".format(err=err))
- data = json.loads(script.text)
-
- try:
- jsonDate = parseStrDate(data['datePublished'])
- except Exception as e:
- pass
-
- try:
- jsonDate = parseStrDate(data['dateCreated'])
- except Exception as e:
- pass
-
-
- except Exception as e:
- return None
-
-
-
- return jsonDate
+ return None
def _extractFromMeta(parsedHTML):
-
metaDate = None
for meta in parsedHTML.findAll("meta"):
metaName = meta.get('name', '').lower()
@@ -71,120 +99,119 @@ def _extractFromMeta(parsedHTML):
httpEquiv = meta.get('http-equiv', '').lower()
metaProperty = meta.get('property', '').lower()
-
- #
+ #
if 'pubdate' == metaName:
metaDate = meta['content'].strip()
break
-
- #
+ #
if 'publishdate' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'timestamp' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'dc.date.issued' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'article:published_time' == metaProperty:
metaDate = meta['content'].strip()
break
- #
+ #
if 'date' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'bt:pubdate' == metaProperty:
metaDate = meta['content'].strip()
break
- #
+ #
if 'sailthru.date' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'article.published' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'published-date' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'article.created' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'article_date_original' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'cxenseparse:recs:publishtime' == metaName:
metaDate = meta['content'].strip()
break
- #
+ #
if 'date_published' == metaName:
metaDate = meta['content'].strip()
break
-
- #
+ #
if 'datepublished' == itemProp:
metaDate = meta['content'].strip()
break
-
- #
+ #
if 'datecreated' == itemProp:
metaDate = meta['content'].strip()
break
-
-
-
- #
+ #
if 'og:image' == metaProperty or "image" == itemProp:
url = meta['content'].strip()
possibleDate = _extractFromURL(url)
if possibleDate is not None:
- return possibleDate
-
+ return possibleDate
- #
+ #
if 'date' == httpEquiv:
metaDate = meta['content'].strip()
break
+ logger_handler.debug(
+ "ERROR-INFO- [_extractFromMeta] - not found properties for meta: {metadata}".format(metadata=meta))
+
if metaDate is not None:
return parseStrDate(metaDate)
+ logger_handler.debug("ERROR: [_extractFromMeta] - Failed to parse from meta properties")
return None
+
def _extractFromHTMLTag(parsedHTML):
- #