Webhose · yanivIsrael · Oct 3, 2021 · Apr 24, 2023 · Apr 27, 2023
diff --git a/articleDateExtractor/__init__.py b/articleDateExtractor/__init__.py
@@ -2,6 +2,8 @@
 
 import re,json
 from dateutil.parser import parse
+from datetime import datetime
+import pytz
 
 #try except for different urllib under python3 and python2
 try:
@@ -220,10 +222,7 @@ def extractArticlePublishedDate(articleLink, html = None):
         articleDate = _extractFromURL(articleLink)
 
         if html is None:
-            request = urllib.Request(articleLink)
-            # Using a browser user agent, decreases the change of sites blocking this request - just a suggestion
-            # request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')
-            html = urllib.build_opener().open(request).read()
+            html = _get_html_response(articleLink)
 
         parsedHTML = BeautifulSoup(html,"lxml")
 
@@ -243,10 +242,52 @@ def extractArticlePublishedDate(articleLink, html = None):
     return articleDate
 
 
+def _get_html_response(url):
+    """Execute a simple request and return the body, closing the response.
+    :param url: string of url
+    :return: html response, exactly as returned by the response's read()
+    """
+    response = urllib.build_opener().open(urllib.Request(url))
+    try:
+        return response.read()
+    finally:
+        response.close()
+
+
+
+def get_relevant_date(url, html=None):
+    """
+    Retrieve the most relevant (oldest) published date for an article.
+
+    :param url: string of url
+    :param html: string of html response; fetched from url only when None
+    :return: oldest datetime (normalised to UTC) from the following options,
+        or None when none of them yields a datetime:
+        1) date in the url
+        2) headers of the response (json-ld, meta, etc.)
+        3) html known tags
+    """
+    if html is None:
+        html = _get_html_response(url)
+    parsed_html = BeautifulSoup(html, "lxml")
+
+    candidates = [
+        _extractFromURL(url),
+        _extractFromLDJson(parsed_html),
+        _extractFromMeta(parsed_html),
+        _extractFromHTMLTag(parsed_html),
+    ]
+    # Naive dates are assumed to be UTC; aware dates are converted so that
+    # their offset is honoured instead of being overwritten.
+    utc_dates = [
+        d.astimezone(pytz.UTC) if d.utcoffset() is not None else d.replace(tzinfo=pytz.UTC)
+        for d in candidates if isinstance(d, datetime)
+    ]
+    return min(utc_dates) if utc_dates else None
 
 
 if __name__ == '__main__':
     d = extractArticlePublishedDate("http://techcrunch.com/2015/11/30/atlassian-share-price/")
     print(d)
 
-
+
diff --git a/setup.py b/setup.py
@@ -9,9 +9,9 @@
     readme = f.read()
 
 setup(
-    name='articleDateExtractor',
+    name='webz-article-date-extractor',
     packages=['articleDateExtractor'],
-    version='0.20',
+    version='0.21.0',
     author='Ran Geva',
     author_email='[email protected], [email protected], [email protected]',
     url='https://github.com/Webhose/article-date-extractor',