From 371e39304151be8e35af6e4da541a803b96c6b4c Mon Sep 17 00:00:00 2001 From: Yaniv Israel Date: Sun, 3 Oct 2021 14:20:00 +0300 Subject: [PATCH 1/2] adding get_relevant_date method --- articleDateExtractor/__init__.py | 51 ++++++++++++++++++++++++++++---- setup.py | 2 +- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/articleDateExtractor/__init__.py b/articleDateExtractor/__init__.py index 201133a..a034fe1 100644 --- a/articleDateExtractor/__init__.py +++ b/articleDateExtractor/__init__.py @@ -2,6 +2,8 @@ import re,json from dateutil.parser import parse +from datetime import datetime +import pytz #try except for different urllib under python3 and python2 try: @@ -220,10 +222,7 @@ def extractArticlePublishedDate(articleLink, html = None): articleDate = _extractFromURL(articleLink) if html is None: - request = urllib.Request(articleLink) - # Using a browser user agent, decreases the change of sites blocking this request - just a suggestion - # request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36') - html = urllib.build_opener().open(request).read() + html = _get_html_response(articleLink) parsedHTML = BeautifulSoup(html,"lxml") @@ -243,10 +242,52 @@ def extractArticlePublishedDate(articleLink, html = None): return articleDate +def _get_html_response(url): + """ + simple request execution + :param url: string of url + :return: html response + """ + request = urllib.Request(url) + html = urllib.build_opener().open(request).read() + + return html + + + +def get_relevant_date(url, html=None): + """ + retrieves the most relevant published date for an article + :param url: string of url + :param html: string of html response (to avoid request execution) + :return: oldest date from the following options: + 1) date in the url + 2) headers of the response (json-ld, meta, etc.) + 3) html known tags + """ + # getting date by input url + url_base_date = _extractFromURL(url) + + # bs parsing for extended data + html = html or _get_html_response(url) + parsed_html = BeautifulSoup(html, "lxml") + + # extended dates (json-ld, html tags, etc.) + jsonld_base_date = _extractFromLDJson(parsed_html) + meta_base_date = _extractFromMeta(parsed_html) + html_tags_base_date = _extractFromHTMLTag(parsed_html) + + possible_dates = [url_base_date, jsonld_base_date, meta_base_date, html_tags_base_date] + possible_dates = filter(lambda _date: _date is not None and isinstance(_date, datetime), possible_dates) + possible_dates = [_date.replace(tzinfo=pytz.UTC) for _date in possible_dates] + print(possible_dates) + + # return oldest date + return min(possible_dates) if __name__ == '__main__': d = extractArticlePublishedDate("http://techcrunch.com/2015/11/30/atlassian-share-price/") print(d) - \ No newline at end of file + diff --git a/setup.py b/setup.py index cc9431e..834a6e2 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name='articleDateExtractor', packages=['articleDateExtractor'], - version='0.20', + version='0.21.0', author='Ran Geva', author_email='ran@webhose.io, yitao.sun@yahoo.com, wilson.s.shilo@gmail.com', url='https://github.com/Webhose/article-date-extractor', From bfb7b389a37241c3e106d5d41cbc57bebef2102b Mon Sep 17 00:00:00 2001 From: Yaniv Israel Date: Mon, 24 Apr 2023 18:49:33 +0300 Subject: [PATCH 2/2] rename package to webz-article-date-extractor --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 834a6e2..4bba61b 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ readme = f.read() setup( - name='articleDateExtractor', + name='webz-article-date-extractor', packages=['articleDateExtractor'], version='0.21.0', author='Ran Geva',