From 371e39304151be8e35af6e4da541a803b96c6b4c Mon Sep 17 00:00:00 2001
From: Yaniv Israel
Date: Sun, 3 Oct 2021 14:20:00 +0300
Subject: [PATCH 1/8] adding get_relevant_date method

---
 articleDateExtractor/__init__.py | 51 ++++++++++++++++++++++++++++----
 setup.py                         |  2 +-
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/articleDateExtractor/__init__.py b/articleDateExtractor/__init__.py
index 201133a..a034fe1 100644
--- a/articleDateExtractor/__init__.py
+++ b/articleDateExtractor/__init__.py
@@ -2,6 +2,8 @@
 import re,json
 from dateutil.parser import parse
+from datetime import datetime
+import pytz
 
 #try except for different urllib under python3 and python2
 try:
@@ -220,10 +222,7 @@ def extractArticlePublishedDate(articleLink, html = None):
     articleDate = _extractFromURL(articleLink)
 
     if html is None:
-        request = urllib.Request(articleLink)
-        # Using a browser user agent, decreases the change of sites blocking this request - just a suggestion
-        # request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')
-        html = urllib.build_opener().open(request).read()
+        html = _get_html_response(articleLink)
 
     parsedHTML = BeautifulSoup(html,"lxml")
 
@@ -243,10 +242,52 @@
     return articleDate
 
 
+def _get_html_response(url):
+    """
+    simple request execution
+    :param url: string of url
+    :return: html response
+    """
+    request = urllib.Request(url)
+    html = urllib.build_opener().open(request).read()
+
+    return html
+
+
+def get_relevant_date(url, html=None):
+    """
+    retrieves the most relevant published date for an article
+    :param url: string of url
+    :param html: string of html response (to avoid request execution)
+    :return: oldest date from the following options:
+        1) date in the url
+        2) headers of the response (json-ld, meta, etc.)
+        3) html known tags
+    """
+    # getting date by input url
+    url_base_date = _extractFromURL(url)
+
+    # bs parsing for extended data
+    html = html or _get_html_response(url)
+    parsed_html = BeautifulSoup(html, "lxml")
+
+    # extended dates (json-ld, html tags, etc.)
+    jsonld_base_date = _extractFromLDJson(parsed_html)
+    meta_base_date = _extractFromMeta(parsed_html)
+    html_tags_base_date = _extractFromHTMLTag(parsed_html)
+
+    possible_dates = [url_base_date, jsonld_base_date, meta_base_date, html_tags_base_date]
+    possible_dates = filter(lambda _date: _date is not None and isinstance(_date, datetime), possible_dates)
+    possible_dates = [_date.replace(tzinfo=pytz.UTC) for _date in possible_dates]
+    print(possible_dates)
+
+    # return oldest date
+    return min(possible_dates)
 
 if __name__ == '__main__':
     d = extractArticlePublishedDate("http://techcrunch.com/2015/11/30/atlassian-share-price/")
     print(d)
-    
\ No newline at end of file
+
diff --git a/setup.py b/setup.py
index cc9431e..834a6e2 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 setup(
     name='articleDateExtractor',
     packages=['articleDateExtractor'],
-    version='0.20',
+    version='0.21.0',
     author='Ran Geva',
     author_email='ran@webhose.io, yitao.sun@yahoo.com, wilson.s.shilo@gmail.com',
     url='https://github.com/Webhose/article-date-extractor',

From bfb7b389a37241c3e106d5d41cbc57bebef2102b Mon Sep 17 00:00:00 2001
From: Yaniv Israel
Date: Mon, 24 Apr 2023 18:49:33 +0300
Subject: [PATCH 2/8] rename package to webz-article-date-extractor

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 834a6e2..4bba61b 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
     readme = f.read()
 
 setup(
-    name='articleDateExtractor',
+    name='webz-article-date-extractor',
     packages=['articleDateExtractor'],
     version='0.21.0',
     author='Ran Geva',

From ed7942dfb40a72226d35731dff2f4678f621439c Mon Sep 17 00:00:00 2001
From: orzinger
Date: Thu, 27 Apr 2023 13:53:10 +0300
Subject: [PATCH 3/8] add domain extraction

---
 articleDateExtractor/__init__.py | 101 ++++++++++++++-----------------
 1 file changed, 46 insertions(+), 55 deletions(-)

diff --git a/articleDateExtractor/__init__.py b/articleDateExtractor/__init__.py
index a034fe1..078fb11 100644
--- a/articleDateExtractor/__init__.py
+++ b/articleDateExtractor/__init__.py
@@ -1,24 +1,24 @@
 __author__ = 'Ran Geva'
 
-import re,json
+import re, json
 from dateutil.parser import parse
 from datetime import datetime
+from webhose_metrics import count as metrics_count
 import pytz
+from urlparse import urlparse
 
-#try except for different urllib under python3 and python2
+# try except for different urllib under python3 and python2
 try:
     import urllib.request as urllib
 except ImportError:
     import urllib2 as urllib
 
-
 try:
     from bs4 import BeautifulSoup
 except ImportError:
     from BeautifulSoup import BeautifulSoup
 
-
 def parseStrDate(dateString):
     try:
         dateTimeObj = parse(dateString)
@@ -26,15 +26,18 @@ def parseStrDate(dateString):
     except:
         return None
 
+
 # Try to extract from the article URL - simple but might work as a fallback
 def _extractFromURL(url):
-
-    #Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
-    m = re.search(r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?', url)
+    # Regex by Newspaper3k - https://github.com/codelucas/newspaper/blob/master/newspaper/urls.py
+    m = re.search(
+        r'([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})?',
+        url)
     if m:
         return parseStrDate(m.group(0))
 
-    return None
+    return None
+
 
 def _extractFromLDJson(parsedHTML):
     jsonDate = None
@@ -59,13 +62,10 @@ def _extractFromLDJson(parsedHTML):
     except Exception as e:
         return None
 
-
-
     return jsonDate
 
 def _extractFromMeta(parsedHTML):
-
     metaDate = None
     for meta in parsedHTML.findAll("meta"):
         metaName = meta.get('name', '').lower()
         itemProp = meta.get('itemprop', '').lower()
@@ -73,100 +73,92 @@ def _extractFromMeta(parsedHTML):
         httpEquiv = meta.get('http-equiv', '').lower()
         metaProperty = meta.get('property', '').lower()
 
-
-        #<meta name="pubdate" content="..." />
+        # <meta name="pubdate" content="..." />
         if 'pubdate' == metaName:
             metaDate = meta['content'].strip()
             break
 
-
-        #<meta name="publishdate" content="..." />
+        # <meta name="publishdate" content="..." />
         if 'publishdate' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="timestamp" content="..." />
+        # <meta name="timestamp" content="..." />
        if 'timestamp' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="dc.date.issued" content="..." />
+        # <meta name="dc.date.issued" content="..." />
         if 'dc.date.issued' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta property="article:published_time" content="..." />
+        # <meta property="article:published_time" content="..." />
         if 'article:published_time' == metaProperty:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="date" content="..." />
+        # <meta name="date" content="..." />
         if 'date' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta property="bt:pubdate" content="..." />
+        # <meta property="bt:pubdate" content="..." />
         if 'bt:pubdate' == metaProperty:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="sailthru.date" content="..." />
+        # <meta name="sailthru.date" content="..." />
         if 'sailthru.date' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="article.published" content="..." />
+        # <meta name="article.published" content="..." />
         if 'article.published' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="published-date" content="..." />
+        # <meta name="published-date" content="..." />
         if 'published-date' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="article.created" content="..." />
+        # <meta name="article.created" content="..." />
         if 'article.created' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="article_date_original" content="..." />
+        # <meta name="article_date_original" content="..." />
         if 'article_date_original' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="cxenseparse:recs:publishtime" content="..." />
+        # <meta name="cxenseparse:recs:publishtime" content="..." />
         if 'cxenseparse:recs:publishtime' == metaName:
             metaDate = meta['content'].strip()
             break
 
-        #<meta name="date_published" content="..." />
+        # <meta name="date_published" content="..." />
         if 'date_published' == metaName:
             metaDate = meta['content'].strip()
             break
 
-
-        #<meta itemprop="datepublished" content="..." />
+        # <meta itemprop="datepublished" content="..." />
         if 'datepublished' == itemProp:
             metaDate = meta['content'].strip()
             break
 
-
-        #<meta itemprop="datecreated" content="..." />
+        # <meta itemprop="datecreated" content="..." />
         if 'datecreated' == itemProp:
             metaDate = meta['content'].strip()
             break
 
-
-
-
-        #<meta property="og:image" content="..." />
+        # <meta property="og:image" content="..." />
         if 'og:image' == metaProperty or "image" == itemProp:
             url = meta['content'].strip()
             possibleDate = _extractFromURL(url)
             if possibleDate is not None:
-                return  possibleDate
+                return possibleDate
 
-
-        #<meta http-equiv="date" content="..." />
+        # <meta http-equiv="date" content="..." />
         if 'date' == httpEquiv:
             metaDate = meta['content'].strip()
             break
@@ -176,8 +168,9 @@ def _extractFromMeta(parsedHTML):
     return None
 
+
 def _extractFromHTMLTag(parsedHTML):
-    #
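
A minimal usage sketch of the get_relevant_date entry point introduced in PATCH 1/8, assuming the patched package is installed (renamed on PyPI to webz-article-date-extractor in PATCH 2/8, but still imported as articleDateExtractor). The URL is the example already used in the module's __main__ block; the output handling is only illustrative:

    from articleDateExtractor import get_relevant_date

    # Example URL reused from the module's __main__ block.
    url = "http://techcrunch.com/2015/11/30/atlassian-share-price/"

    try:
        # Returns the oldest of the URL-, JSON-LD-, meta- and HTML-tag-based
        # candidates, each normalized to UTC via pytz before comparison.
        published = get_relevant_date(url)
        print(published.isoformat())
    except ValueError:
        # min() raises ValueError when no candidate date was found.
        print("no publish date detected")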