From 7edc0fd5d933320c91a5f27b416a035ca6cce643 Mon Sep 17 00:00:00 2001 From: Rishabh Shukla Date: Mon, 27 Feb 2017 17:32:58 +0530 Subject: [PATCH 1/4] (feat): Refactored library for python3 --- articleDateExtractor/__init__.py | 21 +++++++++++---------- setup.py | 10 +++++----- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/articleDateExtractor/__init__.py b/articleDateExtractor/__init__.py index 8753e4b..8d58383 100644 --- a/articleDateExtractor/__init__.py +++ b/articleDateExtractor/__init__.py @@ -1,6 +1,7 @@ __author__ = 'Ran Geva' -import urllib2,re, json +from six.moves import urllib +import re, json from dateutil.parser import parse try: from bs4 import BeautifulSoup @@ -38,16 +39,16 @@ def _extractFromLDJson(parsedHTML): try: jsonDate = parseStrDate(data['datePublished']) - except Exception, e: + except Exception as e: pass try: jsonDate = parseStrDate(data['dateCreated']) - except Exception, e: + except Exception as e: pass - except Exception, e: + except Exception as e: return None @@ -205,7 +206,7 @@ def _extractFromHTMLTag(parsedHTML): def extractArticlePublishedDate(articleLink, html = None): - print "Extracting date from " + articleLink + print("Extracting date from " + articleLink) articleDate = None @@ -213,10 +214,10 @@ def extractArticlePublishedDate(articleLink, html = None): articleDate = _extractFromURL(articleLink) if html is None: - request = urllib2.Request(articleLink) + request = urllib.request.Request(articleLink) # Using a browser user agent, decreases the change of sites blocking this request - just a suggestion # request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36') - html = urllib2.build_opener().open(request).read() + html = urllib.request.build_opener().open(request).read() parsedHTML = BeautifulSoup(html,"lxml") @@ -230,8 +231,8 @@ def extractArticlePublishedDate(articleLink, html = None): articleDate = possibleDate except Exception as e: - print "Exception in extractArticlePublishedDate for " + articleLink - print e.message, e.args + print("Exception in extractArticlePublishedDate for " + articleLink) + print(e.message, e.args) @@ -242,4 +243,4 @@ def extractArticlePublishedDate(articleLink, html = None): if __name__ == '__main__': d = extractArticlePublishedDate("http://techcrunch.com/2015/11/30/atlassian-share-price/") - print d + print(d) diff --git a/setup.py b/setup.py index 4c7af77..55fce6c 100644 --- a/setup.py +++ b/setup.py @@ -19,8 +19,9 @@ description='Automatically extracts and normalizes an online article or blog post publication date', long_description=readme, install_requires=[ - "BeautifulSoup >= 3.2.1", - "python-dateutil >= 2.4.2" + "beautifulsoup4>=4.5.3", + "python-dateutil>=2.6.0", + "six>=1.10.0" ], classifiers=( 'Development Status :: 4 - Beta', @@ -28,7 +29,6 @@ 'Natural Language :: English', 'License :: OSI Approved :: MIT License', 'Programming Language :: Python', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7' + 'Programming Language :: Python :: 3.5' ) -) \ No newline at end of file +) From 8f33de7c2cff59446358dbad7b957e52467884eb Mon Sep 17 00:00:00 2001 From: Thiago Galery Date: Tue, 16 May 2017 22:42:07 +0100 Subject: [PATCH 2/4] adding other deps --- setup.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/setup.py b/setup.py index 55fce6c..830930a 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,8 @@ -from codecs import open - try: from setuptools import setup except ImportError: from distutils.core import setup -with open('README.md', 'r', 'utf-8') as f: - readme = f.read() setup( name='articleDateExtractor', @@ -17,7 +13,6 @@ url='https://github.com/Webhose/article-date-extractor', license='MIT', description='Automatically extracts and normalizes an online article or blog post publication date', - long_description=readme, install_requires=[ "beautifulsoup4>=4.5.3", "python-dateutil>=2.6.0", From 54a8016db879da4df518dab93b0e8d75fc3f6d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Kayisire?= Date: Tue, 23 May 2017 21:50:32 +0200 Subject: [PATCH 3/4] update setup.py change repo name in url --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 830930a..837cba0 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ version='0.17', author='Ran Geva', author_email='ran@webhose.io', - url='https://github.com/Webhose/article-date-extractor', + url='https://github.com/factmata/article-date-extractor', license='MIT', description='Automatically extracts and normalizes an online article or blog post publication date', install_requires=[ From 9c1664ae3379b890efb41981e65cbfed0ec4963b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Kayisire?= Date: Tue, 23 May 2017 22:01:40 +0200 Subject: [PATCH 4/4] update change back --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 837cba0..830930a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ version='0.17', author='Ran Geva', author_email='ran@webhose.io', - url='https://github.com/factmata/article-date-extractor', + url='https://github.com/Webhose/article-date-extractor', license='MIT', description='Automatically extracts and normalizes an online article or blog post publication date', install_requires=[