Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions articleDateExtractor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__author__ = 'Ran Geva'

import urllib2,re, json
from six.moves import urllib
import re, json
from dateutil.parser import parse
try:
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -38,16 +39,16 @@ def _extractFromLDJson(parsedHTML):

try:
jsonDate = parseStrDate(data['datePublished'])
except Exception, e:
except Exception as e:
pass

try:
jsonDate = parseStrDate(data['dateCreated'])
except Exception, e:
except Exception as e:
pass


except Exception, e:
except Exception as e:
return None


Expand Down Expand Up @@ -205,18 +206,18 @@ def _extractFromHTMLTag(parsedHTML):

def extractArticlePublishedDate(articleLink, html = None):

print "Extracting date from " + articleLink
print("Extracting date from " + articleLink)

articleDate = None

try:
articleDate = _extractFromURL(articleLink)

if html is None:
request = urllib2.Request(articleLink)
request = urllib.request.Request(articleLink)
# Using a browser user agent decreases the chance of sites blocking this request - just a suggestion
# request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')
html = urllib2.build_opener().open(request).read()
html = urllib.request.build_opener().open(request).read()

parsedHTML = BeautifulSoup(html,"lxml")

Expand All @@ -230,8 +231,8 @@ def extractArticlePublishedDate(articleLink, html = None):
articleDate = possibleDate

except Exception as e:
print "Exception in extractArticlePublishedDate for " + articleLink
print e.message, e.args
print("Exception in extractArticlePublishedDate for " + articleLink)
print(e.message, e.args)



Expand All @@ -242,4 +243,4 @@ def extractArticlePublishedDate(articleLink, html = None):

if __name__ == '__main__':
d = extractArticlePublishedDate("http://techcrunch.com/2015/11/30/atlassian-share-price/")
print d
print(d)
15 changes: 5 additions & 10 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
from codecs import open

try:
from setuptools import setup
except ImportError:
from distutils.core import setup

with open('README.md', 'r', 'utf-8') as f:
readme = f.read()

setup(
name='articleDateExtractor',
Expand All @@ -17,18 +13,17 @@
url='https://github.com/Webhose/article-date-extractor',
license='MIT',
description='Automatically extracts and normalizes an online article or blog post publication date',
long_description=readme,
install_requires=[
"BeautifulSoup >= 3.2.1",
"python-dateutil >= 2.4.2"
"beautifulsoup4>=4.5.3",
"python-dateutil>=2.6.0",
"six>=1.10.0"
],
classifiers=(
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Natural Language :: English',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7'
'Programming Language :: Python :: 3.5'
)
)
)