import datetime
import feedparser
from time import mktime, time
import requests
import traceback
from urllib.request import build_opener, HTTPCookieProcessor, Request
import re

from scraper_const import feeds, cats, buzzwords

# Edit these depending on the desired behaviour ----------
URL_PROXY_PREFIX = "http://proxy.uchicago.edu/login?url="
RSS_USE_PREFIX = True
# ---------------------------------------------------------
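
# `scraper_const` is not shown here; from its usage in this module it is
# expected to provide something like the following (a hypothetical sketch
# inferred from attribute access below, not the actual file):
#
#     from collections import namedtuple
#     Feed = namedtuple('Feed', ['url', 'journal_name', 'journal_img'])
#     feeds = [Feed('https://feeds.aps.org/rss/recent/prl.xml', 'PRL', 'img/prl.png')]
#     cats = ['cat:physics.optics', 'cat:quant-ph']    # ArXiV category filters
#     buzzwords = ['photonic', 'qubit', 'nanocavity']  # keywords to match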

class Article:
    CSS_DIV_CLASS = 'Article'

    def __init__(self, post, **kwargs):
        self.raw_post = post
        self.journal = 'Article'
        self.journal_img = None
        self.use_proxy_prefix = False
        # Map keyword arguments to feed-entry attributes,
        # e.g. url='link' sets self.url = post.link
        for kw, val in kwargs.items():
            setattr(self, kw, getattr(post, val, None))
        # Get the authors
        self.authors = ', '.join(a['name'] for a in post.authors if 'name' in a) if hasattr(post, 'authors') else None
        # Get the date (fall back to today if the entry carries no parsed date)
        self.date = datetime.date.today()
        for kw in ['published_parsed', 'date_parsed']:
            if getattr(post, kw, None) is not None:
                self.date = datetime.date.fromtimestamp(mktime(getattr(post, kw)))
                break

    def build_html(self, preview=True):
        title, abstract = self.title, self.abstract
        html_img = ''
        if preview:
            try:
                preview = generate_dict(self.url)
                if any(ext in preview['image'] for ext in ['png', 'jpeg']):
                    html_img = '<img src="{}" align="left">'.format(preview['image'])
                title, abstract = preview['title'], preview['description']
            except Exception:
                print("Failed to load preview for: {}".format(self.url))
                traceback.print_exc()
        # Strip the first <img> tag out of the abstract; if no preview image
        # was found, reuse that tag as the article image instead.
        img_in_abstract = re.search(r"<img [\s\S]*?>", abstract)
        if img_in_abstract is not None:
            img_in_abstract = img_in_abstract.group()
            abstract = abstract.replace(img_in_abstract, '')
            if html_img == '':
                # TODO: check that there is not already an align attribute, and remove it
                # TODO: adjust the height and width
                html_img = img_in_abstract[:-1] + ' align="left">'
        html = '<div class="{}">'.format(self.CSS_DIV_CLASS)
        html += '<div class="journal">{}<span> {}</span></div>'.format(
            '<img src="{}">'.format(self.journal_img) if self.journal_img is not None else '', self.journal)
        html += '<a href="{}" target="_blank"><h3>{}</h3></a>'.format(
            URL_PROXY_PREFIX + self.url if self.use_proxy_prefix else self.url, title)
        html += html_img
        html += '{}<br>Date: <i>{}</i>\n<br><br>{}<br>'.format(self.authors, self.date, abstract)
        html += '<br><br><i>Buzzwords: {}</i><br>'.format(', '.join(self.get_relevant_buzzwords()))
        html += '<a href="{}" target="_blank">PDF</a></div>'.format(self.pdf) if hasattr(self, 'pdf') else '</div>'
        return html

    def get_relevant_buzzwords(self):
        # Collect the buzzwords that appear in the title, abstract or author list
        text = ''
        for content in [self.title, self.abstract, self.authors]:
            text += content if content is not None else ''
        return [word for word in buzzwords if word.lower() in text.lower()]

    def contains_buzzwords(self):
        return len(self.get_relevant_buzzwords()) > 0

class RSS_Articles(Article):
    CSS_DIV_CLASS = 'Article RSS'

    def __init__(self, post, feed, url='link', title='title', abstract='description', authors='author', **kwargs):
        super().__init__(post, url=url, title=title, abstract=abstract, authors=authors, **kwargs)
        self.feed = feed
        self.journal = feed.journal_name
        self.journal_img = feed.journal_img
        self.use_proxy_prefix = RSS_USE_PREFIX

    def build_html(self, preview=None):
        # Some publisher sites (e.g. ACS Photonics) block preview requests, so skip them
        if preview is None:
            preview = not any(bad_one in self.url for bad_one in ['acsphotonics'])
        return super().build_html(preview=preview)

class ArXiV_Articles(Article):
    CSS_DIV_CLASS = 'Article ArXiV'

    def __init__(self, post, url='link', title='title', abstract='summary', **kwargs):
        super().__init__(post, url=url, title=title, abstract=abstract, **kwargs)
        self.journal = 'ArXiV'
        self.journal_img = 'img/arxiv.png'
        # ArXiV entries link the PDF directly; keep it for the "PDF" button
        for ref in post.links:
            if ref['type'] == 'application/pdf':
                self.pdf = ref['href']

    def build_html(self, preview=False):
        return super().build_html(preview=preview)

def scrape_RSS(days):
    results = []
    datetosearch = datetime.date.today() - datetime.timedelta(days=days)
    for feed in feeds:
        try:
            d = feedparser.parse(feed.url)
            for post in d.entries:
                a = RSS_Articles(post, feed)
                if a.date >= datetosearch and a.contains_buzzwords():
                    results.append(a)
        except Exception:
            traceback.print_exc()
            print("Failed to import feed: {}".format(feed.url))
    return results

def scrape_ArXiV(days):
    results = []
    today = datetime.date.today()
    datetosearch = today - datetime.timedelta(days=days + 1)
    dates_str = [d.strftime('%Y%m%d%H%M') for d in [datetosearch, today]]
    # Build the ArXiV API query:
    # (all:<buzzword> OR ...) AND (<cat> OR ...) AND submittedDate:[<from> TO <to>]
    string = 'http://export.arxiv.org/api/query?search_query=('
    string += ('all:{}+OR+' * (len(buzzwords) - 1) + 'all:{})').format(*buzzwords)
    string += ('+AND+({}' + '+OR+{}' * (len(cats) - 1) + ')').format(*cats)
    string += '+AND+submittedDate:[{}+TO+{}]'.format(*dates_str)
    string += '&sortBy=submittedDate&max_results=10000'
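    # For example, with buzzwords ['qubit', 'photon'] and cats ['cat:quant-ph']
    # (hypothetical values -- the real ones live in scraper_const), the query is:
    #   http://export.arxiv.org/api/query?search_query=(all:qubit+OR+all:photon)
    #     +AND+(cat:quant-ph)+AND+submittedDate:[202401010000+TO+202401020000]
    #     &sortBy=submittedDate&max_results=10000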
    try:
        # Register the opensearch/arxiv namespaces so feedparser exposes their fields
        feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
        feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'
        response = requests.get(string)
        response.encoding = 'utf-8'
        d = feedparser.parse(response.text)
        for post in d.entries:
            a = ArXiV_Articles(post)
            if a.contains_buzzwords():
                results.append(a)
    except Exception:
        traceback.print_exc()
        print("Failed to import from ArXiV")
    return results
def build_html(articles, html_header=""):
articles.sort(key=lambda a: a.date, reverse=True)
t0 = time()
pre_html = """
<html>
<head>
<title>ScraperBot Results</title>
<link rel="stylesheet" href="scraperbot.css">
</head>
"""
body_html = '\n'.join(a.build_html() for a in articles)
post_html = "</div></body></html>"
pre_html += "<div class='stats'>Took {:.1f}s to generate the page</div>".format(time()-t0)
pre_html += html_header
pre_html += '<div class="Container">\n<h1>ScraperBot Results</h1>\n'
return pre_html + body_html + post_html

def scrape(days):
    t0 = time()
    rss = scrape_RSS(days)
    t1 = time()
    arxiv = scrape_ArXiV(days)
    header = "<div class='stats'>RSS feeds -- Found {} articles. Took {:.1f}s</div>\n".format(len(rss), t1 - t0)
    header += "<div class='stats'>ArXiV -- Found {} articles. Took {:.1f}s</div>\n".format(len(arxiv), time() - t1)
    html = build_html(rss + arxiv, header)
    return rss + arxiv, html
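
# A typical call (illustrative -- any positive day count works):
#
#     articles, html = scrape(7)   # everything from the last week
#     open('results.htm', 'w').write(html)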
"""
The following is a slightly modified version of the link_preview package which can be found here:
https://github.com/aakash4525/py_link_preview/blob/master/link_preview/link_preview.py
I've copied it here to:
1. Limit the use of external pakages
2. Make the url request compatible with some of the journals (ie: set user agent and enable cookie)
"""

def generate_dict(url):
    '''
    Returns a dictionary containing the elements of the link preview:
        'title', 'description', 'image', 'website'
    If anything goes wrong, the exception from the urllib.request call propagates.
    '''
    return_dict = {}
    # ---------------------------------- MODIFIED CODE ----------------------------------
    # Use a cookie-aware opener with a browser-like User-Agent so that journal
    # sites that reject bare urllib requests still answer.
    opener = build_opener(HTTPCookieProcessor())
    html = opener.open(Request(
        url,
        data=None,
        headers={'User-Agent': 'Mozilla'}
    ), timeout=30).read().decode('utf-8')
    # ------------------------------------------------------------------------------------
    meta_elems = re.findall(r'<[\s]*meta[^<>]+og:(?:title|image|description)(?!:)[^<>]+>', html)
    og_dict = dict(return_og(elem) for elem in meta_elems)
    # Title
    try:
        return_dict['title'] = og_dict['og.title']
    except KeyError:
        return_dict['title'] = find_title(html)
    # Description
    try:
        return_dict['description'] = og_dict['og.description']
    except KeyError:
        return_dict['description'] = find_meta_desc(html)
    # Website
    return_dict['website'] = find_host_website(url)
    # Image
    try:
        return_dict['image'] = og_dict['og.image']
    except KeyError:
        image_path = find_image(html)
        if 'http' not in image_path:
            image_path = 'http://' + return_dict['website'] + image_path
        return_dict['image'] = image_path
    return return_dict
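
# Shape of the result (values are illustrative, not real output):
#
#     generate_dict('https://doi.org/10.1000/example')
#     # -> {'title': 'Some article', 'description': '...',
#     #     'image': 'https://.../fig1.png', 'website': 'doi.org'}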

def return_og(elem):
    '''
    Returns a (key, content) pair for an og: meta element.
    '''
    content = re.findall(r'content[\s]*=[\s]*"[^<>"]+"', elem)[0]
    p = re.findall(r'"[^<>]+"', content)[0][1:-1]
    if 'og:title' in elem:
        return ("og.title", p)
    elif 'og:image' in elem and 'og:image:' not in elem:
        return ("og.image", p)
    elif 'og:description' in elem:
        return ("og.description", p)

def find_title(html):
    '''
    Returns the <title> of the html.
    '''
    try:
        title_elem = re.findall(r'<[\s]*title[\s]*>[^<>]+<[\s]*/[\s]*title[\s]*>', html)[0]
        title = re.findall(r'>[^<>]+<', title_elem)[0][1:-1]
    except IndexError:
        title = ''
    return title

def find_meta_desc(html):
    '''
    Returns the description (<meta name="description">) of the html.
    '''
    try:
        meta_elem = re.findall(r'<[\s]*meta[^<>]+name[\s]*=[\s]*"[\s]*description[\s]*"[^<>]*>', html)[0]
        content = re.findall(r'content[\s]*=[\s]*"[^<>"]+"', meta_elem)[0]
        description = re.findall(r'"[^<>]+"', content)[0][1:-1]
    except IndexError:
        description = ''
    return description

def find_image(html):
    '''
    Returns the favicon (<link rel="shortcut icon">) of the html.
    '''
    try:
        favicon_elem = re.findall(r'<[\s]*link[^<>]+rel[\s]*=[\s]*"[\s]*shortcut icon[\s]*"[^<>]*>', html)[0]
        href = re.findall(r'href[\s]*=[\s]*"[^<>"]+"', favicon_elem)[0]
        image = re.findall(r'"[^<>]+"', href)[0][1:-1]
    except IndexError:
        image = ''
    return image

def find_host_website(url):
    '''
    Returns the host website from the url.
    '''
    return list(filter(lambda x: '.' in x, url.split('/')))[0]
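
# Note: a more robust alternative (not used above, to preserve the original
# behaviour) would be the standard library's urllib.parse:
#
#     from urllib.parse import urlparse
#     urlparse('https://doi.org/10.1000/example').netloc   # -> 'doi.org'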
"""
This handles the execution from a coomand line
"""

import webbrowser, os, sys

# Remove previous results
try:
    os.remove("results.htm")
except OSError:
    pass

# Check whether a specific number of days is to be scraped
days_to_scrape = 1
if len(sys.argv) > 1:
    try:
        days_to_scrape = int(sys.argv[1])
    except ValueError:
        pass
print("Searching through the last {} days...".format(days_to_scrape))

# Run the scrape and write out the results page (non-ASCII characters dropped)
a, html = scrape(days_to_scrape)
with open("results.htm", 'w+') as f:
    f.write(html.encode('ascii', 'ignore').decode('ascii'))

# Open the results in Chrome (Windows install path)
chrome_path = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s'
webbrowser.get(chrome_path).open("results.htm")