-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrawler_reuters.py
More file actions
executable file
·131 lines (115 loc) · 5.29 KB
/
crawler_reuters.py
File metadata and controls
executable file
·131 lines (115 loc) · 5.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/python
import re
import urllib2
import csv
import os
import sys
import time
import datetime
import numpy as np
from bs4 import BeautifulSoup
class news_Reuters:
    """Crawl Reuters company-news pages for every ticker in
    ./input/tickerList.csv and append the headlines found to
    ./input/news_reuters.csv.

    Tickers listed in ./input/finished.reuters (a restart checkpoint) are
    skipped; tickers that yield no news are logged to
    ./input/news_failed_tickers.csv with a LOW/LOWEST priority tag.

    NOTE(review): this module targets Python 2 (``urllib2`` import, and
    ``parser`` concatenates ``bytes + str``) — run under a Python 2
    interpreter.
    """

    def __init__(self):
        # Build the set of tickers already crawled so a restarted task
        # does not repeat finished work.
        filterList = set()
        try:  # this file only exists when we restart a task
            with open('./input/finished.reuters') as fList:
                for l in fList:
                    filterList.add(l.strip())
        except IOError:  # first run: no checkpoint file yet
            pass
        dateList = self.dateGenerator(1000)  # look back on the past X days
        with open('./input/tickerList.csv') as fin:
            for line in fin:  # iterate all possible tickers
                line = line.strip().split(',')
                ticker, name, exchange, MarketCap = line
                if ticker in filterList:
                    continue
                print("%s - %s - %s - %s" % (ticker, name, exchange, MarketCap))
                self.contents(ticker, name, line, dateList, exchange)

    def contents(self, ticker, name, line, dateList, exchange):
        """Crawl all news for one ticker, iterating backwards through dates.

        First probes the dateless company-news page: if the ticker has no
        news at all it is recorded with LOWEST priority and skipped.
        Otherwise dates are crawled newest-first until a long run of empty
        days (scaled by how much news the probe found) suggests there is
        nothing older; the ticker is then recorded with LOW priority.
        """
        # https://uk.reuters.com/info/disclaimer
        suffix = {'AMEX': '.A', 'NASDAQ': '.O', 'NYSE': '.N'}
        # e.g. http://www.reuters.com/finance/stocks/company-news/BIDU.O?date=09262017
        url = "http://www.reuters.com/finance/stocks/company-news/" + ticker + suffix[exchange]
        has_Content = 0
        repeat_times = 4
        # check the website to see if that ticker has many news
        # if true, iterate url with date, otherwise stop
        for _ in range(repeat_times):  # repeat in case of http failure
            try:
                time.sleep(3)  # throttle politely between requests
                response = urllib2.urlopen(url)
                data = response.read()
                soup = BeautifulSoup(data, "lxml")
                has_Content = len(soup.find_all("div", {'class': ['topStory', 'feature']}))
                break
            except Exception as e:  # network/parse failure: report and retry
                print(e)
                continue
        # spider task for the past
        # if some company has no news even if we don't input date
        #     set this ticker into the lowest priority list
        # else
        #     if it doesn't have a single news for NN consecutive days,
        #     stop iterating dates and set this ticker into the
        #     second-lowest priority list
        with open('./input/news_failed_tickers.csv', 'a+') as ticker_failed:
            if has_Content > 0:
                missing_days = 0
                for timestamp in dateList:
                    hasNews = self.repeatDownload(ticker, line, url, timestamp)
                    if hasNews:
                        missing_days = 0  # if get news, reset missing_days as 0
                    else:
                        missing_days += 1
                    # 2 news: wait 30 days and stop; 10 news: wait 70 days
                    if missing_days > has_Content * 5 + 20:
                        break  # no news in X consecutive days, stop crawling
                    if missing_days > 0 and missing_days % 20 == 0:  # print the process
                        print("%s has no news for %d days, stop this candidate ..." % (ticker, missing_days))
                        ticker_failed.write(ticker + ',' + timestamp + ',' + 'LOW\n')
            else:
                print("%s has no news" % (ticker))
                today = datetime.datetime.today().strftime("%Y%m%d")
                ticker_failed.write(ticker + ',' + today + ',' + 'LOWEST\n')

    def repeatDownload(self, ticker, line, url, timestamp):
        """Download one dated news page, retrying on HTTP errors.

        Returns 1 if any news was found and written, else 0.
        """
        new_time = timestamp[4:] + timestamp[:4]  # change 20151231 to 12312015 to match reuters format
        repeat_times = 3  # repeat downloading in case of http error
        for _ in range(repeat_times):
            try:
                time.sleep(3)  # throttle politely between requests
                response = urllib2.urlopen(url + "?date=" + new_time)
                data = response.read()
                soup = BeautifulSoup(data, "lxml")
                hasNews = self.parser(soup, line, ticker, timestamp)
                if hasNews:
                    return 1  # return if we get the news
                break  # stop looping if the content is empty (no error)
            except Exception:  # repeat if http error appears
                print('Http error')
                continue
        return 0

    def parser(self, soup, line, ticker, timestamp):
        """Extract headline/body pairs from a parsed page and append them
        as CSV rows to ./input/news_reuters.csv.

        Returns 1 if at least one story was found, else 0.
        """
        content = soup.find_all("div", {'class': ['topStory', 'feature']})
        if len(content) == 0:
            return 0
        # Loop-invariant: whether the page has a 'topStory' div at all.
        has_top_story = len(soup.find_all("div", class_="topStory")) > 0
        with open('./input/news_reuters.csv', 'a+') as fout:
            for i in range(len(content)):
                # Strip commas/newlines so the row stays a valid CSV line.
                title = content[i].h2.get_text().replace(",", " ").replace("\n", " ")
                body = content[i].p.get_text().replace(",", " ").replace("\n", " ")
                news_type = 'topStory' if i == 0 and has_top_story else 'normal'
                print(ticker, timestamp, title, news_type)
                fout.write(','.join([ticker, line[1], timestamp, title, body, news_type]).encode('utf-8') + '\n')
        return 1

    def dateGenerator(self, numdays):
        """Return the most recent *numdays* dates as 'YYYYMMDD' strings,
        newest (today) first."""
        base = datetime.datetime.today()
        return [(base - datetime.timedelta(days=x)).strftime("%Y%m%d")
                for x in range(numdays)]
def main():
    """Entry point: instantiating the crawler runs the whole crawl."""
    news_Reuters()


if __name__ == "__main__":
    main()