-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetdata.py
More file actions
83 lines (77 loc) · 3.38 KB
/
getdata.py
File metadata and controls
83 lines (77 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/python
# Screen scrape from buses.citytransport.org.uk to get bus times from
# Fiveways in the Brighton Centre direction
# (c) Tobias Quinn, 2011 <tobias@tobiasquinn.com>, GPLv3
# LICENSE: GPLv3 - http://www.gnu.org/licenses/gpl.html
import urllib2
from datetime import timedelta, datetime
from BeautifulSoup import BeautifulSoup
class BusData:
    """Scrape upcoming bus arrivals from a set of departure-board pages.

    The pages to read are held in ``self._urls``; getData() fetches each
    one and returns the arrivals found as a single time-ordered list.
    """

    # A clock time lying further than this in the past is taken to belong
    # to tomorrow (e.g. "00:05" read at 23:55).  The margin keeps a bus that
    # is merely a few minutes overdue from being pushed a whole day ahead.
    _ROLLOVER = timedelta(hours=12)

    def __init__(self, URLFILE=None):
        """Choose the pages to scrape.

        URLFILE -- path of a text file holding one URL per line.  Blank
                   lines and lines whose first non-blank character is '#'
                   are ignored.  When omitted, a built-in list of local
                   test pages is used instead.

        Raises IOError if URLFILE is given but cannot be opened.
        """
        if URLFILE is not None:
            # Read live data; the with-block closes the file even on error.
            with open(URLFILE) as urlfile:
                stripped = (line.strip() for line in urlfile)
                self._urls = [url for url in stripped
                              if url and not url.startswith('#')]
        else:
            # Test data
            self._urls = [
                'file:data/set4/out1.html',
                'file:data/set4/out2.html',
                'file:data/set4/out3.html',
                'file:data/set4/out4.html',
            ]

    def _parse_arrival(self, text, now=None):
        """Turn the text of a time cell into an absolute datetime.

        text -- either a number of minutes until arrival (no ':' present)
                or a clock time "H:MM", optionally marked with '*'.
        now  -- reference time; defaults to datetime.now().

        Raises ValueError if the numeric parts cannot be converted.
        """
        if now is None:
            now = datetime.now()
        if ':' not in text:
            # Minutes to arrival: only the first two characters are read,
            # which drops whatever follows the number in the cell.
            return now + timedelta(minutes=int(text[:2]))
        # Strip any * (it marks a calculated rather than timetabled time).
        hr, _, mins = text.replace('*', '').partition(':')
        # Start from "now" and overwrite the hour and minute; seconds and
        # microseconds are carried over from "now" unchanged.
        arrive = now.replace(hour=int(hr), minute=int(mins))
        if now - arrive > self._ROLLOVER:
            # The clock time has wrapped past midnight: it means tomorrow.
            arrive += timedelta(days=1)
        return arrive

    def getData(self):
        """Fetch every configured page and collect the arrivals on it.

        Returns a list of (busnum, destination, arrive) tuples sorted by
        arrive, where arrive is a datetime and the other two items are the
        text of the corresponding page cells.  Rows without a time are
        skipped.

        Errors raised by urllib2.urlopen, and ValueError for a time cell
        that cannot be parsed, propagate to the caller.
        """
        times = []
        for url in self._urls:
            response = urllib2.urlopen(url)
            try:
                doc = response.read()
            finally:
                # Release the connection / file handle promptly.
                response.close()
            soup = BeautifulSoup(doc, convertEntities=BeautifulSoup.HTML_ENTITIES)
            # NOTE(review): BeautifulSoup 3 appears to treat a bare string
            # passed as attrs as a CSS class name - confirm for the
            # installed version.
            dtimes = soup.findAll('span', attrs='dfifahrten')
            # The spans arrive in groups of three: busnum, destination,
            # time.  Stopping two short of the end skips a trailing
            # incomplete group instead of raising IndexError.
            for i in range(0, len(dtimes) - 2, 3):
                busnum = dtimes[i].string
                destination = dtimes[i + 1].string
                arrive = dtimes[i + 2].string
                if arrive is None:
                    continue
                times.append((busnum, destination, self._parse_arrival(arrive)))
        # Earliest arrival first.
        return sorted(times, key=lambda entry: entry[2])
if __name__ == '__main__':
    # Run one live scrape using the URL list stored in service.urls.
    bd = BusData('service.urls')
    # The scraping method is getData(); BusData has no _getdata attribute,
    # so the previous call raised AttributeError.
    bd.getData()