-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathharvest.py
More file actions
executable file
·135 lines (107 loc) · 4.77 KB
/
harvest.py
File metadata and controls
executable file
·135 lines (107 loc) · 4.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/python
"""
This file is part of Barnacler.
Copyright 2012 Paul Rosenfeld
This code is released under a GPLv3 license. See the COPYING and LICENSE
files in this directory for more details.
"""
import json
import re
import datetime
import pdb
import sqlite3
import tempfile
import subprocess
verbose=True
class Cruise:
def __init__(self, departurePort, ship, price, portsOfCall, lengthLocationString, departureDateString, comment=""):
self.departurePort = departurePort
self.ship=ship
try:
self.price=float(price[1:].replace(",",""))
except ValueError:
self.price=-1.0;
self.portsOfCall= portsOfCall
self.cruiseLength = None
self.returnDate = None
self.departureDate = None
self.departureDateFromString(departureDateString)
self.arrivalDateFromString(lengthLocationString)
self.scrapeDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
self.comment = comment;
def departureDateFromString(self, dateStr):
regex = re.compile(r'^\w+,\s+(\w+)\s+(\d+),\s+(\d{4})')
self.departureDate = datetime.datetime.strptime(" ".join(regex.match(dateStr).groups()), "%b %d %Y")
def arrivalDateFromString(self, lengthLocationString):
regex = re.compile(r'^(\d+)\s+Day')
self.cruiseLength = int(regex.match(lengthLocationString).group(1))
self.returnDate = self.departureDate + datetime.timedelta(days=self.cruiseLength)
def getTableCreateString(self, table):
typeMap = {str: "varchar(255)", unicode: "varchar(255)", int: "integer", datetime.datetime: "date", float: "double"}
tableDef = map(lambda x: typeMap[x], map(type, self.__dict__.values()))
columns = []
for key, db_type in zip(self.__dict__.keys(), tableDef):
columns.append("%s %s"%(key, db_type))
createStr = "CREATE TABLE IF NOT EXISTS %s (id integer primary key, %s) "%(table, ",".join(columns))
return createStr;
def getInsertString(self, table):
fields = ",".join(self.__dict__.keys())
values = ",".join(map(lambda x: "\"%s\""%(x), self.__dict__.values()))
return "INSERT INTO %s (%s) VALUES (%s);"%(table,fields,values)
def __str__(self):
return "%s %s (%s-%s) %d day ports: %s"%(self.ship,self.departurePort, str(self.departureDate),str(self.returnDate), self.cruiseLength, self.portsOfCall)
def pretty_print(json_str):
print json.dumps(json_str, sort_keys=True, indent=2, separators=(',', ': '))
def jsonToCruiseObj(x, date, price, comment=""):
return Cruise(x["departurePort"], x["ship"], price, "|".join(x["ports"]), x["day_location"], date)
def execWrapper(cursor, sqlStr):
if verbose:
print "\tEXEC: '%s'"%(sqlStr)
cursor.execute(sqlStr);
if __name__ == "__main__":
from optparse import OptionParser
opt_parser = OptionParser("Usage: %prog [options]")
opt_parser.add_option("-i", "--json-file", dest="json_filename", type="string", help="Force insert this json file instead of getting new data")
opt_parser.add_option("-o", "--sqlite_filename", dest="sqlite_filename", type="string", help="sqlite file to write data to", default="cruises.sqlite")
opt_parser.add_option("-c", "--comment", dest="comment", type="string", help="a comment to be stored with the data (ex: 'start of sale')", default="")
opt_parser.add_option("-I", "--init", dest="init_database",action="store_true", help="run CREATE queries to setup tables before inserting data")
opt_parser.add_option("-s", "--scrape", dest="scrape",action="store_true", help="scrape carnival website for new data")
(options, args) = opt_parser.parse_args()
if verbose:
print "comment: ",options.comment
tmpFile = tempfile.NamedTemporaryFile(mode="r+b", dir="json_data/", delete=False);
if options.scrape:
cmdStr = "casperjs carnival.js %s"%tmpFile.name
if verbose:
print "Launching scraper: '%s'"%cmdStr
# launch the harvester
stdoutStr = subprocess.Popen(cmdStr.split(" "), stdout=subprocess.PIPE).communicate()[0];
if verbose:
print stdoutStr
textData = tmpFile.read();
elif options.json_filename:
textData = open(options.json_filename).read()
if verbose:
print "Read json: %s"%(textData)
json_data = json.loads(textData);
if verbose:
pretty_print(json_data);
dbCon = sqlite3.connect(options.sqlite_filename);
cursor = dbCon.cursor();
if options.init_database:
execWrapper(cursor, jsonToCruiseObj(json_data[0], json_data[0]["date"][0], json_data[0]["price"][0]).getTableCreateString("cruiseData"))
for x in json_data:
if verbose:
print "Cruise: %s on %s from %s to: "%(x["day_location"],x["ship"], x["departurePort"])
print " To: "
for p in x["ports"]:
print " %s"%p
for date,price in zip(x["date"], x["price"]):
if verbose:
print "\t\t%s\t%s"%(date,price)
c = jsonToCruiseObj(x, date, price, options.comment)
if verbose:
print "INSERT: %s"%(str(c))
execWrapper(cursor, c.getInsertString("cruiseData"))
dbCon.commit();
dbCon.close();