# parse_comps.py
#
# Parse competition result pages (HTML) and store the competitions,
# runners and results in the competition database.
import os
import sys
import db
import glob
from bs4 import BeautifulSoup
from fsutils import FileRepository
import config
# Verbose switch: when true, parse_competition and the script entry point
# print extra progress messages.
debug = True
def parse_comp_name(d):
    """Return the competition name found in a parsed results page.

    d is any object offering a BeautifulSoup-style ``select`` method. The
    text of the first ``div.v-public-page-heading`` element is returned with
    surrounding whitespace removed. "unknown" is returned when the page has
    no such element or the element contains only whitespace.
    """
    headings = d.select("div.v-public-page-heading")
    if not headings:
        return "unknown"
    # The heading markup may carry newlines/indentation around the title;
    # strip it so the stored name is clean.
    name = headings[0].text.strip()
    return name or "unknown"
def parse_competition(html_str, fname=""):
    """Parse one competition results page into a plain dict.

    Parameters:
        html_str: HTML source of the results page.
        fname: path the HTML was read from. Its base name up to the first
            "." becomes the competition id; it is also shown in warnings.

    Returns:
        A dict ``{"id": ..., "name": ..., "results": {...}}`` where
        ``results`` maps each class name to a list of row dicts with the
        string fields "placing", "name", "club", "time" and "diff".
        Rows with fewer than five cells are reported and skipped; rows
        whose first cell is not a number are skipped silently.

    Raises:
        IndexError: if the page has fewer than two tables, or a class
            header table has fewer than two cells.
    """
    # init html parser
    d = BeautifulSoup(html_str, 'html.parser')

    # competition id
    cid = os.path.basename(fname).split(".")[0]

    # create competition dict
    comp = {"id": cid, "name": parse_comp_name(d), "results": {}}
    if debug:
        # NOTE(review): the explicit encode presumably lets Python 2 print
        # non-ASCII names when stdout has no encoding -- confirm before
        # changing.
        print(" * competition name: %s" % (comp["name"].encode('utf8')))

    # The tables nested inside the second <table> (in document order)
    # alternate between a class header (even index, from 2) and that class's
    # result table (odd index, from 3); the last nested table is not used.
    nested = d.find_all("table")[1].find_all("table")
    class_tables = nested[2:-1:2]
    result_tables = nested[3:-1:2]

    for ct, rt in zip(class_tables, result_tables):
        # parse class name
        k = ct.find_all("td")[1].text.strip()
        if not k:
            print(" ** invalid class in file %s" % fname)
            continue
        rows = comp["results"][k] = []

        # parse results (the first row is skipped)
        for result in rt.find_all("tr")[1:]:
            result_line = [td.text.strip() for td in result.find_all("td")]
            if len(result_line) < 5:
                print(" ** invalid result in '%s': %s" % (fname, k))
                # Skip the row: indexing cells 0..4 below would raise
                # IndexError on a short row.
                continue
            if result_line[0].isdigit():
                rows.append({
                    "placing": result_line[0],
                    "name": result_line[1],
                    "club": result_line[2],
                    "time": result_line[3],
                    "diff": result_line[4],
                })
    return comp
def time_as_float(s):
    """Convert a dot-separated time string to a number of minutes.

    Accepts "m.s" or "h.m.s"; every field is parsed with float(). For
    example "12.30" gives 12.5 and "1.02.30" gives 62.5.

    Raises:
        ValueError: if a field is not numeric, or if s does not consist of
            exactly two or three fields.
    """
    # A real list (not a lazy map object) so len() works on Python 3 too.
    parts = [float(field) for field in s.split(".")]
    if len(parts) == 3:
        hours, minutes, seconds = parts
    elif len(parts) == 2:
        hours = 0.0
        minutes, seconds = parts
    else:
        # Previously this fell through with unbound locals.
        raise ValueError("unsupported time format: %r" % (s,))
    return 60.0 * hours + minutes + seconds / 60.0
def persist_competition(conn, competition):
    """Store a parsed competition and all of its results via ``db``.

    competition must contain the keys "id", "name" and "results"
    (class name -> list of result dicts). A competition row is written
    first; then, for every result, the runner is looked up by name (and
    created with the result's club when the lookup returns nothing) and a
    result row is written with time and diff converted by time_as_float.
    The connection is committed once after all rows have been written.
    """
    for required_key in ("id", "name", "results"):
        assert required_key in competition

    comp_id = competition["id"]

    # write competition row
    db.write_competition(conn, comp_id, competition["name"])

    for klass, class_results in competition["results"].items():
        for entry in class_results:
            runner_name = entry["name"]
            # reuse the stored runner, or create one if the lookup is empty
            runner = (db.read_runner_by_name(conn, runner_name)
                      or db.write_runner(conn, runner_name, entry["club"]))
            # write result record
            db.write_result(
                conn,
                runner["id"],
                comp_id,
                klass,
                entry["placing"],
                time_as_float(entry["time"]),
                time_as_float(entry["diff"]),
            )

    conn.commit()
def get_competition_html_paths(comp_dir):
    """List the ``*.html`` files located directly inside comp_dir.

    Returns the list produced by glob.glob for the pattern
    ``<comp_dir>/*.html``; it is empty when nothing matches.
    """
    html_pattern = os.path.join(comp_dir, "*.html")
    return glob.glob(html_pattern)
if __name__ == "__main__":
    # Script entry point. Usage: parse_comps.py <year>
    # Parses every HTML file reported by the year's file repository and
    # stores each competition in that year's competition database.

    # parse args
    if len(sys.argv) < 2:
        print("Missing year.")
        sys.exit(1)
    year = int(sys.argv[1])

    fsrepo = FileRepository(config.conf["rootpath"], year)
    fsrepo.ensure_directory_layout()

    # database connection
    conn = db.create_sqlite_by_filename(fsrepo.get_competition_db_path())

    # read directory listing
    fs = fsrepo.get_html_paths()
    if debug:
        print(" * start parsing %i htmls" % (len(fs)))

    # parse each file
    for i, fn in enumerate(fs):
        if debug:
            print(" * parsing: %s" % (fn))
        # Read through a context manager so each file handle is closed
        # right away instead of lingering until garbage collection.
        with open(fn) as html_file:
            html_str = html_file.read()
        competition = parse_competition(html_str, fn)
        persist_competition(conn, competition)
        print(" * competition read: %d/%d" % (i+1, len(fs)))