-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_session_metadata.py
More file actions
89 lines (77 loc) · 2.66 KB
/
scrape_session_metadata.py
File metadata and controls
89 lines (77 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from utils.models import Session
from utils.backend import PleDB
from scripts.scrape_sessions import request_html
# Root of the Catalan Parliament website; recorded on every session dict.
base = 'https://www.parlament.cat/'
# Index page listing plenary-session videos (paginated via ?p_cp20=N).
sessions_meta_pages = urljoin(base,
                              '/web/canal-parlament/activitat/plens/index.html')
def main():
    """Entry point: connect to the database and scrape every session."""
    database = PleDB(task_name='v1')
    database.connect()
    get_all_session_metadata(database)
def get_all_session_metadata(db):
    """Fetch the full session list and persist each session's metadata."""
    for entry in get_session_list():
        get_session_meta(entry, db)
def get_session_list():
    """Walk every paginated listing page and collect all session dicts."""
    print("scraping the list of sessions")
    pages = get_session_pages()
    # flatten the per-page extraction results into a single list
    sessions = [s for page in pages for s in extract_sessions(page)]
    print("%i sessions extracted from %i pages" % (len(sessions), len(pages)))
    return sessions
def get_session_pages():
    """Build the URL of every paginated session-listing page (1-based)."""
    urls = []
    for index in range(get_page_no()):
        urls.append(urljoin(sessions_meta_pages, '?p_cp20=%i' % (index + 1)))
    return urls
def get_page_no(default=33):
    """Return the number of session-listing pages to scrape.

    TODO: detect the real page count from the site's pagination widget
    instead of relying on a hard-coded value.

    Args:
        default: page count to report until detection is implemented.
            Defaults to 33, the value observed when this was written.

    Returns:
        int: number of paginated listing pages.
    """
    return default
def extract_sessions(url):
    """Download one listing page and return the sessions parsed from it."""
    soup = BeautifulSoup(request_html(url), 'html.parser')
    return parse_for_sessions(soup)
def parse_for_sessions(soup):
    """Extract one metadata dict per session from a parsed listing page.

    Each <h2> inside the 'llista_videos' list marks one session; its
    parent element holds the session link and <p> tags of the form
    "Data: <date>" or "Durada: <duration>".

    Args:
        soup: BeautifulSoup tree of a session-listing page.

    Returns:
        list of dicts with keys 'base_url', 'url', 'name' and, when the
        corresponding labels are present, 'date' and 'duration'.
    """
    sessions = []
    key_convert = {'Data': 'date', 'Durada': 'duration'}
    llista = soup.find('ul', attrs={'class': 'llista_videos'})
    for element in llista.find_all('h2'):
        session = {}
        div = element.find_parent()
        link = div.find('a')
        session['base_url'] = base
        session['url'] = link.get('href')
        session['name'] = link.text
        for p in div.find_all('p'):
            # we expect to find "Data: <date>" or "Durada: <duration>";
            # split only on the FIRST ':' so values that themselves
            # contain colons (times like "18:30") don't break unpacking
            if ':' in p.text:
                key, value = p.text.split(':', 1)
                field = key_convert.get(key.strip())
                if field is not None:  # skip labels we don't recognise
                    session[field] = value.strip()
        sessions.append(session)
    return sessions
def get_session_meta(session, db):
    """Build a Session from *session* and insert it into *db* if new.

    Sessions already present in the database (looked up by URL) are
    skipped; new ones get their interventions scraped before insertion.

    Args:
        session: metadata dict as produced by parse_for_sessions().
        db: connected PleDB instance used for lookups and inserts.

    Raises:
        TypeError: when *session* contains keys Session() does not accept.
    """
    try:
        current_session = Session(**session)
    except TypeError:
        # dump the offending payload for debugging, then re-raise: the
        # bare raise keeps the original message and traceback, unlike
        # the previous `raise TypeError()` which discarded both
        print(session)
        raise
    if db.get(current_session.url):
        msg = '%s - %s already in db. skipping' % (current_session.ple_code,
                                                   current_session.name)
        print(msg)
    else:
        current_session.get_interventions()
        db.insert(current_session.url,
                  current_session.meta_to_dict())
        msg = '%s - %s with %i interventions inserted to db' \
              % (current_session.ple_code,
                 current_session.name,
                 current_session.no_interventions)
        print(msg)
# Allow running this scraper directly as a script.
if __name__ == "__main__":
    main()