-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdraft_EM_scraper.py
More file actions
70 lines (48 loc) · 2.2 KB
/
draft_EM_scraper.py
File metadata and controls
70 lines (48 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#! python3
# Memos_download.py - Scrapes SI explanatory memoranda (PDFs) from www.legislation.gov.uk
import os
from urllib.parse import urljoin

import bs4
import requests
# Upper bound of the SI numbers to probe for each year, keyed by the year as
# a string. The download loop requests every number from 1 up to and
# including this value.
totalNums = {
    "2005": 3601,
    "2006": 3511,
    "2007": 3662,
    "2008": 3327,
    "2009": 3468,
    "2010": 3117,
    "2011": 3136,
    "2012": 3329,
    "2013": 3318,
    "2014": 3563,
}
# Root folder (author's machine) that receives one sub-folder per year; each
# sub-folder holds that year's PDFs plus the two log files written below.
_OUTPUT_ROOT = 'C:\\Users\\Radoslaw\\Documents\\4-Academic-Current\\projects\\memo_project\\memoranda_pdfs\\'
_SITE = 'http://www.legislation.gov.uk/'
# Seconds to wait on the server. Without a timeout a single stalled
# connection would hang the whole multi-year scrape.
_TIMEOUT = 60


def _get(url):
    """Fetch *url*; return the response, or None if the request itself failed.

    Connection errors and timeouts are turned into None so that the caller
    can log the failure and move on instead of aborting the scrape.
    """
    try:
        return requests.get(url, timeout=_TIMEOUT)
    except requests.RequestException:
        return None


def _scrape_memorandum(y, i, missing, errors):
    """Download the explanatory memorandum PDF for SI number *i* of year *y*.

    The PDF is saved in the current working directory under the file name
    taken from its URL. The expected failure modes are recorded rather than
    raised:

    * contents page answered with a non-OK status -> a line in *missing*
    * request failure, no usable PDF link on the page, or PDF answered with
      a non-OK status -> a line in *errors*

    *missing* and *errors* are open, writable text files.
    """
    tag = str(i) + '/' + str(y)

    #1. download the page
    url = _SITE + 'uksi/' + str(y) + '/' + str(i) + '/memorandum/contents'
    print('Dowloading page %s...' % url)
    res = _get(url)
    if res is None:
        errors.write(tag + ': Error when downloading page\n')
        return
    if res.status_code != requests.codes.ok:
        missing.write(tag + ': Memorandum Not Found\n')
        return
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    #2. find url for pdf (first link inside the #viewLegContents element)
    links = soup.select('#viewLegContents a')
    href = links[0].get('href') if links else None
    # An anchor without href, or an href with no file name component, cannot
    # be downloaded or saved, so it is treated as an extraction error.
    pdf_name = os.path.basename(href) if href else ''
    if not pdf_name:
        errors.write(tag + ': Error when extracting address\n')
        return

    #2.1. download pdf
    # urljoin copes with root-relative ("/uksi/..."), relative and absolute
    # hrefs alike; plain concatenation produced "//" or a malformed URL.
    res = _get(urljoin(_SITE, href))
    if res is None or res.status_code != requests.codes.ok:
        errors.write(tag + ': Error when downloading pdf\n')
        return

    #2.2. save pdf
    with open(pdf_name, 'wb') as pdf_file:
        for chunk in res.iter_content(100000):
            pdf_file.write(chunk)


# Scrape every year in turn. Each year gets its own folder, which becomes the
# working directory so that the logs and PDFs land inside it.
for y in range(2005, 2015):
    year_dir = _OUTPUT_ROOT + str(y)
    # exist_ok lets an interrupted run be restarted without FileExistsError.
    os.makedirs(year_dir, exist_ok=True)
    os.chdir(year_dir)
    # The error log used to be named 'errors_+<year>.txt' because of a stray
    # '+' inside the literal; it now mirrors the 'missing_si_<year>.txt' name.
    with open('missing_si_' + str(y) + '.txt', 'w') as missing, \
            open('errors_' + str(y) + '.txt', 'w') as errors:
        missing.write('SI Memoranda ('+str(y)+') Not Found on http://legislation.gov.uk:\n\n')
        errors.write('PDF address extraction and downloading errors ('+str(y)+'):\n\n')
        for i in range(1, totalNums[str(y)] + 1):
            _scrape_memorandum(y, i, missing, errors)
print('Done')