uksiem-parser/draft_EM_scraper.py at master · finiteprods/uksiem-parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

#! python3

# Memos_download.py - Scrapes SI explanatory memoranda (PDFs) from www.legislation.gov.uk

import os
from urllib.parse import urljoin

import bs4
import requests

# Upper bound of the SI numbers requested for each year, keyed by the year
# as a string; the download loop tries numbers 1..totalNums[year] inclusive.
totalNums = {
    '2005': 3601,
    '2006': 3511,
    '2007': 3662,
    '2008': 3327,
    '2009': 3468,
    '2010': 3117,
    '2011': 3136,
    '2012': 3329,
    '2013': 3318,
    '2014': 3563,
}

# Folder prefix that receives one sub-folder per year (PDFs plus two log
# files). Kept as a plain string prefix so the resulting paths are unchanged.
baseDir = 'C:\\Users\\Radoslaw\\Documents\\4-Academic-Current\\projects\\memo_project\\memoranda_pdfs\\'
siteRoot = 'http://www.legislation.gov.uk/'
# Seconds to wait for the server on a single request before giving up, so a
# stalled connection cannot hang the whole run.
requestTimeout = 30

for y in range(2005,2015):

    yearDir = baseDir + str(y)
    # exist_ok=True lets the script be re-run without FileExistsError.
    os.makedirs(yearDir, exist_ok=True)
    # All files below are opened with relative names, so work inside yearDir.
    os.chdir(yearDir)

    # 'missing' records SIs whose memorandum page did not return HTTP 200;
    # 'errors' records SIs whose PDF link or download failed.
    # The with-block closes both logs even if the loop below raises.
    # (The errors log used to be named 'errors_+<year>.txt' because of a
    # stray '+' inside the literal; it now matches the 'missing_si_' naming.)
    with open('missing_si_'+str(y)+'.txt','w') as missing, \
         open('errors_'+str(y)+'.txt','w') as errors:

        missing.write('SI Memoranda ('+str(y)+') Not Found on http://legislation.gov.uk:\n\n')
        errors.write('PDF address extraction and downloading errors ('+str(y)+'):\n\n')

        for i in range(1,totalNums[str(y)]+1):

            #1. download the page
            url = siteRoot+'uksi/'+str(y)+'/'+str(i)+'/memorandum/contents'
            print('Dowloading page '+url+'...')
            try:
                res = requests.get(url, timeout=requestTimeout)
            except requests.RequestException:
                # A connection problem is not the same as "memorandum not
                # found", so it goes to the errors log; the run carries on.
                errors.write(str(i)+'/'+str(y)+': Error when downloading page\n')
                continue
            if res.status_code != requests.codes.ok:
                missing.write(str(i)+'/'+str(y)+': Memorandum Not Found\n')
                continue

            soup = bs4.BeautifulSoup(res.text, "html.parser")

            #2. find url for pdf
            si = soup.select('#viewLegContents a')
            # The first matching link is taken as the PDF; a link without an
            # href is treated the same as no link at all.
            siURL = si[0].get('href') if si else None
            if not siURL:
                errors.write(str(i)+'/'+str(y)+': Error when extracting address\n')
                continue

            #2.1. download pdf
            # urljoin copes with root-relative ('/uksi/...'), relative and
            # absolute hrefs without producing 'uk//uksi' or a doubled host.
            try:
                res = requests.get(urljoin(siteRoot, siURL), timeout=requestTimeout)
            except requests.RequestException:
                errors.write(str(i)+'/'+str(y)+': Error when downloading pdf\n')
                continue
            if res.status_code != requests.codes.ok:
                errors.write(str(i)+'/'+str(y)+': Error when downloading pdf\n')
                continue

            #2.2. save pdf under the file name found at the end of the href
            with open(os.path.basename(siURL),'wb') as pdfFile:
                for chunk in res.iter_content(100000):
                    pdfFile.write(chunk)

print('Done')