MiurScraper/ScraperMiur.py at main · sfrances/MiurScraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from selenium import webdriver
import re
import json
from pprint import pprint

from bs4 import BeautifulSoup
class scraper:
    """Download a job-listing page and print the calls found in it.

    The page is either fetched live through a headless Firefox instance or,
    in simulation mode, read back from the local ``content.html`` file written
    by a previous live run.
    """

    writeGraphite = False

    def __init__(self, simulate, WEBPAGE):
        """Store the target URL and, unless simulating, start Firefox.

        Args:
            simulate: When true, no browser is started and the page content
                is later read from ``content.html``.
            WEBPAGE: URL of the page to fetch.
        """
        self.webpage = WEBPAGE
        # Keep the mode on the instance: GetPageContent must not depend on a
        # module-level ``simulate`` variable, which only exists when this
        # file is run as a script.
        self.simulate = simulate
        self.driver = None
        if not simulate:
            op = webdriver.FirefoxOptions()
            op.add_argument("--headless")
            self.driver = webdriver.Firefox(options=op)
            print("Firefox instance started...")
        else:
            print("Running in simulation mode")
        self.page = ""

    def close(self):
        """Quit the browser started by ``__init__``, if there is one."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def GetPageContent(self):
        """Load the page HTML into ``self.page``.

        In live mode the page is fetched with the browser and also saved to
        ``content.html``; in simulation mode that file is read instead.

        Raises:
            Exception: Any error raised while reading or saving the page
                source is reported and re-raised unchanged.
            OSError: In simulation mode, if ``content.html`` cannot be read.
        """
        if self.simulate:
            with open("content.html", "r", encoding="utf-8") as inF:
                self.page = inF.read()
            return

        self.driver.get(self.webpage)
        self.driver.implicitly_wait(2)
        try:
            html = self.driver.page_source
            ### Update local file
            # Explicit UTF-8 so non-ASCII page text does not depend on the
            # platform default encoding.
            with open("content.html", "w", encoding="utf-8") as outf:
                outf.write(html)
        except Exception:
            print("catched into error, skip this call")
            raise
        self.page = html

    def ScrapeHtml(self):
        """Print the search summary and every ``<p>`` box describing a call.

        The page is scanned line by line. The line containing
        "Risultato della ricerca bandi" is printed as plain text. Lines from
        one containing ``<p>`` up to one containing ``</p>`` are collected
        into a box, which is printed when it passes the filter in
        ``_PrintBox``.

        Raises:
            SystemExit: With code 1 when ``self.page`` is empty.
        """
        if len(self.page) == 0:
            print("HTML output seems to be of lenght 0, exiting")
            raise SystemExit(1)

        box = ""
        for line in self.page.splitlines():

            if "Risultato della ricerca bandi" in line:
                parsed_html = BeautifulSoup(line, features="html.parser")
                print(parsed_html.text)
                box = ""

            if "<p>" in line:
                # Start a new box with this line (added exactly once).
                box = line + "\n"
            elif box:
                box += line + "\n"

            if box and "</p>" in line:
                ### If this point is reached the <p></p> box is complete
                self._PrintBox(box)
                ### Reset box
                box = ""

    def _PrintBox(self, box):
        """Print one collected ``<p>`` box if it looks like a call."""
        ### First check: keep only the calls (marked open or expired)
        if "aperto" not in box and "scaduto" not in box:
            return

        # TODO: filters on "FIS/01" and "RTT" were tried here and disabled;
        # they need refinement because "FIS/01" is not always present.

        parsed_html = BeautifulSoup(box, features="html.parser")
        outtext = parsed_html.text.replace("\n", " ")
        # Append every link found; a box without links is printed as text
        # only instead of failing on a missing URL.
        for link in parsed_html.find_all('a', href=True):
            outtext += " " + "https://bandi.mur.gov.it/" + link['href']
        print(outtext)


if __name__ == "__main__":
    # Script entry point: fetch the listing page for one institute and print
    # the calls found in it.

    simulate = False


    ### All available institutes are kept in this json file
    # NOTE(review): the loaded mapping is not consulted below; the institute
    # code is hard-coded. Loading is kept so a missing file still fails early.
    with open('institutes.json') as iff:
        institutes = json.load(iff)


    institute = 'ROMA1'
    #institute = 'UNIPI'
    WEBPAGE = "https://bandi.mur.gov.it/jobs.php/public/cercaJobs?jv_comp_status_id=*&bb_type_code="+institute+"&azione=cerca&orderby=scadenza_desc"

    # Use a distinct name for the instance so the ``scraper`` class is not
    # shadowed by its own object.
    bot = scraper(simulate, WEBPAGE)
    try:
        bot.GetPageContent()
        bot.ScrapeHtml()
    finally:
        # Always shut the browser down, even when scraping fails or exits,
        # so no headless Firefox process is left behind.
        driver = getattr(bot, "driver", None)
        if driver is not None:
            driver.quit()