|
1 | | -import re |
2 | | - |
3 | | -import scrapy |
| 1 | +import xmltodict |
4 | 2 | from scrapy import Item |
5 | 3 |
|
6 | 4 | from jedeschule.items import School |
|
9 | 7 |
|
10 | 8 | class ThueringenSpider(SchoolSpider): |
11 | 9 | name = "thueringen" |
12 | | - base_url = "https://www.schulportal-thueringen.de" |
13 | | - |
14 | 10 | start_urls = [ |
15 | | - "https://www.schulportal-thueringen.de/tip/schulportraet_suche/search.action?tspi=&tspm=&vsid=none&mode=&extended=0&anwf=schulportraet&freitextsuche=&name=&schulnummer=&strasse=&plz=&ort=&schulartDecode=&schulamtDecode=&kzFreierTraeger_cb=1&kzFreierTraeger=2&schultraegerDecode=&sortierungDecode=Schulname&rowsPerPage=999&schulartCode=&schulamtCode=&schultraegerCode=&sortierungCode=10&uniquePortletId=portlet_schulportraet_suche_WAR_tip1109990a_e473_4c62_872b_4ef69bdb6c5d&ajaxId=schulportraet_suche_results" |
| 11 | + "https://www.geoproxy.geoportal-th.de/geoproxy/services/kommunal/komm_wfs?" |
| 12 | + "SERVICE=WFS&REQUEST=GetFeature&typeNames=kommunal:komm_schul&" |
| 13 | + "srsname=EPSG:4326&VERSION=2.0.0" |
16 | 14 | ] |
17 | 15 |
|
18 | | - # TODO: parse last_modified |
19 | | - def parse(self, response): |
20 | | - headers = [ |
21 | | - header.css("::text").extract_first().strip() |
22 | | - for header in response.css("th") |
23 | | - ] |
24 | | - for tr in response.css(".tispo_row_odd,.tispo_row_normal"): |
25 | | - collection = {} |
26 | | - tds = tr.css("td") |
27 | | - for index, td in enumerate(tds): |
28 | | - key = headers[index] |
29 | | - value = td.css("::text").extract_first() |
30 | | - # The school name is hidden in a link so we check if there |
31 | | - # is a link and if yes extract the value from that |
32 | | - link_text = td.css("a ::text").extract_first() |
33 | | - if link_text: |
34 | | - value = link_text |
35 | | - collection[key] = value.strip() |
36 | | - # inspect_response(response, self) |
37 | | - url = tds[1].css("::attr(href)").extract_first().strip() |
38 | | - request = scrapy.Request(self.base_url + url, callback=self.parse_overview) |
39 | | - request.meta["collection"] = collection |
40 | | - yield request |
| 16 | + def parse(self, response, **kwargs): |
| 17 | + data = xmltodict.parse(response.text) |
| 18 | + members = data.get("wfs:FeatureCollection", {}).get("wfs:member", []) |
41 | 19 |
|
42 | | - def parse_overview(self, response): |
43 | | - # inspect_response(response, self) |
44 | | - collection = response.meta["collection"] |
45 | | - for tr in response.css(".tispo_labelValueView tr"): |
46 | | - tds = tr.css("td ::text").extract() |
47 | | - # sometimes there is no value for the key |
48 | | - if len(tds) >= 2: |
49 | | - collection[tds[0][:-1].strip()] = "".join( |
50 | | - [td.strip() for td in tds[1:]] |
51 | | - ) |
52 | | - collection["data_url"] = response.url |
53 | | - collection["Leitbild"] = " ".join( |
54 | | - response.css(".tispo_htmlUserContent ::text").extract() |
55 | | - ) |
56 | | - yield collection |
| 20 | + if not isinstance(members, list): |
| 21 | + members = [members] |
| 22 | + |
| 23 | + for member in members: |
| 24 | + school = member.get("kommunal:komm_schul", {}) |
| 25 | + |
| 26 | + data_elem = {} |
| 27 | + |
| 28 | + # Extract geometry coordinates |
| 29 | + geom = school.get("kommunal:GEOM", {}) |
| 30 | + point = geom.get("gml:Point", {}) |
| 31 | + pos = point.get("gml:pos", "") |
| 32 | + if pos: |
| 33 | + lon, lat = pos.split() |
| 34 | + data_elem["lat"] = float(lat) |
| 35 | + data_elem["lon"] = float(lon) |
| 36 | + |
| 37 | + # Extract all other fields |
| 38 | + for key, value in school.items(): |
| 39 | + if key not in ("kommunal:GEOM", "@gml:id") and value: |
| 40 | + # Remove namespace prefix |
| 41 | + clean_key = key.split(":", 1)[-1] if ":" in key else key |
| 42 | + data_elem[clean_key] = value |
| 43 | + |
| 44 | + yield data_elem |
57 | 45 |
|
58 | 46 | @staticmethod |
59 | 47 | def normalize(item: Item) -> School: |
60 | | - city_parts = item.get("Ort").split() |
61 | | - zip, city = city_parts[0], " ".join(city_parts[1:]) |
62 | 48 | return School( |
63 | | - name=item.get("Schulname"), |
| 49 | + name=item.get("Name"), |
64 | 50 | id="TH-{}".format(item.get("Schulnummer")), |
65 | | - address=item.get("Straße"), |
66 | | - zip=zip, |
67 | | - city=city, |
68 | | - website=item.get("Internet"), |
69 | | - email=ThueringenSpider._deobfuscate_email(item.get("E-Mail")), |
| 51 | + address=" ".join( |
| 52 | + filter(None, [item.get("Strasse"), item.get("Hausnummer")]) |
| 53 | + ), |
| 54 | + zip=item.get("PLZ"), |
| 55 | + city=item.get("Ort"), |
| 56 | + website=item.get("Webseite"), |
| 57 | + email=item.get("EMail"), |
70 | 58 | school_type=item.get("Schulart"), |
71 | | - provider=item.get("Schulträger"), |
72 | | - fax=item.get("Telefax"), |
73 | | - phone=item.get("Telefon"), |
| 59 | + provider=item.get("Traeger"), |
| 60 | + fax=item.get("Faxnummer"), |
| 61 | + phone=item.get("Telefonnummer"), |
| 62 | + latitude=item.get("lat"), |
| 63 | + longitude=item.get("lon"), |
74 | 64 | ) |
75 | | - |
76 | | - @staticmethod |
77 | | - def _deobfuscate_email(orig): |
78 | | - """ |
79 | | - Reverse-engineered version of the deobfuscation code on the website. |
80 | | -
|
81 | | - :param orig: the obfuscated string or the whole function call (`$(function() {...})`), |
82 | | - as long as it contains the prefix `#3b` and the suffix `3e#`. |
83 | | - :return: the deofuscated string |
84 | | - """ |
85 | | - |
86 | | - result = "" |
87 | | - if orig and re.search(r"#3b[a-z0-9 ]+3e#", orig): |
88 | | - orig = re.search(r"#3b[a-z0-9 ]+3e#", orig).group(0) |
89 | | - s = ( |
90 | | - orig.replace(" ", "") |
91 | | - .replace("#3b", "") |
92 | | - .replace("3e#", "") |
93 | | - .replace("o", "") |
94 | | - ) |
95 | | - |
96 | | - last_value = 0 |
97 | | - current_value = 0 |
98 | | - for i, c in enumerate(s): |
99 | | - if c.isnumeric(): |
100 | | - current_value = int(c) |
101 | | - else: |
102 | | - current_value = ord(c) - 97 + 10 |
103 | | - |
104 | | - if i % 2 == 1: |
105 | | - t = int(last_value * 23 + current_value) // 2 |
106 | | - result += chr(t) |
107 | | - last_value = current_value |
108 | | - |
109 | | - return result |
0 commit comments