Skip to content

Commit a0f6bd3

Browse files
authored
Merge pull request #212 from tifa365/feature/thueringen-wfs
Replace Thüringen HTML scraper with WFS scraper for geolocation support
2 parents 094f6dd + 1599d21 commit a0f6bd3

File tree

2 files changed: 47 additions and 92 deletions

README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ In details, the IDs are sourced as follows:
3636
|SL| `OBJECTID` from the WFS service | `SL-255` |❌ no (confirmed with data provider but no alternative available) |
3737
|SN| Field `id` from the API | `SN-4062` |✅ likely|
3838
|ST| `ID` query param from the details page URL | `ST-1001186` |❓ probably?|
39-
|TH| `Schulnumer` from school list | `TH-10601` |✅ likely|
39+
|TH| `Schulnummer` from the WFS service | `TH-10601` |✅ likely|
4040

4141
## Geolocations
4242
When available, we try to use the geolocations provided by the data publishers.
@@ -57,7 +57,7 @@ When available, we try to use the geolocations provided by the data publishers.
5757
| SL | ✅ Yes | WFS |
5858
| SN | ✅ Yes | API |
5959
| ST | ❌ No | - |
60-
| TH | ❌ No | - |
60+
| TH | ✅ Yes | WFS |
6161

6262
## Installation
6363
Dependency management is done using [uv](https://docs.astral.sh/uv/). Make sure

jedeschule/spiders/thueringen.py

Lines changed: 45 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,4 @@
1-
import re
2-
3-
import scrapy
1+
import xmltodict
42
from scrapy import Item
53

64
from jedeschule.items import School
@@ -9,101 +7,58 @@
97

108
class ThueringenSpider(SchoolSpider):
119
name = "thueringen"
12-
base_url = "https://www.schulportal-thueringen.de"
13-
1410
start_urls = [
15-
"https://www.schulportal-thueringen.de/tip/schulportraet_suche/search.action?tspi=&tspm=&vsid=none&mode=&extended=0&anwf=schulportraet&freitextsuche=&name=&schulnummer=&strasse=&plz=&ort=&schulartDecode=&schulamtDecode=&kzFreierTraeger_cb=1&kzFreierTraeger=2&schultraegerDecode=&sortierungDecode=Schulname&rowsPerPage=999&schulartCode=&schulamtCode=&schultraegerCode=&sortierungCode=10&uniquePortletId=portlet_schulportraet_suche_WAR_tip1109990a_e473_4c62_872b_4ef69bdb6c5d&ajaxId=schulportraet_suche_results"
11+
"https://www.geoproxy.geoportal-th.de/geoproxy/services/kommunal/komm_wfs?"
12+
"SERVICE=WFS&REQUEST=GetFeature&typeNames=kommunal:komm_schul&"
13+
"srsname=EPSG:4326&VERSION=2.0.0"
1614
]
1715

18-
# TODO: parse last_modified
19-
def parse(self, response):
20-
headers = [
21-
header.css("::text").extract_first().strip()
22-
for header in response.css("th")
23-
]
24-
for tr in response.css(".tispo_row_odd,.tispo_row_normal"):
25-
collection = {}
26-
tds = tr.css("td")
27-
for index, td in enumerate(tds):
28-
key = headers[index]
29-
value = td.css("::text").extract_first()
30-
# The school name is hidden in a link so we check if there
31-
# is a link and if yes extract the value from that
32-
link_text = td.css("a ::text").extract_first()
33-
if link_text:
34-
value = link_text
35-
collection[key] = value.strip()
36-
# inspect_response(response, self)
37-
url = tds[1].css("::attr(href)").extract_first().strip()
38-
request = scrapy.Request(self.base_url + url, callback=self.parse_overview)
39-
request.meta["collection"] = collection
40-
yield request
16+
def parse(self, response, **kwargs):
17+
data = xmltodict.parse(response.text)
18+
members = data.get("wfs:FeatureCollection", {}).get("wfs:member", [])
4119

42-
def parse_overview(self, response):
43-
# inspect_response(response, self)
44-
collection = response.meta["collection"]
45-
for tr in response.css(".tispo_labelValueView tr"):
46-
tds = tr.css("td ::text").extract()
47-
# sometimes there is no value for the key
48-
if len(tds) >= 2:
49-
collection[tds[0][:-1].strip()] = "".join(
50-
[td.strip() for td in tds[1:]]
51-
)
52-
collection["data_url"] = response.url
53-
collection["Leitbild"] = " ".join(
54-
response.css(".tispo_htmlUserContent ::text").extract()
55-
)
56-
yield collection
20+
if not isinstance(members, list):
21+
members = [members]
22+
23+
for member in members:
24+
school = member.get("kommunal:komm_schul", {})
25+
26+
data_elem = {}
27+
28+
# Extract geometry coordinates
29+
geom = school.get("kommunal:GEOM", {})
30+
point = geom.get("gml:Point", {})
31+
pos = point.get("gml:pos", "")
32+
if pos:
33+
lon, lat = pos.split()
34+
data_elem["lat"] = float(lat)
35+
data_elem["lon"] = float(lon)
36+
37+
# Extract all other fields
38+
for key, value in school.items():
39+
if key not in ("kommunal:GEOM", "@gml:id") and value:
40+
# Remove namespace prefix
41+
clean_key = key.split(":", 1)[-1] if ":" in key else key
42+
data_elem[clean_key] = value
43+
44+
yield data_elem
5745

5846
@staticmethod
5947
def normalize(item: Item) -> School:
60-
city_parts = item.get("Ort").split()
61-
zip, city = city_parts[0], " ".join(city_parts[1:])
6248
return School(
63-
name=item.get("Schulname"),
49+
name=item.get("Name"),
6450
id="TH-{}".format(item.get("Schulnummer")),
65-
address=item.get("Straße"),
66-
zip=zip,
67-
city=city,
68-
website=item.get("Internet"),
69-
email=ThueringenSpider._deobfuscate_email(item.get("E-Mail")),
51+
address=" ".join(
52+
filter(None, [item.get("Strasse"), item.get("Hausnummer")])
53+
),
54+
zip=item.get("PLZ"),
55+
city=item.get("Ort"),
56+
website=item.get("Webseite"),
57+
email=item.get("EMail"),
7058
school_type=item.get("Schulart"),
71-
provider=item.get("Schulträger"),
72-
fax=item.get("Telefax"),
73-
phone=item.get("Telefon"),
59+
provider=item.get("Traeger"),
60+
fax=item.get("Faxnummer"),
61+
phone=item.get("Telefonnummer"),
62+
latitude=item.get("lat"),
63+
longitude=item.get("lon"),
7464
)
75-
76-
@staticmethod
77-
def _deobfuscate_email(orig):
78-
"""
79-
Reverse-engineered version of the deobfuscation code on the website.
80-
81-
:param orig: the obfuscated string or the whole function call (`$(function() {...})`),
82-
as long as it contains the prefix `#3b` and the suffix `3e#`.
83-
:return: the deofuscated string
84-
"""
85-
86-
result = ""
87-
if orig and re.search(r"#3b[a-z0-9 ]+3e#", orig):
88-
orig = re.search(r"#3b[a-z0-9 ]+3e#", orig).group(0)
89-
s = (
90-
orig.replace(" ", "")
91-
.replace("#3b", "")
92-
.replace("3e#", "")
93-
.replace("o", "")
94-
)
95-
96-
last_value = 0
97-
current_value = 0
98-
for i, c in enumerate(s):
99-
if c.isnumeric():
100-
current_value = int(c)
101-
else:
102-
current_value = ord(c) - 97 + 10
103-
104-
if i % 2 == 1:
105-
t = int(last_value * 23 + current_value) // 2
106-
result += chr(t)
107-
last_value = current_value
108-
109-
return result

0 commit comments

Comments
 (0)