
Commit f8451ac

Merge pull request #158 from Datenschule/saarland-geoportal
Saarland geoportal
2 parents 7e64f6f + 5ec94ed

3 files changed: +50 −94 lines changed


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
# Changelog

+## 2025-05-12
+- [SL]: Switch to data from the Geoportal instead of web scraping. We no longer
+  get contact details such as email and phone, but we (might) get more stable
+  ids, and we now also get geolocation data. ⚠️ This breaks existing ids.
+
## 2025-04-30
- [BY]: Latitude and Longitude were swapped in the database. This has been fixed.
- [NW]: Latitude and Longitude were swapped in the database. This has been fixed.
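
As context for the new SL entry, here is a minimal sketch (not part of this commit) of what querying the new data source looks like: it fetches the same WFS GetFeature URL the spider below now uses and counts the returned school features. Only the Python standard library is assumed; the URL and the feature element name are copied from the spider.

import urllib.request
import xml.etree.ElementTree as ET

# GetFeature URL exactly as used in jedeschule/spiders/saarland.py below
WFS_URL = (
    "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer"
    "?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
)
SD_NS = "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer"

with urllib.request.urlopen(WFS_URL) as response:
    tree = ET.fromstring(response.read())

# Each school is one <SD:Schulen_SL> feature member in the response
schools = list(tree.iter(f"{{{SD_NS}}}Schulen_SL"))
print(f"Fetched {len(schools)} school features from the Geoportal WFS")

Running this sketch requires network access to geoportal.saarland.de.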

README.md

Lines changed: 4 additions & 4 deletions
@@ -20,8 +20,8 @@ ISO-3166-2 code (without the `DE-` prefix).
In detail, the IDs are sourced as follows:


-|State| ID-Source | example-id |stable|
-|-----|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------|------|
+|State| ID-Source | example-id |stable|
+|-----|--------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------|------|
|BW| Field `DISCH` (Dienststellenschlüssel) in the JSON response | `BW-04154817` |✅ likely|
|BY| id from the WFS service | `BY-SCHUL_SCHULSTANDORTEGRUNDSCHULEN_2acb7d31-915d-40a9-adcf-27b38251fa48` |❓ unlikely (although we reached out to ask for canonical IDs to be published)|
|BE| Field `bsn` (Berliner Schulnummer) from the WFS Service | `BE-02K10` |✅ likely|
@@ -33,7 +33,7 @@ In detail, the IDs are sourced as follows:
|NI| Field `schulnr` from the JSON in the details payload | `NI-67763` |✅ likely|
|NW| Column `Schulnummer` from the CSV | `NW-162437` |✅ likely|
|RP| `Schulnummer` from the school's details page | `RP-50720` |✅ likely|
-|SL| Phone number, email or name from the item, depending on what is available first | `SL-gem-kleinblittersdorfATschule.saarland` |❌ unlikely (e.g. if the name was used before and a phone number is later added to the school, the phone number will be used instead)|
+|SL| `OBJECTID` from the WFS service | `SL-255` |❓ unlikely|
|SN| Field `id` from the API | `SN-4062` |✅ likely|
|ST| `ID` query param from the details page URL | `ST-1001186` |❓ probably?|
|TH| `Schulnummer` from school list | `TH-10601` |✅ likely|
@@ -54,7 +54,7 @@ When available, we try to use the geolocations provided by the data publishers.
| NI | ❌ No | - |
| NW | ✅ Yes | Converted from EPSG:25832 in source CSV data |
| RP | ❌ No | - |
-| SL | ❌ No | - |
+| SL | ✅ Yes | WFS |
| SN | ✅ Yes | API |
| ST | ❌ No | - |
| TH | ❌ No | - |
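
To make the updated SL rows concrete, a tiny illustrative snippet follows. The values are invented; only the field names (`OBJECTID`, `lat`, `lon`) come from the WFS response handled by the spider below. The new id is simply `SL-` plus the feature's `OBJECTID`, and because the request asks for `srsname=EPSG:4326`, the coordinates can be used as latitude/longitude without reprojection.

# Hypothetical item as yielded by the Saarland spider; values are made up,
# only the field names come from the actual WFS response.
item = {
    "OBJECTID": "255",
    "SCHULNAME": "Beispielschule",
    "lat": "49.23",  # already WGS84 (EPSG:4326), no conversion needed
    "lon": "7.02",
}

school_id = f"SL-{item['OBJECTID']}"
print(school_id, item["lat"], item["lon"])  # SL-255 49.23 7.02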

jedeschule/spiders/saarland.py

Lines changed: 41 additions & 90 deletions
@@ -1,104 +1,55 @@
-from scrapy.spiders import Rule, CrawlSpider
-from scrapy.linkextractors import LinkExtractor
-from scrapy import Item, FormRequest, Request
+from scrapy import Item
+import xml.etree.ElementTree as ET

from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider

-# School types: Berufliche Schule, Erweitere Realschule, Förderschule, Freie Waldorfschule,
-# Gemeinschatsschule, Grundschule, Gymnasium, Lyzeum, Realschule, Studienseminare

-
-class SaarlandSpider(CrawlSpider, SchoolSpider):
+class SaarlandSpider(SchoolSpider):
    name = "saarland"
    start_urls = [
-        "https://www.saarland.de/mbk/DE/portale/bildungsserver/schulen-und-bildungswege/schuldatenbank"
+        "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
    ]

-    rules = (
-        Rule(
-            LinkExtractor(allow=(), restrict_xpaths=('//a[@class="forward button"]',)),
-            callback="parse_start_url",
-            follow=True,
-        ),
-    )
-
-    def parse_start_url(self, response):
-        yield FormRequest.from_response(
-            response, formname="searchSchool", callback=self.parse_page
-        )
-
-    def parse_page(self, response):
-        for school in self.parse_schools(response):
-            yield school
-        next_button = response.xpath(
-            '//a[@class="forward button"]/@href'
-        ).extract_first()
-        if next_button:
-            yield Request(next_button, callback=self.parse_page)
-
-    def parse_schools(self, response):
-        cards = response.xpath('//div[@class="c-teaser-card"]')
-
-        for card in cards:
-            school = {}
-            school["name"] = card.xpath(".//h3/text()").extract_first().strip()
-
-            badges = card.css(".c-badge")
-            school["schultyp"] = badges[0].css("::text").extract_first()
-            school["ort"] = badges[1].css("::text").extract_first()
-
-            address = card.xpath(".//p/text()").extract_first().split(", ")
-
-            school["straße"] = address[0]
-            school["plz"] = address[1].strip(" " + school["ort"])
-
-            keys = card.xpath(".//dt/text()").extract()
-            info = card.xpath(".//dd/text()").extract()
-
-            for index in range(0, len(keys)):
-                key = keys[index].strip(":").lower()
-
-                if key == "homepage":
-                    school["homepage"] = card.xpath(
-                        './/a[@target="_blank"]/text()'
-                    ).extract_first()
-
-                if key == "e-mail":
-                    school["e-mail"] = (
-                        card.xpath('.//a[contains(@title, "E-Mail senden an:")]/@href')
-                        .extract_first()
-                        .strip("mailto:")
-                    )
-
-                if key != "homepage" and key != "e-mail":
-                    school[key] = info[index]
-
-            yield school
-
-    @staticmethod
-    def get_id(item: Item) -> str:
-        # There are no IDs on the page that we could use.
-        # We will fall back to phone number, e-mail or name
-        # in the worst case
-        if tel := item.get("telefon"):
-            return tel.replace(" ", "-")
-        if email := item.get("e-mail"):
-            return email.replace("@", "AT")
-        return item.get("name")
+    def parse(self, response):
+        tree = ET.fromstring(response.body)
+
+        namespaces = {
+            "gml": "http://www.opengis.net/gml/3.2",
+            "SD": "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer",
+        }
+
+        for school in tree.iter(
+            "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Schulen_SL"
+        ):
+            data_elem = {}
+            for entry in school:
+                if (
+                    entry.tag
+                    == "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Shape"
+                ):
+                    # This nested entry contains the coordinates that we would like to expand
+                    lat, lon = entry.findtext(
+                        "gml:Point/gml:pos", namespaces=namespaces
+                    ).split(" ")
+                    data_elem["lat"] = lat
+                    data_elem["lon"] = lon
+                    continue
+                # strip the namespace before returning
+                data_elem[entry.tag.split("}", 1)[1]] = entry.text
+            yield data_elem

    @staticmethod
    def normalize(item: Item) -> School:
+        # The data also contains a field called `SCHULKENNZ` which implies that it might be an id
+        # that could be used, but some schools share ids (especially `0` or `000000`) which makes for collisions
+        id = item.get("OBJECTID")
+
        return School(
-            name=item.get("name"),
-            phone=item.get("telefon"),
-            fax=item.get("telefax"),
-            website=item.get("homepage"),
-            email=item.get("e-mail"),
-            address=item.get("straße"),
-            city=item.get("ort"),
-            zip=item.get("plz"),
-            school_type=item.get("schultyp"),
-            director=item.get("schulleitung"),
-            id="SL-{}".format(SaarlandSpider.get_id(item)),
+            name=item.get("SCHULNAME"),
+            address=" ".join([item.get(part) for part in ["HNR", "STR_NAME"]]),
+            city=item.get("ORT_NAME"),
+            zip=item.get("PLZ"),
+            school_type=item.get("SCHULFORM"),
+            id=f"SL-{id}",
        )
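
To show the new parse()/normalize() flow end to end, the sketch below builds a single hand-written feature member and runs the same extraction logic over it. The element names and namespaces match the spider above; the school name, address, and coordinates are invented, and the real GetFeature response wraps many such members in a WFS FeatureCollection.

import xml.etree.ElementTree as ET

SD = "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer"
GML = "http://www.opengis.net/gml/3.2"

# One hand-written feature member; values are invented, element names match the spider
member = ET.fromstring(f"""
<SD:Schulen_SL xmlns:SD="{SD}" xmlns:gml="{GML}">
  <SD:OBJECTID>255</SD:OBJECTID>
  <SD:SCHULNAME>Beispielschule</SD:SCHULNAME>
  <SD:SCHULFORM>Grundschule</SD:SCHULFORM>
  <SD:STR_NAME>Musterstraße</SD:STR_NAME>
  <SD:HNR>1</SD:HNR>
  <SD:PLZ>66111</SD:PLZ>
  <SD:ORT_NAME>Saarbrücken</SD:ORT_NAME>
  <SD:Shape>
    <gml:Point><gml:pos>49.23 7.02</gml:pos></gml:Point>
  </SD:Shape>
</SD:Schulen_SL>
""")

# Same extraction logic as the spider's inner loop, applied to this one member
data = {}
for entry in member:
    if entry.tag == f"{{{SD}}}Shape":
        lat, lon = entry.findtext("gml:Point/gml:pos", namespaces={"gml": GML}).split(" ")
        data["lat"], data["lon"] = lat, lon
        continue
    data[entry.tag.split("}", 1)[1]] = entry.text

print(data["SCHULNAME"], data["lat"], data["lon"])  # Beispielschule 49.23 7.02

The resulting dict is what normalize() receives, so item.get("SCHULNAME"), item.get("PLZ"), and the other lookups resolve against these keys, and the id becomes SL-255.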
