Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 34 additions & 30 deletions jedeschule/spiders/saarland.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import xmltodict
from scrapy import Item
import xml.etree.ElementTree as ET

from jedeschule.items import School
from jedeschule.spiders.school_spider import SchoolSpider
Expand All @@ -8,48 +8,52 @@
class SaarlandSpider(SchoolSpider):
name = "saarland"
start_urls = [
"https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
"https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?"
"SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
]

def parse(self, response):
tree = ET.fromstring(response.body)
def parse(self, response, **kwargs):
data = xmltodict.parse(response.text)
members = data.get("wfs:FeatureCollection", {}).get("wfs:member", [])

namespaces = {
"gml": "http://www.opengis.net/gml/3.2",
"SD": "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer",
}
if not isinstance(members, list):
members = [members]

for school in tree.iter(
"{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Schulen_SL"
):
for member in members:
school = member.get("Staatliche_Dienste:Schulen_SL", {})
data_elem = {}
for entry in school:
if (
entry.tag
== "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Shape"
):
# This nested entry contains the coordinates that we would like to expand
lat, lon = entry.findtext(
"gml:Point/gml:pos", namespaces=namespaces
).split(" ")
data_elem["lat"] = lat
data_elem["lon"] = lon

for key, value in school.items():
if key == "Staatliche_Dienste:SHAPE":
pos = (value.get("gml:Point", {})
.get("gml:pos", "")
.strip())
if pos:
lat, lon = pos.split()
data_elem["lat"] = float(lat)
data_elem["lon"] = float(lon)

continue
# strip the namespace before returning
data_elem[entry.tag.split("}", 1)[1]] = entry.text

clean_key = key.split(":")[-1]
if clean_key == "PLZ":
value = value.split(".")[0]

data_elem[clean_key] = value

yield data_elem

@staticmethod
def normalize(item: Item) -> School:
# The data also contains a field called `SCHULKENNZ` which implies that it might be an id
# that could be used, but some schools share ids (especially `0` or `000000`) which makes for collisions
id = item.get("OBJECTID")
school_id = item.get("OBJECTID")

return School(
name=item.get("SCHULNAME"),
address=" ".join([item.get(part) for part in ["HNR", "STR_NAME"]]),
city=item.get("ORT_NAME"),
name=item.get("Bezeichnun"),
address=item.get("Straße", "").strip(),
city=item.get("Ort"),
zip=item.get("PLZ"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like the configuration of their WFS server is now wrong and returns the PLZ as a float where actually it should be a string (not even an integer since PLZs are always 5 digits in Germany and might start with zero). Can you parse to an integer here and pad with zeros (padding is not strictly necessary since the 0-leading zips are all in Sachsen afaik), please? 🙏

school_type=item.get("SCHULFORM"),
id=f"SL-{id}",
school_type=item.get("Schulform"),
id=f"SL-{school_id}",
)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ dependencies = [
"requests==2.32.4",
"scrapy==2.13.3",
"sqlalchemy==1.3.10",
"xmltodict==0.13.0",
]
70 changes: 70 additions & 0 deletions test/test_saarland.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import unittest

from scrapy.http import TextResponse

from jedeschule.spiders.saarland import SaarlandSpider


class TestSaarlandSpider(unittest.TestCase):
    """Checks ``SaarlandSpider.parse`` against a canned WFS FeatureCollection."""

    def test_parse(self):
        """Parse a single-member response and verify every yielded field.

        The fixture contains exactly one ``wfs:member``; the spider is expected
        to yield one dict whose keys are the element names without the
        ``Staatliche_Dienste:`` prefix, with ``PLZ`` cut at the decimal point
        and the ``gml:pos`` coordinates exposed as float ``lat``/``lon``.
        """
        xml_response = """<?xml version="1.0" encoding="utf-8" ?>
<wfs:FeatureCollection xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:wfs="http://www.opengis.net/wfs/2.0" xmlns:gml="http://www.opengis.net/gml/3.2" xmlns:Staatliche_Dienste="https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" timeStamp="2025-07-20T17:40:21Z" numberMatched="317" numberReturned="1" xsi:schemaLocation="http://www.opengis.net/wfs/2.0 http://schemas.opengis.net/wfs/2.0/wfs.xsd http://www.opengis.net/gml/3.2 http://schemas.opengis.net/gml/3.2.1/gml.xsd https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?service=wfs%26version=2.0.0%26request=DescribeFeatureType">
<wfs:member>
<Staatliche_Dienste:Schulen_SL gml:id="Schulen_SL.1">
<Staatliche_Dienste:SHAPE>
<gml:Point gml:id="Schulen_SL.1.pn.0" srsName="urn:ogc:def:crs:EPSG::4326">
<gml:pos>49.24067452 7.02085050</gml:pos>
</gml:Point>
</Staatliche_Dienste:SHAPE>
<Staatliche_Dienste:OBJECTID>1</Staatliche_Dienste:OBJECTID>
<Staatliche_Dienste:fid>1.00000000</Staatliche_Dienste:fid>
<Staatliche_Dienste:Gemeindenu>1100.00000000</Staatliche_Dienste:Gemeindenu>
<Staatliche_Dienste:PLZ>66123.00000000</Staatliche_Dienste:PLZ>
<Staatliche_Dienste:Ort>Saarbrücken</Staatliche_Dienste:Ort>
<Staatliche_Dienste:Straße>Kohlweg 7</Staatliche_Dienste:Straße>
<Staatliche_Dienste:Bezeichnun>Deutsch-Französiche Hochschule, Université franco-allemande</Staatliche_Dienste:Bezeichnun>
<Staatliche_Dienste:Telefon>0681-93812100</Staatliche_Dienste:Telefon>
<Staatliche_Dienste:Fax>0681-93812111</Staatliche_Dienste:Fax>
<Staatliche_Dienste:Email>[email protected]</Staatliche_Dienste:Email>
<Staatliche_Dienste:Schulform>Hochschule</Staatliche_Dienste:Schulform>
<Staatliche_Dienste:Homepage>https://www.dfh-ufa.org/</Staatliche_Dienste:Homepage>
<Staatliche_Dienste:Schulregio>Saarbrücken</Staatliche_Dienste:Schulregio>
<Staatliche_Dienste:KARTENERST>Hochschule</Staatliche_Dienste:KARTENERST>
<Staatliche_Dienste:Rechtswert>355942.97630000</Staatliche_Dienste:Rechtswert>
<Staatliche_Dienste:Hochwert>5456095.93600000</Staatliche_Dienste:Hochwert>
<Staatliche_Dienste:Aktualisie>20.05.2025</Staatliche_Dienste:Aktualisie>
</Staatliche_Dienste:Schulen_SL>
</wfs:member>
</wfs:FeatureCollection>
"""

        # Exact string values expected for each scraped attribute, listed in
        # the order they are checked.
        expected_text_fields = {
            "OBJECTID": "1",
            "fid": "1.00000000",
            "Gemeindenu": "1100.00000000",
            "PLZ": "66123",
            "Ort": "Saarbrücken",
            "Straße": "Kohlweg 7",
            "Bezeichnun": "Deutsch-Französiche Hochschule, Université franco-allemande",
            "Telefon": "0681-93812100",
            "Fax": "0681-93812111",
            "Email": "[email protected]",
            "Schulform": "Hochschule",
            "Homepage": "https://www.dfh-ufa.org/",
            "Schulregio": "Saarbrücken",
            "KARTENERST": "Hochschule",
            "Rechtswert": "355942.97630000",
            "Hochwert": "5456095.93600000",
            "Aktualisie": "20.05.2025",
        }

        saarland_spider = SaarlandSpider()
        fake_response = TextResponse(url="https://test.com", body=xml_response, encoding="utf-8")
        parsed_items = list(saarland_spider.parse(fake_response))
        self.assertEqual(len(parsed_items), 1)

        first_item = parsed_items[0]

        for field_name, expected_value in expected_text_fields.items():
            self.assertEqual(first_item[field_name], expected_value)

        # Coordinates are floats, so compare with a tolerance.
        self.assertAlmostEqual(first_item["lat"], 49.24067452)
        self.assertAlmostEqual(first_item["lon"], 7.02085050)


# Run the test case when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
2 changes: 1 addition & 1 deletion test_changes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ set -euxo pipefail
git fetch origin main

if [ "${CI:-}" = "true" ]; then
HEAD_REF="${GITHUB_SHA}"
HEAD_REF="${GITHUB_SHA:-$(git rev-parse HEAD)}"
else
HEAD_REF="HEAD"
fi
Expand Down
11 changes: 11 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.