Skip to content

Commit 4509671

Browse files
authored
Merge pull request #183 from SimonMand/rework_saarland_spider
Rework Saarland parser
2 parents 2721cd8 + ae278fd commit 4509671

File tree

5 files changed: +117 −31 lines changed

jedeschule/spiders/saarland.py

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1+
import xmltodict
12
from scrapy import Item
2-
import xml.etree.ElementTree as ET
33

44
from jedeschule.items import School
55
from jedeschule.spiders.school_spider import SchoolSpider
@@ -8,48 +8,52 @@
88
class SaarlandSpider(SchoolSpider):
99
name = "saarland"
1010
start_urls = [
11-
"https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
11+
"https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?"
12+
"SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
1213
]
1314

14-
def parse(self, response):
15-
tree = ET.fromstring(response.body)
15+
def parse(self, response, **kwargs):
16+
data = xmltodict.parse(response.text)
17+
members = data.get("wfs:FeatureCollection", {}).get("wfs:member", [])
1618

17-
namespaces = {
18-
"gml": "http://www.opengis.net/gml/3.2",
19-
"SD": "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer",
20-
}
19+
if not isinstance(members, list):
20+
members = [members]
2121

22-
for school in tree.iter(
23-
"{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Schulen_SL"
24-
):
22+
for member in members:
23+
school = member.get("Staatliche_Dienste:Schulen_SL", {})
2524
data_elem = {}
26-
for entry in school:
27-
if (
28-
entry.tag
29-
== "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Shape"
30-
):
31-
# This nested entry contains the coordinates that we would like to expand
32-
lat, lon = entry.findtext(
33-
"gml:Point/gml:pos", namespaces=namespaces
34-
).split(" ")
35-
data_elem["lat"] = lat
36-
data_elem["lon"] = lon
25+
26+
for key, value in school.items():
27+
if key == "Staatliche_Dienste:SHAPE":
28+
pos = (value.get("gml:Point", {})
29+
.get("gml:pos", "")
30+
.strip())
31+
if pos:
32+
lat, lon = pos.split()
33+
data_elem["lat"] = float(lat)
34+
data_elem["lon"] = float(lon)
35+
3736
continue
38-
# strip the namespace before returning
39-
data_elem[entry.tag.split("}", 1)[1]] = entry.text
37+
38+
clean_key = key.split(":")[-1]
39+
if clean_key == "PLZ":
40+
value = value.split(".")[0]
41+
42+
data_elem[clean_key] = value
43+
4044
yield data_elem
4145

4246
@staticmethod
4347
def normalize(item: Item) -> School:
4448
# The data also contains a field called `SCHULKENNZ` which implies that it might be an id
4549
# that could be used, but some schools share ids (especially `0` or `000000`) which makes for collisions
46-
id = item.get("OBJECTID")
50+
school_id = item.get("OBJECTID")
4751

4852
return School(
49-
name=item.get("SCHULNAME"),
50-
address=" ".join([item.get(part) for part in ["HNR", "STR_NAME"]]),
51-
city=item.get("ORT_NAME"),
53+
name=item.get("Bezeichnun"),
54+
address=item.get("Straße", "").strip(),
55+
city=item.get("Ort"),
5256
zip=item.get("PLZ"),
53-
school_type=item.get("SCHULFORM"),
54-
id=f"SL-{id}",
57+
school_type=item.get("Schulform"),
58+
id=f"SL-{school_id}",
5559
)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@ dependencies = [
1313
"requests==2.32.4",
1414
"scrapy==2.13.3",
1515
"sqlalchemy==1.3.10",
16+
"xmltodict==0.13.0",
1617
]

test/test_saarland.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import unittest
2+
3+
from scrapy.http import TextResponse
4+
5+
from jedeschule.spiders.saarland import SaarlandSpider
6+
7+
8+
class TestSaarlandSpider(unittest.TestCase):
9+
def test_parse(self):
10+
xml_response = """<?xml version="1.0" encoding="utf-8" ?>
11+
<wfs:FeatureCollection xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:wfs="http://www.opengis.net/wfs/2.0" xmlns:gml="http://www.opengis.net/gml/3.2" xmlns:Staatliche_Dienste="https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" timeStamp="2025-07-20T17:40:21Z" numberMatched="317" numberReturned="1" xsi:schemaLocation="http://www.opengis.net/wfs/2.0 http://schemas.opengis.net/wfs/2.0/wfs.xsd http://www.opengis.net/gml/3.2 http://schemas.opengis.net/gml/3.2.1/gml.xsd https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?service=wfs%26version=2.0.0%26request=DescribeFeatureType">
12+
<wfs:member>
13+
<Staatliche_Dienste:Schulen_SL gml:id="Schulen_SL.1">
14+
<Staatliche_Dienste:SHAPE>
15+
<gml:Point gml:id="Schulen_SL.1.pn.0" srsName="urn:ogc:def:crs:EPSG::4326">
16+
<gml:pos>49.24067452 7.02085050</gml:pos>
17+
</gml:Point>
18+
</Staatliche_Dienste:SHAPE>
19+
<Staatliche_Dienste:OBJECTID>1</Staatliche_Dienste:OBJECTID>
20+
<Staatliche_Dienste:fid>1.00000000</Staatliche_Dienste:fid>
21+
<Staatliche_Dienste:Gemeindenu>1100.00000000</Staatliche_Dienste:Gemeindenu>
22+
<Staatliche_Dienste:PLZ>66123.00000000</Staatliche_Dienste:PLZ>
23+
<Staatliche_Dienste:Ort>Saarbrücken</Staatliche_Dienste:Ort>
24+
<Staatliche_Dienste:Straße>Kohlweg 7</Staatliche_Dienste:Straße>
25+
<Staatliche_Dienste:Bezeichnun>Deutsch-Französiche Hochschule, Université franco-allemande</Staatliche_Dienste:Bezeichnun>
26+
<Staatliche_Dienste:Telefon>0681-93812100</Staatliche_Dienste:Telefon>
27+
<Staatliche_Dienste:Fax>0681-93812111</Staatliche_Dienste:Fax>
28+
<Staatliche_Dienste:Email>info@dfh-ufa.org</Staatliche_Dienste:Email>
29+
<Staatliche_Dienste:Schulform>Hochschule</Staatliche_Dienste:Schulform>
30+
<Staatliche_Dienste:Homepage>https://www.dfh-ufa.org/</Staatliche_Dienste:Homepage>
31+
<Staatliche_Dienste:Schulregio>Saarbrücken</Staatliche_Dienste:Schulregio>
32+
<Staatliche_Dienste:KARTENERST>Hochschule</Staatliche_Dienste:KARTENERST>
33+
<Staatliche_Dienste:Rechtswert>355942.97630000</Staatliche_Dienste:Rechtswert>
34+
<Staatliche_Dienste:Hochwert>5456095.93600000</Staatliche_Dienste:Hochwert>
35+
<Staatliche_Dienste:Aktualisie>20.05.2025</Staatliche_Dienste:Aktualisie>
36+
</Staatliche_Dienste:Schulen_SL>
37+
</wfs:member>
38+
</wfs:FeatureCollection>
39+
"""
40+
41+
spider = SaarlandSpider()
42+
response = TextResponse(url="https://test.com", body=xml_response, encoding="utf-8")
43+
schools = list(spider.parse(response))
44+
self.assertEqual(len(schools), 1)
45+
46+
school = schools[0]
47+
48+
self.assertEqual(school["OBJECTID"], "1")
49+
self.assertEqual(school["fid"], "1.00000000")
50+
self.assertEqual(school["Gemeindenu"], "1100.00000000")
51+
self.assertEqual(school["PLZ"], "66123")
52+
self.assertEqual(school["Ort"], "Saarbrücken")
53+
self.assertEqual(school["Straße"], "Kohlweg 7")
54+
self.assertEqual(school["Bezeichnun"], "Deutsch-Französiche Hochschule, Université franco-allemande")
55+
self.assertEqual(school["Telefon"], "0681-93812100")
56+
self.assertEqual(school["Fax"], "0681-93812111")
57+
self.assertEqual(school["Email"], "info@dfh-ufa.org")
58+
self.assertEqual(school["Schulform"], "Hochschule")
59+
self.assertEqual(school["Homepage"], "https://www.dfh-ufa.org/")
60+
self.assertEqual(school["Schulregio"], "Saarbrücken")
61+
self.assertEqual(school["KARTENERST"], "Hochschule")
62+
self.assertEqual(school["Rechtswert"], "355942.97630000")
63+
self.assertEqual(school["Hochwert"], "5456095.93600000")
64+
self.assertEqual(school["Aktualisie"], "20.05.2025")
65+
self.assertAlmostEqual(school["lat"], 49.24067452)
66+
self.assertAlmostEqual(school["lon"], 7.02085050)
67+
68+
69+
if __name__ == "__main__":
70+
unittest.main()

test_changes.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ set -euxo pipefail
55
git fetch origin main
66

77
if [ "${CI:-}" = "true" ]; then
8-
HEAD_REF="${GITHUB_SHA}"
8+
HEAD_REF="${GITHUB_SHA:-$(git rev-parse HEAD)}"
99
else
1010
HEAD_REF="HEAD"
1111
fi

uv.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)