Skip to content

Commit 0fabd2c

Browse files
authored
Merge pull request #184 from SimonMand/rework_bayern_spider
Reworked the bayern parser
2 parents 91ea304 + d55a8a0 commit 0fabd2c

File tree

2 files changed

+88
-33
lines changed

2 files changed

+88
-33
lines changed

jedeschule/spiders/bayern.py

Lines changed: 36 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
import xml.etree.ElementTree as ET
2-
import scrapy
1+
import xmltodict
32
from scrapy import Item
43

54
from jedeschule.items import School
@@ -9,40 +8,44 @@
98
class BayernSpider(SchoolSpider):
109
name = "bayern"
1110
start_urls = [
12-
"https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetCapabilities"
11+
"https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?"
12+
"SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename="
13+
"schul:SchulstandorteGrundschulen,"
14+
"schul:SchulstandorteMittelschulen,"
15+
"schul:SchulstandorteRealschulen,"
16+
"schul:SchulstandorteGymnasien,"
17+
"schul:SchulstandorteBeruflicheSchulen,"
18+
"schul:SchulstandorteFoerderzentren,"
19+
"schul:SchulstandorteWeitererSchulen"
1320
]
1421

1522
def parse(self, response, **kwargs):
    """Yield one raw record per school feature of a WFS ``GetFeature`` response.

    The XML body is converted with ``xmltodict``. Each ``wfs:member``
    wraps one feature element (for example
    ``schul:SchulstandorteGymnasien``) whose children are the school's
    properties.

    Args:
        response: Response whose ``text`` holds the FeatureCollection XML.
        **kwargs: Accepted for signature compatibility; not used.

    Yields:
        dict: ``id`` (the feature's ``gml:id`` or ``None``), ``lat`` and
        ``lon`` as floats when a position is present, and every other
        property keyed by its tag name without the namespace prefix.

    Raises:
        ValueError: If a position is present but its ordinates are not
            numeric.
    """
    data = xmltodict.parse(response.text)

    # xmltodict maps empty elements to None, so a ``.get`` default alone
    # is not enough; ``or`` also covers the "present but empty" case.
    collection = data.get("wfs:FeatureCollection") or {}
    members = collection.get("wfs:member") or []

    # A single child element is not wrapped in a list by xmltodict.
    if not isinstance(members, list):
        members = [members]

    for member in members:
        if not isinstance(member, dict):
            # Empty ``<wfs:member/>``: nothing to extract.
            continue

        # The member's only element child is the feature itself; skip
        # attribute ("@...") and text ("#text") entries when looking for it.
        school = next(
            (
                child
                for tag, child in member.items()
                if not tag.startswith(("@", "#"))
            ),
            None,
        )
        if not isinstance(school, dict):
            continue

        data_elem = {"id": school.get("@gml:id")}

        for key, value in school.items():
            if key == "schul:geometry":
                geometry = value if isinstance(value, dict) else {}
                point = geometry.get("gml:Point")
                pos = point.get("gml:pos") if isinstance(point, dict) else None
                if isinstance(pos, dict):
                    # ``gml:pos`` with attributes (e.g. srsDimension) is
                    # returned as a dict with the text under "#text".
                    pos = pos.get("#text")

                ordinates = pos.split() if isinstance(pos, str) else []
                if len(ordinates) >= 2:
                    # NOTE(review): the service appears to return "lon lat"
                    # for srsname=EPSG:4326 -- confirm against live data.
                    lon, lat = ordinates[:2]
                    data_elem["lat"] = float(lat)
                    data_elem["lon"] = float(lon)
            elif not key.startswith("@"):
                # Drop the namespace prefix ("schul:ort" -> "ort").
                clean_key = key.split(":", 1)[-1]
                data_elem[clean_key] = value

        yield data_elem
4750

4851
@staticmethod

test/test_bayern.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import unittest
2+
3+
from scrapy.http import TextResponse
4+
5+
from jedeschule.spiders.bayern import BayernSpider
6+
7+
8+
class TestBayernSpider(unittest.TestCase):
    """Checks that ``BayernSpider.parse`` flattens a WFS feature into a record."""

    def test_parse(self):
        """One ``wfs:member`` yields one record with id, address and coordinates."""
        xml_response = """<?xml version='1.0' encoding='UTF-8'?>
<wfs:FeatureCollection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                       xsi:schemaLocation="http://www.opengis.net/wfs/2.0"
                       xmlns:wfs="http://www.opengis.net/wfs/2.0" timeStamp="2025-08-11T09:35:15Z"
                       xmlns:gml="http://www.opengis.net/gml/3.2" numberMatched="unknown" numberReturned="0">
    <wfs:member>
        <schul:SchulstandorteFoerderzentren xmlns:schul="http://gdi.bayern/brbschul"
                                            gml:id="SCHUL_SCHULSTANDORTEFOERDERZENTREN_3721b800-751d-49a1-a6d2-19d237e7bcc8">
            <schul:schulname>Bayerische Landesschule</schul:schulname>
            <schul:strasse>Kurzstr. 2</schul:strasse>
            <schul:postleitzahl>81547</schul:postleitzahl>
            <schul:ort>München</schul:ort>
            <schul:schulart>Förderzentren</schul:schulart>
            <schul:geometry>
                <gml:Point
                        gml:id="SCHUL_SCHULSTANDORTEFOERDERZENTREN_3721b800-751d-49a1-a6d2-19d237e7bcc8_SCHUL_GEOMETRY"
                        srsName="EPSG:4326">
                    <gml:pos>11.5686076923 48.1047906989</gml:pos>
                </gml:Point>
            </schul:geometry>
        </schul:SchulstandorteFoerderzentren>
    </wfs:member>
</wfs:FeatureCollection>
"""

        spider = BayernSpider()
        response = TextResponse(url="https://test.com", body=xml_response, encoding="utf-8")
        schools = list(spider.parse(response))
        self.assertEqual(len(schools), 1)

        school = schools[0]

        expected = {
            # The id comes from the feature's gml:id attribute, not a child tag.
            "id": "SCHUL_SCHULSTANDORTEFOERDERZENTREN_3721b800-751d-49a1-a6d2-19d237e7bcc8",
            "schulname": "Bayerische Landesschule",
            "strasse": "Kurzstr. 2",
            "postleitzahl": "81547",
            "ort": "München",
            "schulart": "Förderzentren",
            "lon": 11.5686076923,
            "lat": 48.1047906989,
        }
        for field, value in expected.items():
            with self.subTest(field=field):
                self.assertEqual(school[field], value)

        # XML attributes such as the xmlns declaration must not leak into the record.
        self.assertEqual([key for key in school if key.startswith("@")], [])
49+
50+
51+
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
52+
unittest.main()

0 commit comments

Comments
 (0)