diff --git a/jedeschule/spiders/bayern.py b/jedeschule/spiders/bayern.py
index bad6241..cc27d51 100644
--- a/jedeschule/spiders/bayern.py
+++ b/jedeschule/spiders/bayern.py
@@ -1,5 +1,4 @@
-import xml.etree.ElementTree as ET
-import scrapy
+import xmltodict
 from scrapy import Item
 
 from jedeschule.items import School
@@ -9,40 +8,44 @@ class BayernSpider(SchoolSpider):
     name = "bayern"
 
     start_urls = [
-        "https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetCapabilities"
+        "https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?"
+        "SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename="
+        "schul:SchulstandorteGrundschulen,"
+        "schul:SchulstandorteMittelschulen,"
+        "schul:SchulstandorteRealschulen,"
+        "schul:SchulstandorteGymnasien,"
+        "schul:SchulstandorteBeruflicheSchulen,"
+        "schul:SchulstandorteFoerderzentren,"
+        "schul:SchulstandorteWeitererSchulen"
     ]
 
     def parse(self, response, **kwargs):
-        tree = ET.fromstring(response.body)
-        base_url = "https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename="
-        for feature_type in tree.iter("{http://www.opengis.net/wfs/2.0}FeatureType"):
-            feature = feature_type.findtext("{http://www.opengis.net/wfs/2.0}Title")
-            yield scrapy.Request(
-                f"{base_url}{feature}",
-                callback=self.parse_resource,
-                cb_kwargs={"feature": feature},
-            )
-
-    def parse_resource(self, response, feature):
-        tree = ET.fromstring(response.body)
-        namespaces = {
-            "gml": "http://www.opengis.net/gml/3.2",
-            "schul": "http://gdi.bayern/brbschul",
-        }
-        key = "{http://gdi.bayern/brbschul}" + feature
-        for school in tree.iter(key):
-            data_elem = {"id": school.attrib["{http://www.opengis.net/gml/3.2}id"]}
-
-            for entry in school:
-                if entry.tag == "{http://gdi.bayern/brbschul}geometry":
-                    lon, lat = entry.findtext(
-                        "gml:Point/gml:pos", namespaces=namespaces
-                    ).split(" ")
-                    data_elem["lat"] = lat
-                    data_elem["lon"] = lon
-                    continue
-                # strip the namespace before returning
-                data_elem[entry.tag.split("}", 1)[1]] = entry.text
+        data = xmltodict.parse(response.text)
+        members = data.get("wfs:FeatureCollection", {}).get("wfs:member", [])
+
+        if not isinstance(members, list):
+            members = [members]
+
+        for member in members:
+            # Each member is a dict with one key = school tag, value = school data dict
+            school = next(iter(member.values()), {})
+
+            data_elem = {
+                "id": school.get("@gml:id")
+            }
+
+            for key, value in school.items():
+                if key == "schul:geometry":
+                    point = value.get("gml:Point", {})
+                    pos = point.get("gml:pos", "")
+                    if pos:
+                        lon, lat = pos.split()
+                        data_elem["lat"] = float(lat)
+                        data_elem["lon"] = float(lon)
+                elif not key.startswith("@"):
+                    clean_key = key.split(":", 1)[-1]
+                    data_elem[clean_key] = value
+
             yield data_elem
 
     @staticmethod
diff --git a/test/test_bayern.py b/test/test_bayern.py
new file mode 100644
index 0000000..dcd101a
--- /dev/null
+++ b/test/test_bayern.py
@@ -0,0 +1,52 @@
+import unittest
+
+from scrapy.http import TextResponse
+
+from jedeschule.spiders.bayern import BayernSpider
+
+
+class TestBayernSpider(unittest.TestCase):
+    def test_parse(self):
+        xml_response = """
+        <wfs:FeatureCollection xmlns:wfs="http://www.opengis.net/wfs/2.0"
+                               xmlns:gml="http://www.opengis.net/gml/3.2"
+                               xmlns:schul="http://gdi.bayern/brbschul">
+            <wfs:member>
+                <schul:SchulstandorteFoerderzentren gml:id="SchulstandorteFoerderzentren.1">
+                    <schul:schulname>Bayerische Landesschule</schul:schulname>
+                    <schul:strasse>Kurzstr. 2</schul:strasse>
+                    <schul:postleitzahl>81547</schul:postleitzahl>
+                    <schul:ort>München</schul:ort>
+                    <schul:schulart>Förderzentren</schul:schulart>
+                    <schul:geometry>
+                        <gml:Point srsName="EPSG:4326">
+                            <gml:pos>11.5686076923 48.1047906989</gml:pos>
+                        </gml:Point>
+                    </schul:geometry>
+                </schul:SchulstandorteFoerderzentren>
+            </wfs:member>
+        </wfs:FeatureCollection>
+        """
+
+        spider = BayernSpider()
+        response = TextResponse(url="https://test.com", body=xml_response, encoding="utf-8")
+        schools = list(spider.parse(response))
+        self.assertEqual(len(schools), 1)
+
+        school = schools[0]
+
+        self.assertEqual(school["schulname"], "Bayerische Landesschule")
+        self.assertEqual(school["strasse"], "Kurzstr. 2")
+        self.assertEqual(school["postleitzahl"], "81547")
+        self.assertEqual(school["ort"], "München")
+        self.assertEqual(school["schulart"], "Förderzentren")
+        self.assertEqual(school["lon"], 11.5686076923)
+        self.assertEqual(school["lat"], 48.1047906989)
+
+
+if __name__ == "__main__":
+    unittest.main()
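
Note on the list normalisation in parse(): xmltodict returns a single dict when a child element occurs once and a list of dicts when it repeats, which is why the spider wraps a lone wfs:member in a list. A minimal sketch of that behaviour (the element names below are invented for the example, not taken from the WFS response):

    import xmltodict

    # One <member> child: xmltodict hands back a single mapping for it.
    single = xmltodict.parse("<coll><member><name>A</name></member></coll>")
    assert not isinstance(single["coll"]["member"], list)

    # Two <member> children: the same key now holds a list of mappings.
    multiple = xmltodict.parse(
        "<coll><member><name>A</name></member><member><name>B</name></member></coll>"
    )
    assert isinstance(multiple["coll"]["member"], list)

    # The spider therefore normalises before iterating:
    members = single["coll"]["member"]
    if not isinstance(members, list):
        members = [members]
    for member in members:
        print(member["name"])  # -> A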