From 05cffd76e72ea42d2c91129bf9201df9194dc71a Mon Sep 17 00:00:00 2001 From: simonmand Date: Mon, 11 Aug 2025 11:26:38 +0200 Subject: [PATCH 1/3] Reworked the parse method --- jedeschule/spiders/bayern.py | 69 +++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/jedeschule/spiders/bayern.py b/jedeschule/spiders/bayern.py index bad6241..a3b8fb5 100644 --- a/jedeschule/spiders/bayern.py +++ b/jedeschule/spiders/bayern.py @@ -1,5 +1,4 @@ -import xml.etree.ElementTree as ET -import scrapy +import xmltodict from scrapy import Item from jedeschule.items import School @@ -9,40 +8,44 @@ class BayernSpider(SchoolSpider): name = "bayern" start_urls = [ - "https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetCapabilities" + "https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?" + "SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename=" + "schul:SchulstandorteGrundschulen," + "schul:SchulstandorteMittelschulen," + "schul:SchulstandorteRealschulen," + "schul:SchulstandorteGymnasien," + "schul:SchulstandorteBeruflicheSchulen," + "schul:SchulstandorteFoerderzentren," + "schul:SchulstandorteWeitererSchulen" ] def parse(self, response, **kwargs): - tree = ET.fromstring(response.body) - base_url = "https://gdiserv.bayern.de/srv112940/services/schulstandortebayern-wfs?SERVICE=WFS&VERSION=2.0.0&REQUEST=GetFeature&srsname=EPSG:4326&typename=" - for feature_type in tree.iter("{http://www.opengis.net/wfs/2.0}FeatureType"): - feature = feature_type.findtext("{http://www.opengis.net/wfs/2.0}Title") - yield scrapy.Request( - f"{base_url}{feature}", - callback=self.parse_resource, - cb_kwargs={"feature": feature}, - ) - - def parse_resource(self, response, feature): - tree = ET.fromstring(response.body) - namespaces = { - "gml": "http://www.opengis.net/gml/3.2", - "schul": "http://gdi.bayern/brbschul", - } - key = "{http://gdi.bayern/brbschul}" + feature - for school in tree.iter(key): - data_elem = {"id": school.attrib["{http://www.opengis.net/gml/3.2}id"]} - - for entry in school: - if entry.tag == "{http://gdi.bayern/brbschul}geometry": - lon, lat = entry.findtext( - "gml:Point/gml:pos", namespaces=namespaces - ).split(" ") - data_elem["lat"] = lat - data_elem["lon"] = lon - continue - # strip the namespace before returning - data_elem[entry.tag.split("}", 1)[1]] = entry.text + data = xmltodict.parse(response.text) + members = data.get("wfs:FeatureCollection", {}).get("wfs:member", []) + + if not isinstance(members, list): + members = [members] + + for member in members: + # Each member is a dict with one key = school tag, value = school data dict + school = next(iter(member.values()), {}) + + data_elem = { + "id": school.get("@gml:id") + } + + for key, value in school.items(): + if key == "schul:geometry": + point = value.get("gml:Point", {}) + pos = point.get("gml:pos", "") + if pos: + lon, lat = pos.split() + data_elem["lat"] = lat + data_elem["lon"] = lon + elif not key.startswith("@"): + clean_key = key.split(":", 1)[-1] + data_elem[clean_key] = value + yield data_elem @staticmethod From bad83f561856029892e57ceec2a7c2c7d8bc85a3 Mon Sep 17 00:00:00 2001 From: simonmand Date: Mon, 11 Aug 2025 11:51:02 +0200 Subject: [PATCH 2/3] Added 1 test --- test/test_bayern.py | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 test/test_bayern.py diff --git a/test/test_bayern.py b/test/test_bayern.py new file mode 100644 index 0000000..4c2c038 --- /dev/null +++ b/test/test_bayern.py @@ -0,0 +1,52 @@ +import unittest + +from scrapy.http import TextResponse + +from jedeschule.spiders.bayern import BayernSpider + + +class TestBayernSpider(unittest.TestCase): + def test_parse(self): + xml_response = """ + + + + Bayerische Landesschule + Kurzstr. 2 + 81547 + München + Förderzentren + + + 11.5686076923 48.1047906989 + + + + + + """ + + spider = BayernSpider() + response = TextResponse(url="https://test.com", body=xml_response, encoding="utf-8") + schools = list(spider.parse(response)) + self.assertEqual(len(schools), 1) + + school = schools[0] + + self.assertEqual(school["schulname"], "Bayerische Landesschule") + self.assertEqual(school["strasse"], "Kurzstr. 2") + self.assertEqual(school["postleitzahl"], "81547") + self.assertEqual(school["ort"], "München") + self.assertEqual(school["schulart"], "Förderzentren") + self.assertEqual(school["lon"], "11.5686076923") + self.assertEqual(school["lat"], "48.1047906989") + + +if __name__ == "__main__": + unittest.main() From d55a8a0e9b3b85b2363131594eaaf102b54ec224 Mon Sep 17 00:00:00 2001 From: simonmand Date: Fri, 15 Aug 2025 16:03:51 +0200 Subject: [PATCH 3/3] Convert coordinates to float --- jedeschule/spiders/bayern.py | 4 ++-- test/test_bayern.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jedeschule/spiders/bayern.py b/jedeschule/spiders/bayern.py index a3b8fb5..cc27d51 100644 --- a/jedeschule/spiders/bayern.py +++ b/jedeschule/spiders/bayern.py @@ -40,8 +40,8 @@ def parse(self, response, **kwargs): pos = point.get("gml:pos", "") if pos: lon, lat = pos.split() - data_elem["lat"] = lat - data_elem["lon"] = lon + data_elem["lat"] = float(lat) + data_elem["lon"] = float(lon) elif not key.startswith("@"): clean_key = key.split(":", 1)[-1] data_elem[clean_key] = value diff --git a/test/test_bayern.py b/test/test_bayern.py index 4c2c038..dcd101a 100644 --- a/test/test_bayern.py +++ b/test/test_bayern.py @@ -44,8 +44,8 @@ def test_parse(self): self.assertEqual(school["postleitzahl"], "81547") self.assertEqual(school["ort"], "München") self.assertEqual(school["schulart"], "Förderzentren") - self.assertEqual(school["lon"], "11.5686076923") - self.assertEqual(school["lat"], "48.1047906989") + self.assertEqual(school["lon"], 11.5686076923) + self.assertEqual(school["lat"], 48.1047906989) if __name__ == "__main__":