diff --git a/jedeschule/spiders/saarland.py b/jedeschule/spiders/saarland.py
index b3ace2d..a59510e 100644
--- a/jedeschule/spiders/saarland.py
+++ b/jedeschule/spiders/saarland.py
@@ -1,5 +1,5 @@
+import xmltodict
 from scrapy import Item
-import xml.etree.ElementTree as ET
 
 from jedeschule.items import School
 from jedeschule.spiders.school_spider import SchoolSpider
@@ -8,48 +8,52 @@ class SaarlandSpider(SchoolSpider):
     name = "saarland"
 
     start_urls = [
-        "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
+        "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?"
+        "SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
     ]
 
-    def parse(self, response):
-        tree = ET.fromstring(response.body)
+    def parse(self, response, **kwargs):
+        data = xmltodict.parse(response.text)
+        members = data.get("wfs:FeatureCollection", {}).get("wfs:member", [])
 
-        namespaces = {
-            "gml": "http://www.opengis.net/gml/3.2",
-            "SD": "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer",
-        }
+        if not isinstance(members, list):
+            members = [members]
 
-        for school in tree.iter(
-            "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Schulen_SL"
-        ):
+        for member in members:
+            school = member.get("Staatliche_Dienste:Schulen_SL", {})
             data_elem = {}
-            for entry in school:
-                if (
-                    entry.tag
-                    == "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Shape"
-                ):
-                    # This nested entry contains the coordinates that we would like to expand
-                    lat, lon = entry.findtext(
-                        "gml:Point/gml:pos", namespaces=namespaces
-                    ).split(" ")
-                    data_elem["lat"] = lat
-                    data_elem["lon"] = lon
+
+            for key, value in school.items():
+                if key == "Staatliche_Dienste:SHAPE":
+                    pos = (value.get("gml:Point", {})
+                           .get("gml:pos", "")
+                           .strip())
+                    if pos:
+                        lat, lon = pos.split()
+                        data_elem["lat"] = float(lat)
+                        data_elem["lon"] = float(lon)
+
                     continue
-                # strip the namespace before returning
-                data_elem[entry.tag.split("}", 1)[1]] = entry.text
+
+                clean_key = key.split(":")[-1]
+                if clean_key == "PLZ":
+                    value = value.split(".")[0]
+
+                data_elem[clean_key] = value
+
             yield data_elem
 
     @staticmethod
     def normalize(item: Item) -> School:
         # The data also contains a field called `SCHULKENNZ` which implies that it might be an id
         # that could be used, but some schools share ids (especially `0` or `000000`) which makes for collisions
-        id = item.get("OBJECTID")
+        school_id = item.get("OBJECTID")
         return School(
-            name=item.get("SCHULNAME"),
-            address=" ".join([item.get(part) for part in ["HNR", "STR_NAME"]]),
-            city=item.get("ORT_NAME"),
+            name=item.get("Bezeichnun"),
+            address=item.get("Straße", "").strip(),
+            city=item.get("Ort"),
             zip=item.get("PLZ"),
-            school_type=item.get("SCHULFORM"),
-            id=f"SL-{id}",
+            school_type=item.get("Schulform"),
+            id=f"SL-{school_id}",
         )
diff --git a/pyproject.toml b/pyproject.toml
index 58baf6c..89e0b60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,4 +13,5 @@ dependencies = [
     "requests==2.32.4",
     "scrapy==2.13.3",
     "sqlalchemy==1.3.10",
+    "xmltodict==0.13.0",
 ]
diff --git a/test/test_saarland.py b/test/test_saarland.py
new file mode 100644
index 0000000..5a67f7f
--- /dev/null
+++ b/test/test_saarland.py
@@ -0,0 +1,70 @@
+import unittest
+
+from scrapy.http import TextResponse
+
+from jedeschule.spiders.saarland import SaarlandSpider
+
+
+class TestSaarlandSpider(unittest.TestCase):
+    def test_parse(self):
+        xml_response = """
+        <wfs:FeatureCollection xmlns:wfs="http://www.opengis.net/wfs/2.0" xmlns:gml="http://www.opengis.net/gml/3.2" xmlns:Staatliche_Dienste="https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer">
+          <wfs:member>
+            <Staatliche_Dienste:Schulen_SL>
+              <Staatliche_Dienste:SHAPE>
+                <gml:Point>
+                  <gml:pos>49.24067452 7.02085050</gml:pos>
+                </gml:Point>
+              </Staatliche_Dienste:SHAPE>
+              <Staatliche_Dienste:OBJECTID>1</Staatliche_Dienste:OBJECTID>
+              <Staatliche_Dienste:fid>1.00000000</Staatliche_Dienste:fid>
+              <Staatliche_Dienste:Gemeindenu>1100.00000000</Staatliche_Dienste:Gemeindenu>
+              <Staatliche_Dienste:PLZ>66123.00000000</Staatliche_Dienste:PLZ>
+              <Staatliche_Dienste:Ort>Saarbrücken</Staatliche_Dienste:Ort>
+              <Staatliche_Dienste:Straße>Kohlweg 7</Staatliche_Dienste:Straße>
+              <Staatliche_Dienste:Bezeichnun>Deutsch-Französiche Hochschule, Université franco-allemande</Staatliche_Dienste:Bezeichnun>
+              <Staatliche_Dienste:Telefon>0681-93812100</Staatliche_Dienste:Telefon>
+              <Staatliche_Dienste:Fax>0681-93812111</Staatliche_Dienste:Fax>
+              <Staatliche_Dienste:Email>info@dfh-ufa.org</Staatliche_Dienste:Email>
+              <Staatliche_Dienste:Schulform>Hochschule</Staatliche_Dienste:Schulform>
+              <Staatliche_Dienste:Homepage>https://www.dfh-ufa.org/</Staatliche_Dienste:Homepage>
+              <Staatliche_Dienste:Schulregio>Saarbrücken</Staatliche_Dienste:Schulregio>
+              <Staatliche_Dienste:KARTENERST>Hochschule</Staatliche_Dienste:KARTENERST>
+              <Staatliche_Dienste:Rechtswert>355942.97630000</Staatliche_Dienste:Rechtswert>
+              <Staatliche_Dienste:Hochwert>5456095.93600000</Staatliche_Dienste:Hochwert>
+              <Staatliche_Dienste:Aktualisie>20.05.2025</Staatliche_Dienste:Aktualisie>
+            </Staatliche_Dienste:Schulen_SL>
+          </wfs:member>
+        </wfs:FeatureCollection>
+        """
+
+        spider = SaarlandSpider()
+        response = TextResponse(url="https://test.com", body=xml_response, encoding="utf-8")
+        schools = list(spider.parse(response))
+        self.assertEqual(len(schools), 1)
+
+        school = schools[0]
+
+        self.assertEqual(school["OBJECTID"], "1")
+        self.assertEqual(school["fid"], "1.00000000")
+        self.assertEqual(school["Gemeindenu"], "1100.00000000")
+        self.assertEqual(school["PLZ"], "66123")
+        self.assertEqual(school["Ort"], "Saarbrücken")
+        self.assertEqual(school["Straße"], "Kohlweg 7")
+        self.assertEqual(school["Bezeichnun"], "Deutsch-Französiche Hochschule, Université franco-allemande")
+        self.assertEqual(school["Telefon"], "0681-93812100")
+        self.assertEqual(school["Fax"], "0681-93812111")
+        self.assertEqual(school["Email"], "info@dfh-ufa.org")
+        self.assertEqual(school["Schulform"], "Hochschule")
+        self.assertEqual(school["Homepage"], "https://www.dfh-ufa.org/")
+        self.assertEqual(school["Schulregio"], "Saarbrücken")
+        self.assertEqual(school["KARTENERST"], "Hochschule")
+        self.assertEqual(school["Rechtswert"], "355942.97630000")
+        self.assertEqual(school["Hochwert"], "5456095.93600000")
+        self.assertEqual(school["Aktualisie"], "20.05.2025")
+        self.assertAlmostEqual(school["lat"], 49.24067452)
+        self.assertAlmostEqual(school["lon"], 7.02085050)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test_changes.sh b/test_changes.sh
index 229fc34..62aed3f 100755
--- a/test_changes.sh
+++ b/test_changes.sh
@@ -5,7 +5,7 @@ set -euxo pipefail
 git fetch origin main
 
 if [ "${CI:-}" = "true" ]; then
-    HEAD_REF="${GITHUB_SHA}"
+    HEAD_REF="${GITHUB_SHA:-$(git rev-parse HEAD)}"
 else
     HEAD_REF="HEAD"
 fi
diff --git a/uv.lock b/uv.lock
index fd5d191..d71826e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -270,6 +270,7 @@ dependencies = [
     { name = "requests" },
     { name = "scrapy" },
     { name = "sqlalchemy" },
+    { name = "xmltodict" },
 ]
 
 [package.metadata]
@@ -282,6 +283,7 @@ requires-dist = [
     { name = "requests", specifier = "==2.32.4" },
     { name = "scrapy", specifier = "==2.13.3" },
     { name = "sqlalchemy", specifier = "==1.3.10" },
+    { name = "xmltodict", specifier = "==0.13.0" },
 ]
 
 [[package]]
@@ -712,6 +714,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/58/dd/56f0d8af71e475ed194d702f8b4cf9cea812c95e82ad823d239023c6558c/w3lib-2.3.1-py3-none-any.whl", hash = "sha256:9ccd2ae10c8c41c7279cd8ad4fe65f834be894fe7bfdd7304b991fd69325847b", size = 21751, upload-time = "2025-01-27T14:22:09.421Z" },
 ]
 
+[[package]]
+name = "xmltodict"
+version = "0.13.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/39/0d/40df5be1e684bbaecdb9d1e0e40d5d482465de6b00cbb92b84ee5d243c7f/xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56", size = 33813, upload-time = "2022-05-08T07:00:04.916Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/94/db/fd0326e331726f07ff7f40675cd86aa804bfd2e5016c727fa761c934990e/xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852", size = 9971, upload-time = "2022-05-08T07:00:02.898Z" },
+]
+
 [[package]]
 name = "zope-interface"
 version = "7.2"