diff --git a/CHANGELOG.md b/CHANGELOG.md
index 31f69da..7e43e78 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## 2025-05-12
+- [SL]: Switch to data from the Geoportal instead of web scraping. We no longer
+  get contact details such as email and phone, but we (might) get more stable
+  IDs and now also get geolocation data. ⚠️ This breaks existing IDs.
+
 ## 2025-04-30
 - [BY]: Latitude and Longitude were swapped in the database. This has been fixed.
 - [NW]: Latitude and Longitude were swapped in the database. This has been fixed.
diff --git a/README.md b/README.md
index 0533889..9fb20ee 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,8 @@ ISO-3166-2 code (without the `DE-` prefix).
 
 In details, the IDs are sourced as follows:
 
-|State| ID-Source | exmaple-id |stable|
-|-----|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------|------|
+|State| ID-Source | example-id |stable|
+|-----|--------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------|------|
 |BW| Field `DISCH` (Dienststellenschüssel) in the JSON repsonse | `BW-04154817` |✅ likely|
 |BY| id from the WFS service | `BY-SCHUL_SCHULSTANDORTEGRUNDSCHULEN_2acb7d31-915d-40a9-adcf-27b38251fa48` |❓ unlikely (although we reached out to ask for canonical IDs to be published)|
 |BE| Field `bsn` (Berliner Schulnummer) from the WFS Service | `BE-02K10` |✅ likely|
@@ -33,7 +33,7 @@ In details, the IDs are sourced as follows:
 |NI| Field `schulnr` from the JSON in the details payload | `NI-67763` |✅ likely|
 |NW| Column `Schulnummer` from the CSV | `NW-162437` |✅ likely|
 |RP| `Schulnummer` from the school's details page | `RP-50720` |✅ likely|
-|SL| Phone number, email or name from the item. Depending on what is available first | `SL-gem-kleinblittersdorfATschule.saarland` |❌ unlikely (e.g. if name was used before and now a phone number is added to the school, it will be used instead)|
+|SL| `OBJECTID` from the WFS service | `SL-255` |❓ unlikely |
 |SN| Field `id` from the API | `SN-4062` |✅ likely|
 |ST| `ID` query param from the details page URL | `ST-1001186` |❓ probably?|
 |TH| `Schulnumer` from school list | `TH-10601` |✅ likely|
@@ -54,7 +54,7 @@ When available, we try to use the geolocations provided by the data publishers.
 | NI | ❌ No | - |
 | NW | ✅ Yes | Converted from EPSG:25832 in source CSV data |
 | RP | ❌ No | - |
-| SL | ❌ No | - |
+| SL | ✅ Yes | WFS |
 | SN | ✅ Yes | API |
 | ST | ❌ No | - |
 | TH | ❌ No | - |
diff --git a/jedeschule/spiders/saarland.py b/jedeschule/spiders/saarland.py
index 6231bf2..b3ace2d 100644
--- a/jedeschule/spiders/saarland.py
+++ b/jedeschule/spiders/saarland.py
@@ -1,104 +1,55 @@
-from scrapy.spiders import Rule, CrawlSpider
-from scrapy.linkextractors import LinkExtractor
-from scrapy import Item, FormRequest, Request
+from scrapy import Item
+import xml.etree.ElementTree as ET
 
 from jedeschule.items import School
 from jedeschule.spiders.school_spider import SchoolSpider
 
-# School types: Berufliche Schule, Erweitere Realschule, Förderschule, Freie Waldorfschule,
-# Gemeinschatsschule, Grundschule, Gymnasium, Lyzeum, Realschule, Studienseminare
-
-class SaarlandSpider(CrawlSpider, SchoolSpider):
+class SaarlandSpider(SchoolSpider):
     name = "saarland"
     start_urls = [
-        "https://www.saarland.de/mbk/DE/portale/bildungsserver/schulen-und-bildungswege/schuldatenbank"
+        "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
     ]
 
-    rules = (
-        Rule(
-            LinkExtractor(allow=(), restrict_xpaths=('//a[@class="forward button"]',)),
-            callback="parse_start_url",
-            follow=True,
-        ),
-    )
-
-    def parse_start_url(self, response):
-        yield FormRequest.from_response(
-            response, formname="searchSchool", callback=self.parse_page
-        )
-
-    def parse_page(self, response):
-        for school in self.parse_schools(response):
-            yield school
-        next_button = response.xpath(
-            '//a[@class="forward button"]/@href'
-        ).extract_first()
-        if next_button:
-            yield Request(next_button, callback=self.parse_page)
-
-    def parse_schools(self, response):
-        cards = response.xpath('//div[@class="c-teaser-card"]')
-
-        for card in cards:
-            school = {}
-            school["name"] = card.xpath(".//h3/text()").extract_first().strip()
-
-            badges = card.css(".c-badge")
-            school["schultyp"] = badges[0].css("::text").extract_first()
-            school["ort"] = badges[1].css("::text").extract_first()
-
-            address = card.xpath(".//p/text()").extract_first().split(", ")
-
-            school["straße"] = address[0]
-            school["plz"] = address[1].strip(" " + school["ort"])
-
-            keys = card.xpath(".//dt/text()").extract()
-            info = card.xpath(".//dd/text()").extract()
-
-            for index in range(0, len(keys)):
-                key = keys[index].strip(":").lower()
-
-                if key == "homepage":
-                    school["homepage"] = card.xpath(
-                        './/a[@target="_blank"]/text()'
-                    ).extract_first()
-
-                if key == "e-mail":
-                    school["e-mail"] = (
-                        card.xpath('.//a[contains(@title, "E-Mail senden an:")]/@href')
-                        .extract_first()
-                        .strip("mailto:")
-                    )
-
-                if key != "homepage" and key != "e-mail":
-                    school[key] = info[index]
-
-            yield school
-
-    @staticmethod
-    def get_id(item: Item) -> str:
-        # There are no IDs on the page that we could use.
-        # We will fall back to phone number, e-mail or name
-        # in the worst case
-        if tel := item.get("telefon"):
-            return tel.replace(" ", "-")
-        if email := item.get("e-mail"):
-            return email.replace("@", "AT")
-        return item.get("name")
+    def parse(self, response):
+        tree = ET.fromstring(response.body)
+
+        namespaces = {
+            "gml": "http://www.opengis.net/gml/3.2",
+            "SD": "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer",
+        }
+
+        for school in tree.iter(
+            "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Schulen_SL"
+        ):
+            data_elem = {}
+            for entry in school:
+                if (
+                    entry.tag
+                    == "{https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer}Shape"
+                ):
+                    # This nested element holds the point geometry; flatten it into lat/lon
+                    lat, lon = entry.findtext(
+                        "gml:Point/gml:pos", namespaces=namespaces
+                    ).split(" ")
+                    data_elem["lat"] = lat
+                    data_elem["lon"] = lon
+                    continue
+                # Strip the namespace prefix from the tag before storing the value
+                data_elem[entry.tag.split("}", 1)[1]] = entry.text
+            yield data_elem
 
     @staticmethod
     def normalize(item: Item) -> School:
+        # The data also contains a field called `SCHULKENNZ` which looks like it could serve as an id,
+        # but several schools share the same value (especially `0` or `000000`), which would cause collisions.
+        id = item.get("OBJECTID")
+
         return School(
-            name=item.get("name"),
-            phone=item.get("telefon"),
-            fax=item.get("telefax"),
-            website=item.get("homepage"),
-            email=item.get("e-mail"),
-            address=item.get("straße"),
-            city=item.get("ort"),
-            zip=item.get("plz"),
-            school_type=item.get("schultyp"),
-            director=item.get("schulleitung"),
-            id="SL-{}".format(SaarlandSpider.get_id(item)),
+            name=item.get("SCHULNAME"),
+            address=" ".join([item.get(part) for part in ["HNR", "STR_NAME"]]),
+            city=item.get("ORT_NAME"),
+            zip=item.get("PLZ"),
+            school_type=item.get("SCHULFORM"),
+            id=f"SL-{id}",
         )
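
For reference, this is roughly what the new data flow looks like outside of Scrapy: fetch the same WFS GetFeature URL that is now in start_urls and read OBJECTID, SCHULNAME and the coordinates the way parse does. This is only an illustrative sketch, not part of the change; the use of requests, the timeout and the printed fields are assumptions, while the endpoint and element names come from the spider above.

# Illustrative sketch only; mirrors SaarlandSpider.parse against the live WFS endpoint.
import xml.etree.ElementTree as ET

import requests  # assumption: not a dependency of the spider itself

WFS_URL = (
    "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer"
    "?SERVICE=WFS&REQUEST=GetFeature&typeName=Staatliche%5FDienste:Schulen%5FSL&srsname=EPSG:4326"
)
SD_NS = "https://geoportal.saarland.de/arcgis/services/Internet/Staatliche_Dienste/MapServer/WFSServer"
GML_NS = "http://www.opengis.net/gml/3.2"

tree = ET.fromstring(requests.get(WFS_URL, timeout=30).content)

for school in tree.iter(f"{{{SD_NS}}}Schulen_SL"):
    record = {}
    for entry in school:
        tag = entry.tag.split("}", 1)[1]  # drop the namespace, as the spider does
        if tag == "Shape":
            # Coordinates sit in a nested gml:Point/gml:pos element ("<lat> <lon>", per the spider's assumption)
            pos = entry.findtext(f"{{{GML_NS}}}Point/{{{GML_NS}}}pos")
            if pos:
                record["lat"], record["lon"] = pos.split(" ")
            continue
        record[tag] = entry.text
    # `OBJECTID` is the value behind the new `SL-<id>` scheme
    print(record.get("OBJECTID"), record.get("SCHULNAME"), record.get("lat"), record.get("lon"))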