Skip to content

Commit 230ee34

Browse files
authored
[MV] Use data from WFS (#211)
1 parent a1f224f commit 230ee34

File tree

4 files changed

+158
-37
lines changed

4 files changed

+158
-37
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ In details, the IDs are sourced as follows:
2929
|HB| `id` URL query param on the school's detail page (identical to the SNR (Schulnummer) from the overview page) | `HB-937` |✅ likely|
3030
|HH| Field `schul_id` from the WFS service | `HH-7910-0` |✅ likely|
3131
|HE| `school_no` URL query param of the school's details page (identical to the Dienststellennummer) | `HE-4024` |✅ likely|
32-
|MV| Column `DIENSTSTELLEN-NUMMER` from the XLSX file | `MV-75130302` |✅ likely|
32+
|MV| Field `dstnr` from the WFS | `MV-75130302` |✅ likely|
3333
|NI| Field `schulnr` from the JSON in the details payload | `NI-67763` |✅ likely|
3434
|NW| Column `Schulnummer` from the CSV | `NW-162437` |✅ likely|
3535
|RP| `Schulnummer` from the school's details page | `RP-50720` |✅ likely|
@@ -50,7 +50,7 @@ When available, we try to use the geolocations provided by the data publishers.
5050
| HB | ❌ No | - |
5151
| HH | ✅ Yes | WFS |
5252
| HE | ❌ No | - |
53-
| MV | ❌ No | - |
53+
| MV | ✅ Yes | WFS |
5454
| NI | ❌ No | - |
5555
| NW | ✅ Yes | Converted from EPSG:25832 in source CSV data |
5656
| RP | ❌ No | - |

jedeschule/spiders/bayern.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ def parse(self, response, **kwargs):
2727
members = [members]
2828

2929
for member in members:
30-
# Each member is a dict with one key = school tag, value = school data dict
3130
school = next(iter(member.values()), {})
3231

3332
data_elem = {
Lines changed: 72 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1+
import xmltodict
12
from scrapy import Item
2-
from openpyxl import load_workbook
3-
from io import BytesIO
4-
53

64
from jedeschule.items import School
75
from jedeschule.spiders.school_spider import SchoolSpider
@@ -16,40 +14,80 @@ def as_string(value: str):
1614

1715
class MecklenburgVorpommernSpider(SchoolSpider):
1816
name = "mecklenburg-vorpommern"
19-
# The state provides the data as an Excel file. The current year's
20-
# data is for sale, all older versions are free to download.
21-
# We use the free data from 2022/2023
22-
# An overview of all available files can be found here:
23-
# https://www.statistischebibliothek.de/mir/receive/MVSerie_mods_00000396
24-
# Official documentation on all available data here:
25-
# https://www.laiv-mv.de/Statistik/Veröffentlichungen/Verzeichnisse/
26-
base_url = "https://www.statistischebibliothek.de/mir/servlets/MCRFileNodeServlet/MVHeft_derivate_00007470/V044%202023%2000.xlsx"
27-
start_urls = [base_url]
28-
29-
def parse(self, response):
30-
workbook = load_workbook(filename=BytesIO(response.body), data_only=True)
31-
data_sheet = workbook["Verzeichnis allg bild Schulen"]
32-
33-
rows = list(data_sheet.iter_rows(values_only=True))
34-
headers = rows[0]
35-
36-
for row in rows[1:]:
37-
yield {
38-
headers[i]: row[i]
39-
for i in range(len(headers))
40-
}
17+
# One WFS GetFeature request (VERSION=2.0.0) that asks for every school-type
# layer at once and requests coordinates in EPSG:4326 via ``srsname``.
start_urls = [
    "https://www.geodaten-mv.de/dienste/schulstandorte_wfs?"
    + "&".join(
        (
            "SERVICE=WFS",
            "REQUEST=GetFeature",
            "VERSION=2.0.0",
            "srsname=EPSG%3A4326",
            # Layer names are sent as one comma-separated typeNames value.
            "typeNames="
            + ",".join(
                (
                    "ms:schultyp_grund",
                    "ms:schultyp_regional",
                    "ms:schultyp_gymnasium",
                    "ms:schultyp_gesamt",
                    "ms:schultyp_waldorf",
                    "ms:schultyp_foerder",
                    "ms:schultyp_abendgym",
                    "ms:schultyp_berufs",
                )
            ),
        )
    )
]
29+
30+
def parse(self, response, **kwargs):
    """Yield one flattened attribute dict per feature in the WFS reply.

    The reply is parsed with ``xmltodict``. Features are read from the
    ``wfs:member`` entries of the root ``wfs:FeatureCollection``. A member
    that itself wraps a nested ``wfs:FeatureCollection`` is unpacked one
    level, so both reply layouts produce the same items. Empty or
    malformed members are skipped instead of raising.

    Args:
        response: Response object whose ``text`` holds the GetFeature XML.
        **kwargs: Accepted but unused.

    Yields:
        The dict returned by ``_extract_school_data`` for each feature.
    """

    def as_list(node):
        # xmltodict gives None for an empty element, a single mapping for
        # one child and a list for repeated children; normalise to a list.
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    document = xmltodict.parse(response.text)
    root = document.get("wfs:FeatureCollection") or {}

    for member in as_list(root.get("wfs:member")):
        if not isinstance(member, dict):
            # e.g. an empty <wfs:member/> element
            continue

        if "wfs:FeatureCollection" in member:
            nested = member["wfs:FeatureCollection"]
            if not isinstance(nested, dict):
                nested = {}
            features = as_list(nested.get("wfs:member"))
        else:
            features = [member]

        for feature in features:
            if not isinstance(feature, dict):
                continue
            # A member holds a single child element named after the layer;
            # skip XML attributes ("@..." keys) when looking for it.
            school = next(
                (
                    value
                    for key, value in feature.items()
                    if not key.startswith("@")
                ),
                None,
            )
            if isinstance(school, dict):
                yield self._extract_school_data(school)
51+
52+
@staticmethod
53+
def _extract_school_data(school):
54+
data_elem = {}
55+
56+
for key, value in school.items():
57+
if key == "ms:msGeometry":
58+
point = value.get("gml:Point", {})
59+
pos = point.get("gml:pos", "")
60+
if pos:
61+
lat, lon = pos.split()
62+
data_elem["lat"] = float(lat)
63+
data_elem["lon"] = float(lon)
64+
elif not key.startswith("@"):
65+
clean_key = key.split(":", 1)[-1] if ":" in key else key
66+
data_elem[clean_key] = value
67+
68+
return data_elem
4169

4270
@staticmethod
def normalize(item: Item) -> School:
    """Map a raw WFS attribute dict onto a ``School`` item.

    Text fields are stripped and blank values become ``None``. The id is
    always ``"MV-"`` followed by the ``dstnr`` value. The postcode is
    left-padded to five digits, but only when one is present; a missing
    or blank ``plz`` gives ``None`` rather than ``"00000"``.

    Args:
        item: Mapping with the keys read below (``schulname``, ``dstnr``,
            ``strassehnr``, ``plz``, ``ort``, ``internet``,
            ``emailadresse``, ``telefon``, ``schulleiter``, ``orgform``,
            ``rechtsstatus``, ``schultraeger``, ``lat``, ``lon``).

    Returns:
        The populated ``School``.
    """

    def safe_strip(value):
        """Return a stripped string, or None for empty/blank input."""
        if not isinstance(value, str):
            # Non-string values pass through; falsy ones collapse to None.
            return value or None
        return value.strip() or None

    # Never hand None to as_string: fall back to "" so a missing number
    # still produces the bare "MV-" prefix as before.
    dstnr = safe_strip(item.get("dstnr")) or ""
    plz = safe_strip(item.get("plz"))

    return School(
        name=safe_strip(item.get("schulname")),
        id="MV-{}".format(as_string(dstnr)),
        address=safe_strip(item.get("strassehnr")),
        address2="",
        # zfill on an empty string would invent the postcode "00000".
        zip=as_string(plz).zfill(5) if plz else None,
        city=safe_strip(item.get("ort")),
        website=safe_strip(item.get("internet")),
        email=safe_strip(item.get("emailadresse")),
        phone=safe_strip(item.get("telefon")),
        director=safe_strip(item.get("schulleiter")),
        school_type=safe_strip(item.get("orgform")),
        legal_status=safe_strip(item.get("rechtsstatus")),
        provider=safe_strip(item.get("schultraeger")),
        latitude=item.get("lat"),
        longitude=item.get("lon"),
    )
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import unittest
2+
3+
from scrapy.http import TextResponse
4+
5+
from jedeschule.spiders.mecklenburg_vorpommern import MecklenburgVorpommernSpider
6+
7+
8+
class TestMecklenburgVorpommernSpider(unittest.TestCase):
    """Runs the Mecklenburg-Vorpommern spider against a one-feature WFS reply."""

    # Canned GetFeature reply containing a single ``ms:schultyp_grund`` feature.
    SAMPLE_RESPONSE = """<?xml version='1.0' encoding="UTF-8" ?>
<wfs:FeatureCollection
xmlns:ms="http://mapserver.gis.umn.edu/mapserver"
xmlns:gml="http://www.opengis.net/gml/3.2"
xmlns:wfs="http://www.opengis.net/wfs/2.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://mapserver.gis.umn.edu/mapserver https://www.geodaten-mv.de/dienste/schulstandorte_wfs?SERVICE=WFS&amp;VERSION=2.0.0&amp;REQUEST=DescribeFeatureType&amp;TYPENAME=ms:schultyp_grund,ms:schultyp_regional,ms:schultyp_gymnasium,ms:schultyp_gesamt,ms:schultyp_waldorf,ms:schultyp_foerder,ms:schultyp_abendgym,ms:schultyp_berufs&amp;OUTPUTFORMAT=application%2Fgml%2Bxml%3B%20version%3D3.2 http://www.opengis.net/wfs/2.0 http://schemas.opengis.net/wfs/2.0/wfs.xsd http://www.opengis.net/gml/3.2 http://schemas.opengis.net/gml/3.2.1/gml.xsd"
timeStamp="2025-10-10T15:56:47" numberMatched="unknown" numberReturned="1"
next="https://www.geodaten-mv.de/dienste/schulstandorte_wfs?SERVICE=WFS&amp;REQUEST=GetFeature&amp;VERSION=2.0.0&amp;srsname=EPSG%3A4326&amp;typeNames=ms%3Aschultyp_grund%2Cms%3Aschultyp_regional%2Cms%3Aschultyp_gymnasium%2Cms%3Aschultyp_gesamt%2Cms%3Aschultyp_waldorf%2Cms%3Aschultyp_foerder%2Cms%3Aschultyp_abendgym%2Cms%3Aschultyp_berufs&amp;count=1&amp;STARTINDEX=1">
<wfs:boundedBy>
<gml:Envelope srsName="urn:ogc:def:crs:EPSG::4326">
<gml:lowerCorner>53.846900 11.977407</gml:lowerCorner>
<gml:upperCorner>53.846900 11.977407</gml:upperCorner>
</gml:Envelope>
</wfs:boundedBy>
<!-- WARNING: No featureid defined for typename 'schultyp_grund'. Output will not validate. -->
<wfs:member>
<ms:schultyp_grund>
<gml:boundedBy>
<gml:Envelope srsName="urn:ogc:def:crs:EPSG::4326">
<gml:lowerCorner>53.846900 11.977407</gml:lowerCorner>
<gml:upperCorner>53.846900 11.977407</gml:upperCorner>
</gml:Envelope>
</gml:boundedBy>
<ms:msGeometry>
<gml:Point gml:id=".1" srsName="urn:ogc:def:crs:EPSG::4326">
<gml:pos>53.846900 11.977407</gml:pos>
</gml:Point>
</ms:msGeometry>
<ms:orgform>Grundschule</ms:orgform>
<ms:schultraeger></ms:schultraeger>
<ms:rechtsstatus>Öffentlich</ms:rechtsstatus>
<ms:schulname>Grundschule und Freizeithaus am Schloßplatz </ms:schulname>
<ms:strassehnr>Schloßplatz 3</ms:strassehnr>
<ms:plz>18246</ms:plz>
<ms:ort>Bützow</ms:ort>
<ms:besonderheiten></ms:besonderheiten>
<ms:schulleiter>Frau Beuster</ms:schulleiter>
<ms:telefon>038461 - 52006</ms:telefon>
<ms:emailadresse>[email protected]</ms:emailadresse>
<ms:internet></ms:internet>
<ms:anzahl_klassen>12</ms:anzahl_klassen>
<ms:anzahl_schueler>247</ms:anzahl_schueler>
<ms:strasse>Schloßplatz</ms:strasse>
<ms:hnr>3</ms:hnr>
<ms:dstnr>75135304</ms:dstnr>
</ms:schultyp_grund>
</wfs:member>
</wfs:FeatureCollection>"""

    def test_parse(self):
        """Parse the canned reply and check each normalized field."""
        spider = MecklenburgVorpommernSpider()
        response = TextResponse(
            url="https://test.com",
            body=self.SAMPLE_RESPONSE,
            encoding="utf-8",
        )

        raw_items = list(spider.parse(response))
        self.assertEqual(len(raw_items), 1)

        parsed_school = spider.normalize(raw_items[0])

        # (field, expected value) pairs, checked in this order.
        expectations = (
            ("id", "MV-75135304"),
            ("name", "Grundschule und Freizeithaus am Schloßplatz"),
            ("address", "Schloßplatz 3"),
            ("city", "Bützow"),
            ("school_type", "Grundschule"),
            ("zip", "18246"),
            ("latitude", 53.846900),
            ("longitude", 11.977407),
            ("legal_status", "Öffentlich"),
            ("director", "Frau Beuster"),
            ("phone", "038461 - 52006"),
            ("email", "[email protected]"),
        )
        for field, expected in expectations:
            self.assertEqual(parsed_school[field], expected)

        # The fixture's empty <ms:internet> element must come out as None.
        self.assertIsNone(parsed_school["website"])
83+
if __name__ == "__main__":
84+
unittest.main()

0 commit comments

Comments
 (0)