1+ import xmltodict
12from scrapy import Item
2- from openpyxl import load_workbook
3- from io import BytesIO
4-
53
64from jedeschule .items import School
75from jedeschule .spiders .school_spider import SchoolSpider
@@ -16,40 +14,80 @@ def as_string(value: str):
1614
class MecklenburgVorpommernSpider(SchoolSpider):
    """Spider for the schools of Mecklenburg-Vorpommern.

    All school-type layers are requested in a single WFS 2.0.0 ``GetFeature``
    call (see ``start_urls``). The XML response is turned into nested dicts
    with ``xmltodict`` and each feature is flattened into one plain dict per
    school, which ``normalize`` then maps onto a ``School`` item.
    """

    name = "mecklenburg-vorpommern"
    # Adjacent string literals are concatenated into one URL that lists every
    # requested layer in the ``typeNames`` parameter.
    start_urls = [
        "https://www.geodaten-mv.de/dienste/schulstandorte_wfs?"
        "SERVICE=WFS&REQUEST=GetFeature&VERSION=2.0.0&srsname=EPSG%3A4326&typeNames="
        "ms:schultyp_grund,"
        "ms:schultyp_regional,"
        "ms:schultyp_gymnasium,"
        "ms:schultyp_gesamt,"
        "ms:schultyp_waldorf,"
        "ms:schultyp_foerder,"
        "ms:schultyp_abendgym,"
        "ms:schultyp_berufs"
    ]

    def parse(self, response, **kwargs):
        """Yield one flat dict per school feature found in the WFS response.

        Args:
            response: The response whose ``text`` holds the WFS XML document.
            **kwargs: Accepted for compatibility with the callback signature;
                not used.

        Yields:
            dict: The result of ``_extract_school_data`` for every feature.
        """
        data = xmltodict.parse(response.text)
        # ``or {}`` also covers an empty root element, which is parsed as None.
        root = data.get("wfs:FeatureCollection") or {}

        for feature in self._iter_features(root):
            yield self._extract_school_data(feature)

    @classmethod
    def _iter_features(cls, collection):
        """Yield the feature dicts of a (possibly nested) feature collection.

        A ``wfs:member`` either wraps another ``wfs:FeatureCollection`` (one
        per requested layer) or a single feature element. Members that are
        empty or do not contain a feature dict are skipped instead of being
        passed on, because they cannot be turned into a school record.
        """
        if not isinstance(collection, dict):
            return

        members = collection.get("wfs:member") or []
        # xmltodict returns a bare dict instead of a list for a single child.
        if not isinstance(members, list):
            members = [members]

        for member in members:
            if not isinstance(member, dict):
                continue

            if "wfs:FeatureCollection" in member:
                yield from cls._iter_features(member["wfs:FeatureCollection"])
                continue

            # The feature is the first real child element; "@"-prefixed keys
            # are XML attributes of the member itself, not features.
            feature = next(
                (
                    value
                    for key, value in member.items()
                    if not key.startswith("@") and isinstance(value, dict)
                ),
                None,
            )
            if feature is not None:
                yield feature

    @staticmethod
    def _parse_position(geometry):
        """Return ``(lat, lon)`` from an ``ms:msGeometry`` value, or ``None``.

        The first two whitespace-separated numbers of ``gml:pos`` are read as
        latitude and longitude, in that order.
        NOTE(review): assumes the service returns EPSG:4326 positions in
        lat/lon axis order - confirm against a live response.
        """
        if not isinstance(geometry, dict):
            return None
        point = geometry.get("gml:Point")
        if not isinstance(point, dict):
            return None

        pos = point.get("gml:pos")
        # An element that carries attributes is parsed as a dict whose text
        # content is stored under "#text".
        if isinstance(pos, dict):
            pos = pos.get("#text")
        if not isinstance(pos, str):
            return None

        parts = pos.split()
        if len(parts) < 2:
            return None
        try:
            return float(parts[0]), float(parts[1])
        except ValueError:
            # Non-numeric coordinates: treat the position as unavailable.
            return None

    @staticmethod
    def _extract_school_data(school):
        """Flatten one parsed feature element into a plain dict.

        Args:
            school: Mapping of child element names to their parsed values.
                Anything that is not a dict yields an empty result.

        Returns:
            dict: Every non-attribute child keyed by its name without the
            namespace prefix, plus ``lat``/``lon`` floats when the geometry
            holds a usable position.
        """
        data_elem = {}
        if not isinstance(school, dict):
            return data_elem

        for key, value in school.items():
            if key == "ms:msGeometry":
                position = MecklenburgVorpommernSpider._parse_position(value)
                if position is not None:
                    data_elem["lat"], data_elem["lon"] = position
            elif not key.startswith("@"):
                # Drop the namespace prefix ("ms:plz" -> "plz"); a key without
                # a colon is kept unchanged.
                data_elem[key.split(":", 1)[-1]] = value

        return data_elem

    @staticmethod
    def normalize(item: Item) -> School:
        """Map a flat feature dict onto a ``School`` item.

        Text fields are stripped; empty, whitespace-only and non-string
        values become ``None``.
        """

        def safe_strip(value):
            # Non-string values (None, or a dict for an element that carries
            # attributes) have no usable text.
            if not isinstance(value, str):
                return None
            return value.strip() or None

        # NOTE(review): a missing number keeps the original " " fallback and
        # therefore still yields the id "MV- " - confirm whether such records
        # should be dropped instead.
        school_number = safe_strip(item.get("dstnr")) or " "
        # Only pad a zip code that is actually present; padding an empty
        # value would invent the bogus zip "00000".
        plz = safe_strip(item.get("plz"))

        return School(
            name=safe_strip(item.get("schulname")),
            id="MV-{}".format(as_string(school_number)),
            address=safe_strip(item.get("strassehnr")),
            address2="",
            zip=as_string(plz).zfill(5) if plz else None,
            city=safe_strip(item.get("ort")),
            website=safe_strip(item.get("internet")),
            email=safe_strip(item.get("emailadresse")),
            phone=safe_strip(item.get("telefon")),
            director=safe_strip(item.get("schulleiter")),
            school_type=safe_strip(item.get("orgform")),
            legal_status=safe_strip(item.get("rechtsstatus")),
            provider=safe_strip(item.get("schultraeger")),
            latitude=item.get("lat"),
            longitude=item.get("lon"),
        )
0 commit comments