11# -*- coding: utf-8 -*-
22import io
3- import os
43import re
5- import hashlib
64import zipfile
75import scrapy
86from scrapy import Item
97import shapefile
108from pyproj import Transformer
11- from scrapy .exceptions import CloseSpider
129
1310from jedeschule .items import School
1411from jedeschule .spiders .school_spider import SchoolSpider
class BremenSpider(SchoolSpider):
    """Spider for school locations in Bremen and Bremerhaven.

    Downloads a single ZIP from the Bremen INSPIRE download service and
    reads the two contained shapefiles (``gdi_schulen_hb`` /
    ``gdi_schulen_bhv``) directly from the archive in :meth:`parse`.
    """

    name = "bremen"
    # One ZIP containing both city shapefiles (HB = Bremen, BHV = Bremerhaven).
    ZIP_URL = "https://gdi2.geo.bremen.de/inspire/download/Schulstandorte/data/Schulstandorte_HB_BHV.zip"

    start_urls = [ZIP_URL]
3320 def parse (self , response ):
# NOTE(review): this span is a unified-diff rendering, not runnable Python.
# Lines tagged "-" were removed by the change, "+" lines were added, and the
# leading digits are old/new line numbers fused into the text by extraction.
# Kept byte-for-byte because the hunk header further down ("@@ -64,75 +40,38 @@")
# hides new-file lines 30-39 — presumably the shapefile Reader construction
# and .cpg/encoding detection (the visible "encoding = encoding" argument
# implies it) — so a clean reconstruction would have to guess at missing code.
# The diff removes the local ZIP caching + SHA256 logging below:
34- os .makedirs (self .CACHE_DIR , exist_ok = True )
35-
36- # Save ZIP and compute checksum
37- sha256 = hashlib .sha256 (response .body ).hexdigest ()
38- with open (self .CACHE_FILE , "wb" ) as f :
39- f .write (response .body )
40- self .logger .info (f"Downloaded ZIP SHA256={ sha256 } " )
41-
4221 # CRS: EPSG:25832 → EPSG:4326
# Coordinates are transformed from the source CRS (EPSG:25832) to WGS84
# (EPSG:4326); always_xy=True makes transform() take/return (x, y) i.e.
# (easting, northing) -> (lon, lat) order.
4322 transformer = Transformer .from_crs (25832 , 4326 , always_xy = True )
4423
4524 # Read both shapefiles directly from ZIP (no extractall)
4625 with zipfile .ZipFile (io .BytesIO (response .body ), "r" ) as zf :
# The per-city label ("Bremen"/"Bremerhaven") is dropped by the diff; only
# the shapefile stem remains:
47- for stem , city_name in (
48- ("gdi_schulen_hb" , "Bremen" ),
49- ("gdi_schulen_bhv" , "Bremerhaven" ),
50- ):
26+ for stem in ("gdi_schulen_hb" , "gdi_schulen_bhv" ):
5127 shp_bytes = io .BytesIO (zf .read (f"{ stem } .shp" ))
5228 shx_bytes = io .BytesIO (zf .read (f"{ stem } .shx" ))
5329 dbf_bytes = io .BytesIO (zf .read (f"{ stem } .dbf" ))
# Hunk gap: new lines 30-39 are not shown here (encoding / Reader setup).
@@ -64,75 +40,38 @@ def parse(self, response):
6440 shp = shp_bytes , shx = shx_bytes , dbf = dbf_bytes , encoding = encoding
6541 )
6642
# The diff removes in-parse validation of required fields / SNR format /
# duplicate SNRs (SNR validation moves into normalize()), and no longer
# lowercases field names here — downstream code must be case-insensitive:
67- # Build robust field-name map
68- # sf.fields: [("DeletionFlag","C",1,0), ("NAM","C",80,0), ...]
69- field_names = [f [0 ].lower () for f in sf .fields [1 :]] # skip DeletionFlag
70- required = set (self .REQUIRED_MAP .values ())
71- missing = required .difference (field_names )
72- if missing :
73- raise ValueError (
74- f"Missing required fields in { stem } : { missing } . Found: { field_names } "
75- )
76-
77- # Iterate records
78- seen_ids = set ()
43+ field_names = [f [0 ] for f in sf .fields [1 :]] # skip DeletionFlag
7944 for sr in sf .iterShapeRecords ():
8045 rec = dict (zip (field_names , sr .record ))
8146
82- snr_txt = (rec .get ("snr_txt" ) or "" ).strip ()
83- if not re .fullmatch (r"\d{3}" , snr_txt ):
84- raise ValueError (
85- f"[{ city_name } ] Invalid SNR format '{ snr_txt } ' for { rec .get ('nam' )} (expected 3 digits)"
86- )
87- if snr_txt in seen_ids :
88- raise ValueError (f"[{ city_name } ] Duplicate SNR '{ snr_txt } '" )
89- seen_ids .add (snr_txt )
90-
91- # Validate core fields (non-silent). Aggregate and act once.
92- core = {
93- k : (rec .get (v ) or "" ).strip ()
94- for k , v in self .REQUIRED_MAP .items ()
95- }
96- missing_core = [k for k , val in core .items () if not val ]
97- if missing_core :
98- msg = f"[{ city_name } ] Missing required fields { missing_core } for SNR '{ snr_txt } '"
99- if getattr (self , "fail_on_missing_core" , False ):
100- raise CloseSpider (reason = msg )
101- self .logger .error (msg )
102- continue
103-
10447 # geometry
10548 shp = sr .shape
106- lat = lon = None
10749 if shp and shp .points :
10850 # Expect Point; take first coordinate defensively
10951 x , y = shp .points [0 ]
110- lon , lat = transformer .transform (x , y )
111-
112- yield {
113- "snr" : snr_txt ,
114- "name" : core ["name" ],
115- "address" : core ["address" ],
116- "zip" : core ["zip" ],
117- "city" : core ["city" ],
118- "district" : rec .get ("ortsteilna" ),
119- "school_type" : rec .get ("schulart_2" ),
120- "provider" : rec .get ("traegernam" ),
121- "latitude" : lat ,
122- "longitude" : lon ,
123- }
# New behavior: the raw record dict is yielded with 'lon'/'lat' added only
# when the shape has points — geometry-less records yield a rec without
# those keys, which normalize() tolerates via .get().
52+ rec ['lon' ], rec ['lat' ] = transformer .transform (x , y )
53+ yield rec
12454
12555 @staticmethod
12656 def normalize (item : Item ) -> School :
127- school_id = f"HB-{ item .get ('snr' )} "
57+ # Create case-insensitive lookup
58+ item_lower = {k .lower (): v for k , v in item .items ()}
59+
60+ # Extract and validate school ID
61+ snr_txt = (item_lower .get ("snr_txt" ) or "" ).strip ()
62+ if not snr_txt or not re .fullmatch (r"\d{3}" , snr_txt ):
63+ raise ValueError (f"Invalid or missing SNR_TXT: '{ snr_txt } '" )
64+
65+ school_id = f"HB-{ snr_txt } "
66+
12867 return School (
129- name = item .get ("name" ),
13068 id = school_id ,
131- address = item .get ("address" ),
132- zip = item .get ("zip" ),
133- city = item .get ("city" ),
134- school_type = item .get ("school_type" ),
135- provider = item .get ("provider" ),
136- latitude = item .get ("latitude" ),
137- longitude = item .get ("longitude" ),
69+ name = (item_lower .get ("nam" ) or "" ).strip (),
70+ address = (item_lower .get ("strasse" ) or "" ).strip (),
71+ zip = (item_lower .get ("plz" ) or "" ).strip (),
72+ city = (item_lower .get ("ort" ) or "" ).strip (),
73+ school_type = (item_lower .get ("schulart_2" ) or "" ).strip (),
74+ provider = (item_lower .get ("traegernam" ) or "" ).strip (),
75+ latitude = item .get ("lat" ),
76+ longitude = item .get ("lon" ),
13877 )
0 commit comments