Commit 0a290ee

Author: tim
Address PR feedback: simplify Bremen spider
- Remove unnecessary file caching and SHA256 computation
- Simplify parse() to return raw shapefile data
- Move field mapping and validation to normalize()
- Remove unused imports (os, hashlib, CloseSpider)

This follows the architecture pattern where parse() keeps data as close to the source as possible, and normalize() handles the transformation to our standard schema.
1 parent a19c8a7 commit 0a290ee
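
For context on the pattern the message references: parse() yields raw, source-shaped records, and normalize() is applied to each item downstream. A minimal sketch of that contract, assuming a Scrapy item pipeline does the dispatch (the pipeline class and wiring below are illustrative assumptions, not part of this commit):

    # Hypothetical pipeline step for illustration only; the real
    # jedeschule pipeline is not shown in this commit.
    class NormalizePipeline:
        def process_item(self, item, spider):
            # parse() has already yielded a raw, source-shaped record;
            # normalize() maps it onto the standard School schema.
            return spider.normalize(item)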


jedeschule/spiders/bremen.py

Lines changed: 22 additions & 83 deletions
@@ -1,14 +1,11 @@
 # -*- coding: utf-8 -*-
 import io
-import os
 import re
-import hashlib
 import zipfile
 import scrapy
 from scrapy import Item
 import shapefile
 from pyproj import Transformer
-from scrapy.exceptions import CloseSpider

 from jedeschule.items import School
 from jedeschule.spiders.school_spider import SchoolSpider
@@ -17,37 +14,16 @@
 class BremenSpider(SchoolSpider):
     name = "bremen"
     ZIP_URL = "https://gdi2.geo.bremen.de/inspire/download/Schulstandorte/data/Schulstandorte_HB_BHV.zip"
-    CACHE_DIR = "cache"
-    CACHE_FILE = os.path.join(CACHE_DIR, "Schulstandorte_HB_BHV.zip")
-
-    # Friendly names → shapefile field names
-    REQUIRED_MAP = {
-        "name": "nam",
-        "address": "strasse",
-        "zip": "plz",
-        "city": "ort",
-    }

     start_urls = [ZIP_URL]

     def parse(self, response):
-        os.makedirs(self.CACHE_DIR, exist_ok=True)
-
-        # Save ZIP and compute checksum
-        sha256 = hashlib.sha256(response.body).hexdigest()
-        with open(self.CACHE_FILE, "wb") as f:
-            f.write(response.body)
-        self.logger.info(f"Downloaded ZIP SHA256={sha256}")
-
         # CRS: EPSG:25832 → EPSG:4326
         transformer = Transformer.from_crs(25832, 4326, always_xy=True)

         # Read both shapefiles directly from ZIP (no extractall)
         with zipfile.ZipFile(io.BytesIO(response.body), "r") as zf:
-            for stem, city_name in (
-                ("gdi_schulen_hb", "Bremen"),
-                ("gdi_schulen_bhv", "Bremerhaven"),
-            ):
+            for stem in ("gdi_schulen_hb", "gdi_schulen_bhv"):
                 shp_bytes = io.BytesIO(zf.read(f"{stem}.shp"))
                 shx_bytes = io.BytesIO(zf.read(f"{stem}.shx"))
                 dbf_bytes = io.BytesIO(zf.read(f"{stem}.dbf"))
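
A note on the coordinate handling this hunk leaves in place: the source data is in EPSG:25832 (ETRS89 / UTM zone 32N), and always_xy=True makes the transformer take (easting, northing) and return (lon, lat). A standalone snippet with the same call, using made-up sample coordinates:

    from pyproj import Transformer

    # EPSG:25832 (ETRS89 / UTM zone 32N) -> EPSG:4326 (WGS84).
    # always_xy=True fixes the axis order: (x, y) in, (lon, lat) out.
    transformer = Transformer.from_crs(25832, 4326, always_xy=True)
    lon, lat = transformer.transform(489000.0, 5882000.0)  # sample values only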
@@ -64,75 +40,38 @@ def parse(self, response):
                     shp=shp_bytes, shx=shx_bytes, dbf=dbf_bytes, encoding=encoding
                 )

-                # Build robust field-name map
-                # sf.fields: [("DeletionFlag","C",1,0), ("NAM","C",80,0), ...]
-                field_names = [f[0].lower() for f in sf.fields[1:]]  # skip DeletionFlag
-                required = set(self.REQUIRED_MAP.values())
-                missing = required.difference(field_names)
-                if missing:
-                    raise ValueError(
-                        f"Missing required fields in {stem}: {missing}. Found: {field_names}"
-                    )
-
-                # Iterate records
-                seen_ids = set()
+                field_names = [f[0] for f in sf.fields[1:]]  # skip DeletionFlag
                 for sr in sf.iterShapeRecords():
                     rec = dict(zip(field_names, sr.record))

-                    snr_txt = (rec.get("snr_txt") or "").strip()
-                    if not re.fullmatch(r"\d{3}", snr_txt):
-                        raise ValueError(
-                            f"[{city_name}] Invalid SNR format '{snr_txt}' for {rec.get('nam')} (expected 3 digits)"
-                        )
-                    if snr_txt in seen_ids:
-                        raise ValueError(f"[{city_name}] Duplicate SNR '{snr_txt}'")
-                    seen_ids.add(snr_txt)
-
-                    # Validate core fields (non-silent). Aggregate and act once.
-                    core = {
-                        k: (rec.get(v) or "").strip()
-                        for k, v in self.REQUIRED_MAP.items()
-                    }
-                    missing_core = [k for k, val in core.items() if not val]
-                    if missing_core:
-                        msg = f"[{city_name}] Missing required fields {missing_core} for SNR '{snr_txt}'"
-                        if getattr(self, "fail_on_missing_core", False):
-                            raise CloseSpider(reason=msg)
-                        self.logger.error(msg)
-                        continue
-
                     # geometry
                     shp = sr.shape
-                    lat = lon = None
                     if shp and shp.points:
                         # Expect Point; take first coordinate defensively
                         x, y = shp.points[0]
-                        lon, lat = transformer.transform(x, y)
-
-                    yield {
-                        "snr": snr_txt,
-                        "name": core["name"],
-                        "address": core["address"],
-                        "zip": core["zip"],
-                        "city": core["city"],
-                        "district": rec.get("ortsteilna"),
-                        "school_type": rec.get("schulart_2"),
-                        "provider": rec.get("traegernam"),
-                        "latitude": lat,
-                        "longitude": lon,
-                    }
+                        rec['lon'], rec['lat'] = transformer.transform(x, y)
+                    yield rec

     @staticmethod
     def normalize(item: Item) -> School:
-        school_id = f"HB-{item.get('snr')}"
+        # Create case-insensitive lookup
+        item_lower = {k.lower(): v for k, v in item.items()}
+
+        # Extract and validate school ID
+        snr_txt = (item_lower.get("snr_txt") or "").strip()
+        if not snr_txt or not re.fullmatch(r"\d{3}", snr_txt):
+            raise ValueError(f"Invalid or missing SNR_TXT: '{snr_txt}'")
+
+        school_id = f"HB-{snr_txt}"
+
         return School(
-            name=item.get("name"),
             id=school_id,
-            address=item.get("address"),
-            zip=item.get("zip"),
-            city=item.get("city"),
-            school_type=item.get("school_type"),
-            provider=item.get("provider"),
-            latitude=item.get("latitude"),
-            longitude=item.get("longitude"),
+            name=(item_lower.get("nam") or "").strip(),
+            address=(item_lower.get("strasse") or "").strip(),
+            zip=(item_lower.get("plz") or "").strip(),
+            city=(item_lower.get("ort") or "").strip(),
+            school_type=(item_lower.get("schulart_2") or "").strip(),
+            provider=(item_lower.get("traegernam") or "").strip(),
+            latitude=item.get("lat"),
+            longitude=item.get("lon"),
         )
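
Since the new normalize() only needs a mapping with the shapefile's field names, it can be exercised in isolation. The record below is invented test data: field names follow the DBF columns visible in the diff, values are made up.

    # Invented test record; real records come from parse() above.
    raw = {
        "SNR_TXT": "042",            # three-digit school number
        "NAM": "Testschule",
        "STRASSE": "Musterstr. 1",
        "PLZ": "28195",
        "ORT": "Bremen",
        "SCHULART_2": "Grundschule",
        "TRAEGERNAM": "Stadt Bremen",
        "lat": 53.08,                # added by parse() after reprojection
        "lon": 8.80,
    }
    school = BremenSpider.normalize(raw)      # -> School(id="HB-042", ...)
    BremenSpider.normalize({"SNR_TXT": "4"})  # raises ValueError (not 3 digits)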
