Skip to content

Commit 2721cd8

Browse files
authored
Merge pull request #182 from SimonMand/rework_hamburg_spider
Reworked Hamburg spider
2 parents fd46106 + d139de3 commit 2721cd8

File tree

3 files changed: 107 additions (+107) and 25 deletions (-25)

jedeschule/spiders/hamburg.py

Lines changed: 12 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,40 +1,28 @@
1-
import xml.etree.ElementTree as ET
2-
31
from scrapy import Item
42

5-
from jedeschule.spiders.school_spider import SchoolSpider
63
from jedeschule.items import School
4+
from jedeschule.spiders.school_spider import SchoolSpider
5+
from jedeschule.wfs_basic_parsers import parse_geojson_features
76

87

98
class HamburgSpider(SchoolSpider):
109
name = "hamburg"
1110

1211
start_urls = [
13-
"https://geodienste.hamburg.de/HH_WFS_Schulen?SERVICE=WFS&VERSION=1.1.0&REQUEST=GetFeature&typename=de.hh.up:nicht_staatliche_schulen,de.hh.up:staatliche_schulen&srsname=EPSG:4326"
12+
"https://api.hamburg.de/datasets/v1/schulen/collections/staatliche_schulen/items"
13+
"?limit=1000",
14+
"https://api.hamburg.de/datasets/v1/schulen/collections/nicht_staatliche_schulen/items"
15+
"?limit=1000"
1416
]
1517

16-
17-
def parse(self, response):
18-
namespaces = {
19-
"gml": "http://www.opengis.net/gml",
18+
custom_settings = {
19+
"DEFAULT_REQUEST_HEADERS": {
20+
"Accept": "application/geo+json, application/json, */*"
2021
}
22+
}
2123

22-
elem = ET.fromstring(response.body)
23-
24-
for member in elem:
25-
data_elem = {}
26-
for attr in member[0]:
27-
if attr.tag == "{https://registry.gdi-de.org/id/de.hh.up}the_geom":
28-
# This nested entry contains the coordinates that we would like to expand
29-
lon, lat = attr.findtext(
30-
"gml:Point/gml:pos", namespaces=namespaces
31-
).split(" ")
32-
data_elem["lat"] = lat
33-
data_elem["lon"] = lon
34-
continue
35-
# strip the namespace before returning
36-
data_elem[attr.tag.split("}", 1)[1]] = attr.text
37-
yield data_elem
24+
def parse(self, response, **kwargs):
25+
yield from parse_geojson_features(response)
3826

3927
@staticmethod
4028
def normalize(item: Item) -> School:

jedeschule/wfs_basic_parsers.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,5 +13,4 @@ def parse_geojson_features(response: Response):
1313
properties["lon"] = coords[0]
1414
properties["lat"] = coords[1]
1515

16-
1716
yield properties

test/test_hamburg.py

Lines changed: 95 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,95 @@
1+
import unittest
2+
3+
from scrapy.http import TextResponse
4+
5+
from jedeschule.spiders.hamburg import HamburgSpider
6+
7+
8+
class TestHamburgSpider(unittest.TestCase):
9+
def test_parse(self):
10+
json_response = """
11+
{
12+
"type": "FeatureCollection",
13+
"numberReturned": 1,
14+
"numberMatched": 453,
15+
"timeStamp": "2025-07-14T19:20:02Z",
16+
"features": [
17+
{
18+
"type": "Feature",
19+
"geometry": {
20+
"type": "Point",
21+
"coordinates": [
22+
10.047106063058099,
23+
53.601522503676144
24+
]
25+
},
26+
"properties": {
27+
"abschluss": "Allgemeine Hochschulreife|erster allgemeinbildender Schulabschluss|Erweiterter erster allgemeinbildender Schulabschluss|mittlerer Schulabschluss|schulischer Teil der Fachhochschulreife",
28+
"adresse_ort": "22307 Hamburg",
29+
"adresse_strasse_hausnr": "Benzenbergweg 2",
30+
"ansprechp_klasse_5": "Nadine Kalsow",
31+
"ansprechp_buero": "Janka Gierck",
32+
"anzahl_schueler": 996,
33+
"anzahl_schueler_gesamt": "1261 an 2 Standorten",
34+
"bezirk": "Hamburg-Nord",
35+
"fax": "+49 40 428 88 15 22",
36+
"fremdsprache": "Englisch|Französisch|Spanisch|Spanisch",
37+
"fremdsprache_mit_klasse": "Englisch ab Klasse 5|Französisch ab Klasse 7|Spanisch ab Klasse 11|Spanisch ab Klasse 7",
38+
"ganztagsform": "GTS teilweise gebunden",
39+
"is_rebbz": "true",
40+
"kapitelbezeichnung": "Stadtteilschulen",
41+
"lgv_standortk_erwachsenenbildung": "No",
42+
"name_schulleiter": "Bianca Thies",
43+
"name_stellv_schulleiter": "Christian Pape",
44+
"name_oberstufenkoordinator": "Frau Scheuermann-Andersen *49 40 428 88 15-61",
45+
"name_verwaltungsleitung": "Grit Sobottka",
46+
"rebbz_homepage": "http://rebbz-winterhude.hamburg.de/",
47+
"rechtsform": "staatlich",
48+
"schueleranzahl_schuljahr": "2024",
49+
"schul_email": "[email protected]",
50+
"schul_homepage": "https://helmuthhuebener.de",
51+
"schul_id": "5043-0",
52+
"schul_telefonnr": "+49 40 428 88 15 0",
53+
"schulaufsicht": "Christine Zopff",
54+
"schulform": "Stadtteilschule",
55+
"schulinspektion_link": "https://www.hamburg.de/politik-und-verwaltung/behoerden/schulbehoerde/themen/schulaufsicht/inspektionsberichte/weiterfuehrende-schulen-hamburg-nord",
56+
"schulname": "Stadtteilschule Helmuth Hübener",
57+
"schultyp": "Hauptstandort",
58+
"sozialindex": "Stufe 2",
59+
"stadtteil": "Barmbek-Nord",
60+
"standort_id": "431",
61+
"zuegigkeit_kl_5": "7",
62+
"zustaendiges_rebbz": "ReBBZ Winterhude"
63+
},
64+
"id": 875415
65+
}
66+
],
67+
"links": []
68+
}
69+
"""
70+
71+
spider = HamburgSpider()
72+
response = TextResponse(
73+
url="http://test_webserver.com",
74+
body=json_response.encode("utf-8"),
75+
encoding="utf-8",
76+
)
77+
78+
schools = list(spider.parse(response))
79+
self.assertEqual(len(schools), 1)
80+
81+
school = schools[0]
82+
self.assertAlmostEqual(school["lon"], 10.047106063058099)
83+
self.assertAlmostEqual(school["lat"], 53.601522503676144)
84+
self.assertEqual(school["schul_id"], "5043-0")
85+
self.assertEqual(school["schulname"], "Stadtteilschule Helmuth Hübener")
86+
self.assertEqual(school["adresse_ort"], "22307 Hamburg")
87+
self.assertEqual(school["adresse_strasse_hausnr"], "Benzenbergweg 2")
88+
self.assertEqual(school["schul_telefonnr"], "+49 40 428 88 15 0")
89+
self.assertEqual(school["fax"], "+49 40 428 88 15 22")
90+
self.assertEqual(school["schul_email"], "[email protected]")
91+
self.assertEqual(school["schul_homepage"], "https://helmuthhuebener.de")
92+
93+
94+
if __name__ == "__main__":
95+
unittest.main()

0 commit comments

Comments
 (0)